import React from "react";
import "./../posts.css";
import "./fpga_9.css";
import arith from "./arith.jpg";
import block_1 from "./block_1.png";
import fsm_1 from "./fsm_1.png";
import logic_1 from "./logic_1.png";
import spi_byte from "./spi_byte.jpg";
import spi_byte_schem from "./spi_byte_schem.jpg";
import spi_msg from "./spi_msg.jpg";
import timing_1 from "./timing_1.png";
import {NavLink} from "react-router-dom";
import {Button} from "../../../../common";
import Popup from "reactjs-popup";
import AtomOneDark from "react-syntax-highlighter/src/styles/hljs/atom-one-dark";
import SyntaxHighlighter from "react-syntax-highlighter";

// Arduino (C++) sketch for the SPI master, shown verbatim in the post.
// The string is rendered as plain text by the syntax highlighter, so angle
// brackets must be written literally (HTML entities would be displayed as-is).
// The doubled backslashes make the listing show the escape sequence \r\n
// instead of embedding a real line break.
const arduino = `
    #include <Arduino.h>
    #include <SPI.h>
    
    #define LED 13
    #define nCS 10
    
    uint8_t temp, miso, i;
    
    void setup(){
        pinMode(LED, OUTPUT);
        pinMode(nCS, OUTPUT);
        Serial.begin(115200);
        while(!Serial);
        SPI.begin();
        Serial.println("SPI byte mode");
        digitalWrite(nCS, 1);
    }
    
    void loop(){
        for(i = 0; i < 256; ++i){
            digitalWrite(LED, 1);
            digitalWrite(nCS, 0);
            SPI.beginTransaction(SPISettings(1000000, MSBFIRST, SPI_MODE3));
            miso = SPI.transfer(i);
            SPI.endTransaction();
            digitalWrite(nCS, 1);
            digitalWrite(LED, 0);
            
            Serial.print("Sent Byte: ");
            Serial.print(i, HEX);
            Serial.print(", Received Byte: ");
            Serial.print(miso, HEX);
            Serial.print("\\r\\n");
            
            delay(1000);
        }
    }
`;

// Verilog listing: 3-bit shift register that samples `sclk` on every rising
// edge of `clk`. Built line by line; the empty first/last entries reproduce
// the leading and trailing newline of the listing.
const sync_clk = [
    "",
    "    reg[2:0] SCLK_r;",
    "    always @(posedge clk)",
    "        SCLK_r <= {SCLK_r[1:0], sclk};",
    "",
].join("\n");

// SystemVerilog listing: packed 2-D array holding all R/W and RO registers,
// 32 bits each. The empty first/last entries reproduce the surrounding newlines.
const registers = [
    "",
    "reg[NR_RWREGS + NR_ROREGS - 1 : 0] [31:0] registers;",
    "",
].join("\n");

// Verilog listing: `casez` example in which '?' marks don't-care bits of `flag`.
// Built line by line; the empty first/last entries reproduce the surrounding newlines.
const casez = [
    "",
    "always @(flag) begin",
    "    {a0, a1, a2} = 3'b000;",
    "    casez(flag)",
    "        3'b1?? : a2 = 1'b1;",
    "        3'b?1? : a1 = 1'b1;",
    "        3'b??1 : a0 = 1'b1;",
    "        default: {a2, a1, a0} = 3'b000;",
    "    endcase",
    "end",
    "",
].join("\n");

// Verilog listing: minimal ADD/MUL core. Operands are sign-extended to 64 bits
// according to their MSB, and the lower 32 bits of the result drive `r`.
// The combinational block uses `always @(*)` so the result is re-evaluated when
// the operands `a`/`b` change, not only when `a_sel` changes.
const arithmetic = `
module arith(
    input [31:0] a,
    input [31:0] b,
    output [31:0] r,
    input [1:0] a_sel);

    wire a_neg_w = (a[31] == 1) ? 1 : 0;
    wire b_neg_w = (b[31] == 1) ? 1 : 0;

    reg [63:0] internal_reg;

    initial internal_reg = 64'd0;

    always @(*) begin
        case(a_sel)
            2'b00: // SUM
                if(!a_neg_w && !b_neg_w)
                    internal_reg = {32'd0, a} + {32'd0, b};
                else if (!a_neg_w && b_neg_w)
                    internal_reg = {32'd0, a} + {{32{1'b1}}, b};
                else if (a_neg_w && !b_neg_w)
                    internal_reg = {{32{1'b1}}, a} + {32'd0, b};
                else if (a_neg_w && b_neg_w)
                    internal_reg = {{32{1'b1}}, a} + {{32{1'b1}}, b};
            2'b10: // MUL
                if(!a_neg_w && !b_neg_w)
                    internal_reg = {32'd0, a} * {32'd0, b};
                else if (!a_neg_w && b_neg_w)
                    internal_reg = {32'd0, a} * {{32{1'b1}}, b};
                else if (a_neg_w && !b_neg_w)
                    internal_reg = {{32{1'b1}}, a} * {32'd0, b};
                else if (a_neg_w && b_neg_w)
                    internal_reg = {{32{1'b1}}, a} * {{32{1'b1}}, b};
            default: internal_reg = 64'b0;
        endcase
    end

    assign r = internal_reg [31:0];

endmodule
`;

function FPGA_9(){
    return(
        <div className="em__post">
            <div className="em__post-title">
                <h1>FPGA as hardware accelerator</h1>
            </div>
            <div className="em__post-section">
                <h3>Aim of this post:</h3>
                <p>
                    In this tutorial, we will design a communication module to interface a hard processor.
                </p>
            </div>
            <div className="em__post-section">
                <h3>Prerequisites:</h3>
                <p>
                    Previous FPGA tutorials, some knowledge of serial communication protocols, Arduino IDE
                </p>
            </div>
            <div className="em__post-section">
                <h3>Tutorial structure:</h3>
                <p>
                    This one will be a lengthier tutorial, so here is a structure of sorts, with a small description.
                </p>
                <ol>
                    <li>Introduction into hardware accelerators - here we will look at the universal realization of
                        hardware acceleration by combining the hardcore processor with general or dedicated peripherals (theory).</li>
                    <li>Simple serial protocol implementation - here we will implement a soft SPI slave core which
                        accumulates the received data into an 8b RX register, and sends the bits of an 8b TX
                        register to the master (theory, simulation, implementation).</li>
                    <li>8b RAW to register interface - here we will create a finite state machine with data path,
                        to combine the 8b RX/TX values into command signals, R/W (read/write) registers, and RO
                        (Read-Only) registers (theory, simulation, implementation).</li>
                </ol>
            </div>

            <div className="em__post-section">
                <h3>Hardware accelerators</h3>
                <p>
                    Hardware acceleration is done by creating a specialized component which can perform an
                    algorithm more efficiently than a software in a general-purpose processor.
                    We can implement any algorithm in a general processor, while the hardware accelerators are up for only specific tasks.
                    Here we can consider a GPU/CPU interaction in case of computer graphics (e.g. ray-tracing), and teaching neural networks;
                    DSP cores for efficient FFT and filter implementations (or stand-alone FFT/Filter cores);
                    Cryptographic accelerators, Compressing accelerators etc.
                </p>
                <p>
                    A sorting algorithm would be a perfect example for an algorithm which runs more efficiently on a general-purpose processor, than specialized hardware.
                </p>
                <p>
                    Specialized connections enable the communication between processors and accelerators:
                    in the case of GPU, there is the PCI-ex x16 bus.
                    In the case of simple architectures, there is the Wish-Bone bus, AMBA (e.g. AXI4).
                    These are all parallel buses designed for optimal/fast data transactions.
                </p>
                <p>
                    We will pair an FPGA with an Arduino board, and implement a serial communication protocol
                    between them because of hardware constraints.
                    The trade-off of the simple bus is hardware complexity and speed reduction.
                    We could consider a few options if we would want to send data to two 32b wide data registers,
                    one 8b wide control register, and we would also want to read from a 32b wide data register:
                </p>
                <ul>
                    <li>A 32b wide data I/O line, and 2b wide address line (with CLK and RW) - read and write in one cycle.</li>
                    <li>A 16b wide data I/O line, and 3b wide address line (with CLK and RW) - read and write in two cycles.</li>
                    <li>An 8b wide data I/O line, and 4b wide address line (with CLK, and RW) - read and write in four cycles.</li>
                    <li>Separate 1b input, and output line (with CLK) - read and write in min. Eight cycles.</li>
                </ul>
                <p>
                    There could be other possibilities such as separating the I/O lines into In- and Out-lines,
                    or adding data/instr flag line etc. but even in these few examples, the speed/complexity ratio is evident.
                </p>
                <p>
                    A few words about the SPI master: we will use an Arduino Nano as the master.
                    I'm usually avoiding the Arduino environment as best as I can, because of reasons,
                    but in this case, an Arduino board will suffice.
                </p>
            </div>
            <div className="em__post-section">
                <h3>Simple serial protocol implementation</h3>
                <Popup trigger={<img className="img_block_1 clickable_img" src={block_1} alt="block_1" />} modal nested>
                    {close => (
                        <img className="em__img_full" src={block_1} alt="block_1" />
                    )}
                </Popup>
                <p>
                    We have chosen the SPI serial communication since, we have only one master (and one slave in this case),
                    the communication speed can be as fast as 40Mbps, uses a common clock signal, and on the low level,
                    the underlying implementation is a single FIFO register.
                </p>
                <p>
                    Here we can see a simple SPI master with three SPI slave devices.
                    Each slave has a joint clock signal (SCLK), a joint input (MOSI, master out slave in),
                    a joint output (MISO, master in slave out), and separate chip select (CS) or slave select (SS) lines.
                    Only the active slave will read and write on the MOSI/MISO lines.
                </p>
                <p>
                    The SS/CS signal is always active low.
                </p>
                <Popup trigger={<img className="img_timing_1 clickable_img" src={timing_1} alt="timing_1" />} modal nested>
                    {close => (
                        <img className="em__img_full" src={timing_1} alt="timing_1" />
                    )}
                </Popup>
                <p>
                    The SCK/SCLK can be active-low or active-high depending on the CPOL setting (0 or 1 respectively).
                    CPOL = 0 means that the idle is 0, the leading edge is a rising edge, the trailing edge is a falling one.
                </p>
                <p>
                    CPHA determines the timing of the MOSI/MISO signals relative to the clock.
                    CPHA = 0 means the out side changes the data on trailing, while the in side captures on leading-edge.
                </p>
                <p>
                    We use the terms Mode 0, 1, 2 or 3 when we are talking about an SPI protocol with
                    CPOL = 0 / CPHA = 0, CPOL = 0 / CPHA = 1, CPOL = 1 / CPHA = 0, or CPOL = 1 / CPHA = 1 respectively.
                </p>
                <p>
                    In this tutorial, we will design the SPI communication with Mode 3.
                    We design the master first. I'm using Visual Studio Code with Platform IO IDE, but the
                    Arduino IDE can also be used because the compiler is the same.
                </p>

                <SyntaxHighlighter language="verilog" style={AtomOneDark}>
                    {arduino}
                </SyntaxHighlighter>

                <p>
                    Here we use the default SPI pin configuration, with software-controlled CS pin.
                    We configure the UART and the SPI, and the output pins at the start of the program.
                    The LED pin is used to indicate the start and end of a transaction visually.
                    We activate the slave by pulling the CS pin down.
                    The Master clock speed is configured for 1MHz, and we use the Mode 3.
                    We transfer a number from 0 to 255 in each second, and we also display the received value on the serial port.
                </p>
                <p>
                    We saw that the master device generates the SCLK signal, sends the MOSI and expects the MISO signal synchronous to SCLK.
                    The problem is that the slave device has its clock signal, so we need to re-sample and synchronise the incoming data to the system clock.
                </p>
                <SyntaxHighlighter language="verilog" style={AtomOneDark}>
                    {sync_clk}
                </SyntaxHighlighter>
                <p>
                    Here I've shown the synchronisation of the SCLK signal.
                    Usually, a 2b wide register is enough for the purpose,
                    but we need to check the rising and falling edges, for which we will be using the two upper bits.
                </p>
                <Popup trigger={<img className="img_logic_1 clickable_img" src={logic_1} alt="logic_1" />} modal nested>
                    {close => (
                        <img className="em__img_full" src={logic_1} alt="logic_1" />
                    )}
                </Popup>
                <p>
                    We can see the implemented design, which generates the rising and falling edge signals from the synchronised SCLK line.
                    We can use the same implementation for the SS line to synchronize and to detect the edges.
                </p>
                <p>
                    We can implement the MOSI synchronisation with only two flip-flops since the edge detection is not required.
                    The slave internally generates the MISO data, so no synchronisation is necessary.
                </p>
                <p>
                    Operation logic: So far, we have the synchronized signals.
                    The next step is to use an internal 8b storage register and implement the SPI mode3 protocol:
                </p>
                <ul>
                    <li>Transfer the MSB of the storage register to the MISO on the falling edge of SCLK.</li>
                    <li>Shift the MOSI data to the LSB position of the storage register on the rising edge of SCLK.</li>
                </ul>
                <p>
                    Lastly, we will strobe a flag (rx_valid) if we received 8b worth of data.
                </p>

                <Popup trigger={<img className="img_spi_byte_schem clickable_img" src={spi_byte_schem} alt="spi_byte_schem" />} modal nested>
                    {close => (
                        <img className="em__img_full" src={spi_byte_schem} alt="spi_byte_schem" />
                    )}
                </Popup>

                <p>
                    We can see the top module on the image to the right.
                    Let's set the internal oscillator frequency to 7MHz,
                    and configure the PLL to output a clock signal with 280MHz frequency.
                    We can connect the SPI signals to our module.
                </p>
                <p>
                    We use a register to store the RX data when we get a rx_valid signal.
                    We use this stored value in the next transfer cycle as TX data.
                    The figure below shows the expected simulation results so that we can upload the bit file to the device.
                </p>

                <Popup trigger={<img className="img_spi_spi_byte clickable_img" src={spi_byte} alt="spi_byte" />} modal nested>
                    {close => (
                        <img className="em__img_full" src={spi_byte} alt="spi_byte" />
                    )}
                </Popup>

                <p>
                    A strange phenomenon happens when we upload the JEDEC file.
                    The Arduino sends an increasing data sequence from 0 to 255,
                    but the slave sends back a somewhat non-deterministic data (increases randomly, jumps back, stops increasing).
                </p>

                <p>
                    We can change the PLL output frequency to 105MHz, and upload the newly generated JEDEC file.
                    Now the warnings are gone, and the slave response is the previously sent data byte, as expected.
                </p>

                <p>
                    You can download the project files for the master and slave device from
                    <a href="https://gitlab.com/Csurleny/fpga-tutorial-files/-/tree/main/spiByteModule">here</a>.
                </p>


            </div>
            <div className="em__post-section">
                <h3>SPI register interface</h3>
                <p>
                    It would be nice if we could read from and write in 32b registers with our small SPI interface, but so far it is not possible.
                    In this part of the tutorial, we will implement a message interface (16 total registers at maximum) with a status register.
                </p>
                <p>
                    Let the address of the status register be 0x00, the address range of the read registers between 0x80 and 0x8F,
                    and the address range of the write registers between 0xC0 and 0xCF.
                    Further down the line, we could expand the register state in the range of 0x01
                    - 0x0F (this is ideal for control options).
                </p>
                <p>
                    Every data exchange starts with the master sending the appropriate address value while ignoring the incoming data.
                    In case of the status, the master sends a dummy byte and reads the actual value.
                    The master sends 4 Bytes of dummy data and interprets the incoming values in the case of the reading operation.
                    Lastly, the master sends the 32b TX data in four 1 Byte chunks.
                </p>
                <p>
                    In our implementation, we will have three registers, the first two are r/w, and the last one will be read-only.
                    We will use this register scheme to implement the arithmetic core latter on (e.g. R3 = R1 + R2).
                </p>
                <p>
                    Let's have a few words about the SPI master.
                </p>
                <p>
                    We will similarly design the master to the previous implementation.
                    The Arduino should print an options menu on the terminal: one for status check,
                    two for write test, three read test, four flag test.
                    The master goes into an infinite loop after printing the messages and waits for user input.
                </p>
                <p>
                    If we type 1, the master will send 0x00 and dummy data (0xFF).
                    If the received value is 0x69 (the slave ID), the status check is successful.
                </p>
                <p>
                    If we type 2, the master will ask for the register number (0 or 1),
                    then it will write data to the specific register (0xC0 or 0xC1),
                    and it will read back the value from the register for checking.
                    The write test is successful if the written ad read values are the same.
                    If we type 3, the master will read the value from the read-only register,
                    and compare its value with the expected (0xDEADBEEF) constant.
                    The read test is successful if the values are the same.
                    If we type 4, the master will send a start flag (0x01) which turns on an LED.
                    Any consecutive data transaction should turn the LED off.
                    I will put the code of the Arduino master at the end of this section since that is not the main focus here.
                </p>
                <p>
                    We will write the hardware description in System Verilog because there we can use multi-dimensional arrays.
                    If we have the number of R/W and RO registers as parameters <i>NR_RWREGS</i> and <i>NR_ROREGS</i>
                    , respectively, the overall register structure will be:
                </p>
                <SyntaxHighlighter language="verilog" style={AtomOneDark}>
                    {registers}
                </SyntaxHighlighter>
                <p>
                    Otherwise, we would have to flatten the 2D register structure into a 1D register
                    (the silicon won't mind, and the synthesiser will optimise the structure to fit the parameters,
                    but it's more readable this way).
                </p>
                <Popup trigger={<img className="img_fsm_1 clickable_img" src={fsm_1} alt="fsm_1" />} modal nested>
                    {close => (
                        <img className="em__img_full" src={fsm_1} alt="fsm_1" />
                    )}
                </Popup>
                <p>
                    We will implement a finite state machine to parse and execute the received commands:
                </p>
                <ul>
                    <li>Idle state (0).</li>
                    <li>Transmit Status (TS) state (1): sends the slave ID to the master.</li>
                    <li>Start Flag (SF) state (2): sets a start flag register.</li>
                    <li>Transmit Register value (TR) state (3): sends the required register value.</li>
                    <li>Receive Register value (RR) state (4): receives a 32b value from master.</li>
                </ul>
                <p>
                    We saw in previous tutorials how to use the case instruction,
                    but there are some wild varieties of the simple case - casez and casex, which we will use here.
                </p>
            </div>
            <div className="em__post-section">
                <SyntaxHighlighter language="verilog" style={AtomOneDark}>
                    {casez}
                </SyntaxHighlighter>
                <p>
                    casez - allows statements 'Z' and '?' to be treated as don't care:
                    the statement 2'bZ0 can match 2'b00, 2'b10 and 2'bZ0, which is ideal for a priority decoder.
                    The casez offers an elegant way for the if-else replacement.
                </p>
                <p>
                    casex - allows statements 'X', 'Z' and '?' to be treated as don't care values.
                    'X' values can propagate through design and will mask some design issues.
                    These 'X' values can cause problems when they get into a casex problem,
                    so the best practice is to omit the usage of casex.
                    This means we will use it :).
                </p>
                <p>
                    All of the source files are located here.
                    It shouldn't be hard to understand the hardware description of the slave if you've followed the tutorials.
                </p>
                <Popup trigger={<img className="img_spi_msg clickable_img" src={spi_msg} alt="spi_msg" />} modal nested>
                    {close => (
                        <img className="em__img_full" src={spi_msg} alt="spi_msg" />
                    )}
                </Popup>
                <p>
                    In the simulation, we can see that the slave status is transferred successfully
                    (the old status was 0x5A, the new one is 0x69). We can write into registers 0 and 1.
                    We can read the stored message from the RO register
                    (this is shown between transitions 14 - 17 of tx_wire, or in the MISO line for people keen on binary signals).
                </p>
                <p>
                    We can test the hardware setup after programming the Arduino with the given SPI master code,
                    and after the download of the JEDEC file to the FPGA.
                    You can download the project files for the master and slave device from
                    <a href="https://gitlab.com/Csurleny/fpga-tutorial-files/-/tree/main/spiRegisterModule">here</a>.
                </p>

            </div>
            <div className="em__post-section">
                <h3>Bonus: Minimalist ADD/MUL accelerator</h3>
                <p>
                    If you did arrive at this point, you could easily add custom IPs to the FPGA.
                    The IP core will read the data from the R/W registers and will write the result to the RO register.
                    Sending some start flag will initiate the operation, and the IP could generate external interrupts if necessary.
                    Now, of course, the IP must fit in the remaining space in the FPGA.
                </p>
                <p>
                    Here I will show how to design a simple arithmetic accelerator for unsigned integer addition and multiplication.
                </p>
                <SyntaxHighlighter language="verilog" style={AtomOneDark}>
                    {arithmetic}
                </SyntaxHighlighter>
                <p>
                    Here we use a case syntax to distinguish the addition from the multiplication.
                    We extend the input numbers taking into consideration the sign of the numbers.
                    Finally, we concatenate the result to match the output data width. We can see in the simulation,
                    that the arithmetic core works as intended.
                </p>
                <Popup trigger={<img className="img_arith clickable_img" src={arith} alt="arith" />} modal nested>
                    {close => (
                        <img className="em__img_full" src={arith} alt="arith" />
                    )}
                </Popup>
                <p>We can add the following commands to the master side program:</p>
                <ol>
                    <li>Send data to reg1 : -111 </li>
                    <li>Send data to reg2: -1</li>
                    <li>Write OP1</li>
                    <li>Read reg3: ~ Here we should see -112 on the terminal (-111 + (-1) )</li>
                    <li>Write OP2</li>
                    <li>Read reg3: ~ Here we should see 111 on the terminal (-111 * (-1))</li>
                </ol>

            </div>

            <div className="em__post-navigation">

                <NavLink to="./../fpga-tut-8">
                    <Button btnID={"leftBTN"} buttonSize="btn--medium"> Previous Post</Button>
                </NavLink>

                <NavLink to="./../fpga-tut-10">
                    <Button btnID={"rightBTN"} buttonSize="btn--medium"> Next Post</Button>
                </NavLink>
            </div>
        </div>
    )
}

export default FPGA_9;