
// SPDX-License-Identifier: CC-BY-NC-SA-4.0
//
// Copyright (C) 2025 Bit by Bit Signal Processing LLC  (https://bxbsp.com)
//
// This work is placed under the "Creative Commons Attribution
// NonCommercial ShareAlike 4.0 International" license, known
// by the shortened acronym "CC-BY-NC-SA-4.0".
//
// This work is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// A CC-BY-NC-SA-4.0 license allows you to use, distribute, and modify
// this work, so long as such uses are non-commercial in nature,
// so long as any derived works are offered on the same terms,
// and so long as attribution is given to the original author.
// For further details, see the Creative Commons License
// "CC-BY-NC-SA-4.0".
//
// You should have received a copy of the CC-BY-NC-SA-4.0 license
// along with this work. If not, see
// <https://creativecommons.org/licenses/by-nc-sa/4.0/>.
//

// Simulation time unit / precision for this file.
`timescale 1ns / 1ps

// Shorthand used throughout this file:
//   `w <lhs> = <rhs>;   expands to a continuous assignment (assign).
`define w assign
//   `r <stmt>;          expands to a single-statement always block clocked on
//                       the rising edge of a signal named clk, which must be
//                       in scope wherever the macro is used.
`define r always@(posedge clk)

//
// Uses a linear approximation to a sine.  The table is generated
// elsewhere, but is made to simply interpolate between sine points.
// Table length is typically 1024, with a 36-bit table for a single
// 1k x 36 bit memory.  Accuracy is around +/-2e-5 full scale, which
// is a little less than +/- 1 LSB of a 16-bit output word.
// Input is typically 32-bit, which is clearly overkill; it is simply
// convenient not to work out how small it can be made, and a narrower
// input doesn't really save many resources.
//
   
module sine
  #(
    // Input parameters
    parameter SAMPLES_PER_CLOCK            = 1,
    parameter IN_AXIS_BITS_PER_REAL        = 32,
 
    // Output parameters
    parameter OUT_AXIS_BITS_PER_REAL       = 16,

        // AXIS side data parameters
    parameter TID_WIDTH                    = 0,
    parameter TDEST_WIDTH                  = 0,

    // Processing parameters (not changeable)
    parameter TABLE_ADDRESS_BITS           = 10,
    parameter TABLE_VALUE_BITS             = 21,
    parameter TABLE_SLOPE_BITS             = 15,

    // Dependent parameters
    parameter SINE_DELAY                   = 9,
    parameter NUM_ROMS                     = (SAMPLES_PER_CLOCK+1)/2,
    parameter TABLE_LENGTH                 = 1<<TABLE_ADDRESS_BITS,
    parameter IN_REMAINDER_BITS            = IN_AXIS_BITS_PER_REAL - TABLE_ADDRESS_BITS, 
    parameter TABLE_BITS                   = TABLE_VALUE_BITS + TABLE_SLOPE_BITS,
    parameter OUT_SHIFT_BITS               = TABLE_VALUE_BITS - OUT_AXIS_BITS_PER_REAL,
    parameter IN_AXIS_BUS_WIDTH            = IN_AXIS_BITS_PER_REAL  * SAMPLES_PER_CLOCK,
    parameter OUT_AXIS_BUS_WIDTH           = OUT_AXIS_BITS_PER_REAL * SAMPLES_PER_CLOCK,
    parameter TID_DUMMY_WIDTH              = (TID_WIDTH==0) ? 1 : TID_WIDTH,
    parameter TDEST_DUMMY_WIDTH            = (TDEST_WIDTH==0) ? 1 : TDEST_WIDTH
    )
   (
    // Data input and output

    (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 clk CLK" *)
    (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF IN_AXIS:OUT_AXIS:OUT_DUP_AXIS, ASSOCIATED_RESET resetn" *)
    input                                 clk,

    (* X_INTERFACE_INFO = "xilinx.com:signal:reset:1.0 resetn RST" *)
    input                                 resetn,

    input                                 IN_AXIS_tvalid,
    output wire                           IN_AXIS_tready,
    input [IN_AXIS_BUS_WIDTH-1:0]         IN_AXIS_tdata,
    input                                 IN_AXIS_tlast,
    input [TID_DUMMY_WIDTH-1:0]           IN_AXIS_tid,
    input [TDEST_DUMMY_WIDTH-1:0]         IN_AXIS_tdest,

    input [4:0]                           shift_right_i,
    
    output reg                            OUT_AXIS_tvalid,
    input                                 OUT_AXIS_tready,
    output reg [OUT_AXIS_BUS_WIDTH-1:0]   OUT_AXIS_tdata,
    output reg                            OUT_AXIS_tlast,
    output reg [TID_DUMMY_WIDTH-1:0]      OUT_AXIS_tid,
    output reg [TDEST_DUMMY_WIDTH-1:0]    OUT_AXIS_tdest,

    output reg                            OUT_DUP_AXIS_tvalid,
    input                                 OUT_DUP_AXIS_tready,
    output reg [OUT_AXIS_BUS_WIDTH-1:0]   OUT_DUP_AXIS_tdata,
    output reg                            OUT_DUP_AXIS_tlast,
    output reg [TID_DUMMY_WIDTH-1:0]      OUT_DUP_AXIS_tid,
    output reg [TDEST_DUMMY_WIDTH-1:0]    OUT_DUP_AXIS_tdest
    );

   //
   // Overview
   //
   // Each of the SAMPLES_PER_CLOCK input words is split into a table address
   // (upper TABLE_ADDRESS_BITS) and a remainder (lower IN_REMAINDER_BITS).
   // The table entry holds a signed value and a signed slope; the output is
   //
   //     value + slope * remainder      (rounded, then shifted down)
   //
   // followed by an extra arithmetic right shift of shift_right_i bits.
   // OUT_AXIS and OUT_DUP_AXIS carry identical copies of the result and of
   // the delayed tvalid / tlast / tid / tdest sideband signals.
   //
   // Flow control: IN_AXIS_tready is tied high and OUT_AXIS_tready /
   // OUT_DUP_AXIS_tready are not used anywhere in this module, so there is
   // no backpressure; downstream logic must always be able to accept data.
   //
   // resetn is only forwarded (inverted) to the sideband delay lines; the
   // data path registers are not reset.
   //

   genvar                                  i;
   genvar                                  m;

   initial OUT_AXIS_tvalid = 0;
   initial OUT_AXIS_tlast = 0;
   initial OUT_DUP_AXIS_tvalid = 0;
   initial OUT_DUP_AXIS_tlast = 0;
   
   // Always ready: input is accepted every clock.
   `w IN_AXIS_tready = 1'b1;

   // One extra address value, since ROM takes addresses in pairs.  Then if the
   // number is odd there's not one left with an unknown address.
   wire [TABLE_ADDRESS_BITS-1:0]           in_addr_w [ 0 : SAMPLES_PER_CLOCK ];
   wire [TABLE_BITS-1:0]                   table_w   [ 0 : SAMPLES_PER_CLOCK ];

   `w in_addr_w [SAMPLES_PER_CLOCK] = 0;

   // Registered copy of the extra output shift, shared by all samples.
   reg [4:0]                               shift_right_r;

   `r shift_right_r <= shift_right_i;
                                           
   // Per-sample interpolation pipeline.
   generate
      for(i=0; i<SAMPLES_PER_CLOCK; i=i+1)
        begin
           reg [IN_AXIS_BITS_PER_REAL-1:0]           in_r;
           wire [IN_REMAINDER_BITS-1:0]              in_remainder_w;
           reg  [IN_REMAINDER_BITS-1:0]              in_remainder_r;
           reg  signed [IN_REMAINDER_BITS:0]         in_remainder_rr;
           reg  signed [IN_REMAINDER_BITS:0]         in_remainder_rrr;
           reg  signed [IN_REMAINDER_BITS:0]         in_remainder_rrrr;
           wire [TABLE_BITS-1:0]                     table_rrr;
           wire  signed [TABLE_VALUE_BITS-1:0]       value_rrr;
           reg  signed [47:0]                        value_rrrr;
           reg  signed [47:0]                        value_rrrrr;
           wire signed [TABLE_SLOPE_BITS-1:0]        slope_rrr;
           reg  signed [TABLE_SLOPE_BITS-1:0]        slope_rrrr;
           reg  signed [47:0]                        answer_r;
           reg  signed [47:0]                        answer_rr;
           reg  signed [47:0]                        mult_rrrrr;
           // Driven by a continuous assignment, so this must be a net, not a reg.
           wire signed [47:0]                        out_w;
           reg  signed [OUT_AXIS_BITS_PER_REAL-1:0]  out_r;
           
           // Register this sample's slice of the input bus.
           `r in_r <= IN_AXIS_tdata[ i * IN_AXIS_BITS_PER_REAL +: IN_AXIS_BITS_PER_REAL ];

           // Upper bits address the table, lower bits are the interpolation fraction.
           `w in_addr_w[i]    =  in_r [ IN_REMAINDER_BITS +: TABLE_ADDRESS_BITS ];
           `w in_remainder_w  =  in_r [ 0 +: IN_REMAINDER_BITS ];

           // The remainder is delayed to stay in step with the ROM read.
           // NOTE(review): the multiply below pairs in_remainder_rrrr (four
           // registers after in_remainder_w) with slope_rrrr (one register
           // after the ROM output), so the operands only line up if the ROM
           // read latency from in_addr_w to table_rrr is 3 clocks.  An earlier
           // comment here said 2; sine_two_port_rom is not visible from this
           // file -- TODO confirm.

           `r in_remainder_r    <= in_remainder_w;

           // Verilog may not be smart enough to do a signed x unsigned multiply.  I've seen
           // issues.  The remainder is unsigned.  To prevent issues, add a leading zero
           // and specify it as signed.  Then it is an always-positive signed number, and
           // Verilog can be happy doing a signed x signed multiply.
           `r in_remainder_rr   <= { 1'b0, in_remainder_r };

           // Split the ROM word: value in the upper bits, slope in the lower bits.
           `w table_rrr = table_w  [ i ];
           `w value_rrr = table_rrr [ TABLE_SLOPE_BITS +: TABLE_VALUE_BITS ];
           `w slope_rrr = table_rrr [ 0 +: TABLE_SLOPE_BITS ];

           // Extra delays before multiply, to allow Vivado to pipeline it.
           
           `r in_remainder_rrr   <= in_remainder_rr;
           `r in_remainder_rrrr  <= in_remainder_rrr;
           // Scale the value up so it has the same binary point as slope * remainder.
           `r value_rrrr         <= (value_rrr<<IN_REMAINDER_BITS);
           `r value_rrrrr        <= value_rrrr;
           `r slope_rrrr         <= slope_rrr;

           `r mult_rrrrr   <= in_remainder_rrrr * slope_rrrr;
           
           // Add half an output LSB so the shift below rounds instead of truncating.
           `r answer_r <= value_rrrrr + mult_rrrrr + (1<<(OUT_SHIFT_BITS+IN_REMAINDER_BITS-1));

           `r answer_rr <= answer_r;
           
           // Drop the fraction bits and the excess table precision.
           `w out_w =  answer_rr >>> (IN_REMAINDER_BITS+OUT_SHIFT_BITS);

           // Optional extra attenuation; the result is truncated to the output
           // width with no saturation.
           `r out_r <=  out_w >>> shift_right_r;

           // Both output streams receive the same sample.
           `r OUT_AXIS_tdata     [ i * OUT_AXIS_BITS_PER_REAL +: OUT_AXIS_BITS_PER_REAL ] <= out_r;
           `r OUT_DUP_AXIS_tdata [ i * OUT_AXIS_BITS_PER_REAL +: OUT_AXIS_BITS_PER_REAL ] <= out_r;
        end
   endgenerate
   
   // One two-port ROM serves two samples (addresses 2*m and 2*m+1).
   generate
      for(m=0; m<NUM_ROMS; m=m+1)
        begin

           wire [TABLE_ADDRESS_BITS-1:0] this_in_addr_0_w;
           wire [TABLE_ADDRESS_BITS-1:0] this_in_addr_1_w;
           wire [TABLE_BITS-1:0]         this_table_0_w [ 0 : 1 ];
           wire [TABLE_BITS-1:0]         this_table_1_w [ 0 : 1 ];

           `w this_in_addr_0_w = in_addr_w [2*m];
           `w this_in_addr_1_w = in_addr_w [2*m+1];
           
           sine_two_port_rom #( .ROM_FILENAME       ( "sine_table.dat"                       ),
                                .ROM_FILE_OFFSET    ( 0                                      ),
                                .ROM_FILE_LENGTH    ( TABLE_LENGTH                           ),
                                .ROM_ITEM_WIDTH     ( TABLE_BITS                             ),
                                .ROM_ITEM_NUMBER    ( 1                                      ),
                                .ROM_LENGTH         ( TABLE_LENGTH                           ))
           coeff_rom          ( .clk                ( clk                                    ),
                                .addr_0_i           ( this_in_addr_0_w                       ),
                                .addr_1_i           ( this_in_addr_1_w                       ),
                                .data_0_ro          ( this_table_0_w                         ),
                                .data_1_ro          ( this_table_1_w                         ));

           // Only element 0 of each ROM output array is used.
           `w table_w [2*m]   = this_table_0_w [ 0 ];
           `w table_w [2*m+1] = this_table_1_w [ 0 ];

        end
   endgenerate
           

   // Sideband signals after the delay line, one register ahead of the outputs.
   // These must be declared with their full width: left undeclared they
   // become implicit 1-bit nets, which truncates tid / tdest when
   // TID_WIDTH or TDEST_WIDTH is greater than 1.
   wire                                  OUT_AXIS_tvalid_pre;
   wire                                  OUT_AXIS_tlast_pre;
   wire [TID_DUMMY_WIDTH-1:0]            OUT_AXIS_tid_pre;
   wire [TDEST_DUMMY_WIDTH-1:0]          OUT_AXIS_tdest_pre;

   // NOTE(review): assuming delay_with_reset delays by DELAY clocks, the
   // sideband latency is SINE_DELAY + 1 clocks, which is meant to match the
   // data path above -- TODO confirm against delay_with_reset.
   delay_with_reset #( .DELAY(SINE_DELAY), .BIT_WIDTH(1) ) vdelay ( .reset(!resetn), .clk(clk), .in_i(IN_AXIS_tvalid), .out_o(OUT_AXIS_tvalid_pre) );
   delay_with_reset #( .DELAY(SINE_DELAY), .BIT_WIDTH(1) ) ldelay ( .reset(!resetn), .clk(clk), .in_i(IN_AXIS_tlast),  .out_o(OUT_AXIS_tlast_pre)  );

   `r OUT_AXIS_tvalid     <= OUT_AXIS_tvalid_pre;
   `r OUT_AXIS_tlast      <= OUT_AXIS_tlast_pre;

   `r OUT_DUP_AXIS_tvalid <= OUT_AXIS_tvalid_pre;
   `r OUT_DUP_AXIS_tlast  <= OUT_AXIS_tlast_pre;


   // TID is only delayed when it exists; otherwise the 1-bit dummy outputs
   // are held at zero so they are never left undriven.
   if(TID_WIDTH>0)
     begin
        delay_with_reset #( .DELAY(SINE_DELAY), .BIT_WIDTH(TID_WIDTH) )   tid_delay ( .reset(!resetn), .clk(clk), .in_i(IN_AXIS_tid),  .out_o(OUT_AXIS_tid_pre)  );

        `r OUT_AXIS_tid     <= OUT_AXIS_tid_pre;
        `r OUT_DUP_AXIS_tid <= OUT_AXIS_tid_pre;

     end
   else
     begin
        `r OUT_AXIS_tid     <= {TID_DUMMY_WIDTH{1'b0}};
        `r OUT_DUP_AXIS_tid <= {TID_DUMMY_WIDTH{1'b0}};
     end

   // Same treatment for TDEST.
   if(TDEST_WIDTH>0)
     begin
        delay_with_reset #( .DELAY(SINE_DELAY), .BIT_WIDTH(TDEST_WIDTH) ) tdest_delay ( .reset(!resetn), .clk(clk), .in_i(IN_AXIS_tdest),  .out_o(OUT_AXIS_tdest_pre)  );

        `r OUT_AXIS_tdest     <= OUT_AXIS_tdest_pre;
        `r OUT_DUP_AXIS_tdest <= OUT_AXIS_tdest_pre;

     end
   else
     begin
        `r OUT_AXIS_tdest     <= {TDEST_DUMMY_WIDTH{1'b0}};
        `r OUT_DUP_AXIS_tdest <= {TDEST_DUMMY_WIDTH{1'b0}};
     end
      

   
   
endmodule
