//
// Copyright (C) 2018 Ross Martin
//
// Versions
//
// 2018.01.25  Coding begun

// Shorthand macros used by the code in this file:
//   w  expands to "assign"                 (continuous assignment)
//   r  expands to "always@(posedge clk)"   (clocked statement; a signal named
//                                           clk must be visible where it is used)
// Each name is undefined before it is defined, so the definition below
// replaces any definition of the same name made earlier in the compilation.
`undef w
`define w assign
`undef r
`define r always@(posedge clk)
// Time unit 1 ps, time precision 1 ps.
`timescale  1 ps / 1 ps

`include "bram_based_ram.vh"

/* verilator lint_off WIDTHEXPAND  */
/* verilator lint_off WIDTHTRUNC  */
/* verilator lint_off LITENDIAN  */

   
   //
   // Old code
   //
   // `define URAM_LENGTH_THRESHOLD 9000000
   //   parameter  BRAM_VALUE_IN_LUTS    = 200,                                       // Use a BRAM if it saves this many LUTS
   //   parameter  URAM_LENGTH           = DELAY-9,
   //
   //// If BIT_WIDTH and  DELAY will substantially use it, go to ultra ram
   //   parameter  USE_ULTRA_RAM         = (DELAY>65) && ( URAM_LENGTH >= `URAM_LENGTH_THRESHOLD ) || 
   //                                      ( URAM_LENGTH>0 &&
   //                                      ( ( (BIT_WIDTH%72)     >   72 * 7 / 8 ) || ( (BIT_WIDTH%72)     == 0 ) ) &&
   //                                      ( ( (URAM_LENGTH%4096) > 4096 * 7 / 8 ) || ( (URAM_LENGTH%4096) == 0 ) ) ),
   //
   //   parameter  USE_BRAM              = (DELAY>65) && (BRAMS_USED * BRAM_VALUE_IN_LUTS < LUTS_USED) ? 1 : 0,  // Use BRAMs if they are less value than the LUTS 

   
//
// delay -- fixed-latency delay line
//
// out_o is in_i delayed by DELAY rising edges of clk.  There is no reset and
// no enable.  The implementation is selected at elaboration time:
//
//   DELAY <  0                  error_module is instantiated (not defined in
//                               this file; presumably forces an elaboration
//                               error -- confirm)
//   DELAY == 0                  wire
//   DELAY == 1                  single register
//   split                       two cascaded delay instances whose DELAYs add
//                               up to DELAY (see gen_blk_split)
//   USE_ULTRA_RAM               inferred RAM with RAM_STYLE="ultra"
//   USE_BRAM                    inferred block RAM, or the bram_based_ram
//                               wrapper, per DELAY_BRAM_IMPLEMENTATION_TYPE
//   otherwise                   one wide shift register
//
// Parameters intended to be set by the instantiator:
//   DELAY                       latency in clk cycles
//   BIT_WIDTH                   width of in_i and out_o
//   BRAM_DESIRABILITY_PERCENT   0 = never use BRAM, 100 = always use BRAM
//                               (for DELAY>17), 50 = default cost model
//   URAM_DESIRABILITY_PERCENT   same scale, for UltraRAM
// TOTAL_BITS, SHIFTED_BITS, LUTS_USED and BRAMS_USED are derived values that
// happen to be overridable parameters; they are not overridden anywhere in
// this file.
//
(* DONT_TOUCH = "YES" *)
module delay
  #(
     parameter   DELAY                = 0,
     parameter   BIT_WIDTH            = 1,

     // Size of the shift register used by the default implementation.
     parameter   TOTAL_BITS           = BIT_WIDTH*DELAY,
     // Only referenced by the commented-out shift-up variants below.
     parameter   SHIFTED_BITS         = TOTAL_BITS - BIT_WIDTH,

     // Estimated LUT cost of a LUT-based (SRL) implementation.
     parameter   LUTS_USED            = ( (DELAY<=17)   ?  BIT_WIDTH / 2                 :                    // SRL16
                                          /**/             ((DELAY+30)/32) * BIT_WIDTH   ),                   // SRL32

     // Estimated number of BRAMs for a BRAM implementation.  The per-BRAM data
     // width assumed by each formula is 72, 36, 18 and 9 bits respectively.
     parameter   BRAMS_USED           = ( (DELAY<=513)  ?  (BIT_WIDTH+71)/72                       :         // 72 bits per BRAM
                                          (DELAY<=1025) ?  (BIT_WIDTH+35)/36                       :         // 36 bits per BRAM
                                          (DELAY<=2049) ?  (BIT_WIDTH+17)/18                       :         // 18 bits per BRAM
                                          (DELAY<=4097) ?  (BIT_WIDTH+8)/9                         :         //  9 bits per BRAM
                                          /**/             ((BIT_WIDTH+8)/9) * ((DELAY+4095)/4096) ),        //  9 bits per BRAM, several BRAMs deep

     parameter   BRAM_DESIRABILITY_PERCENT    = 50,  // Scale of 0 to 100
     parameter   URAM_DESIRABILITY_PERCENT    = 50,  // Scale of 0 to 100

     // Both thresholds scale linearly with (100 - desirability): at the default
     // of 50 they are 200 LUTs and 87 percent; at 100 they are 0.
     localparam  BRAM_VALUE_IN_LUTS            =  200 * (100-BRAM_DESIRABILITY_PERCENT) / 50,  // Use a BRAM if it saves this many LUTS
     localparam  URAM_FILL_PERCENTAGE          =   87 * (100-URAM_DESIRABILITY_PERCENT) / 50,  // Use URAM if its use is this percentage efficient
     // Number of words stored in the URAM implementation, minus one
     // (same expression as MAX_COUNT in gen_blk_use_ultra_ram).
     localparam  URAM_LENGTH                   =  DELAY-9,
     localparam  URAM_LENGTH_THRESHOLD         =  131072,  // Current bram_based_ram can't handle bigger than 131072
     // "Substantial" means the last 72-bit column / 4096-word row is either
     // completely full or filled above URAM_FILL_PERCENTAGE.
     localparam  USES_SUBSTANTIAL_URAM_WIDTH   =  ((BIT_WIDTH%72)     > 72   * URAM_FILL_PERCENTAGE/100) || ((BIT_WIDTH%72)==0),
     localparam  USES_SUBSTANTIAL_URAM_LENGTH  =  ((URAM_LENGTH%4096) > 4096 * URAM_FILL_PERCENTAGE/100) || ((URAM_LENGTH%4096)==0),

     // Priority chain, first match wins:
     //   very short delays never use URAM; desirability 100 / 0 force the
     //   answer; delays up to 65 do not use URAM; very long delays always do;
     //   otherwise URAM is used only when it would be well filled.
     localparam  USE_ULTRA_RAM         = (  (DELAY<=17)                                                       ?  0  :
                                            (URAM_DESIRABILITY_PERCENT==100)                                  ?  1  :
                                            (URAM_DESIRABILITY_PERCENT==0)                                    ?  0  :
                                            (DELAY<=65)                                                       ?  0  :
                                            (URAM_LENGTH>=URAM_LENGTH_THRESHOLD)                              ?  1  :
                                            (USES_SUBSTANTIAL_URAM_WIDTH && USES_SUBSTANTIAL_URAM_LENGTH)     ?  1  :
                                            /**/                                                                 0  ),

     // Same shape of priority chain for BRAM.  Note that BRAM desirability
     // 100 / 0 is tested before USE_ULTRA_RAM, so both USE_* values can be 1
     // at once; the generate chain below tests USE_ULTRA_RAM first.
     localparam  USE_BRAM              = (  (DELAY<=17)                                                       ?  0  :
                                            (BRAM_DESIRABILITY_PERCENT==100)                                  ?  1  :
                                            (BRAM_DESIRABILITY_PERCENT==0)                                    ?  0  :
                                            (DELAY<=65)                                                       ?  0  :
                                            (USE_ULTRA_RAM)                                                   ?  0  :
                                            (BRAMS_USED * BRAM_VALUE_IN_LUTS < LUTS_USED)                     ?  1  :
                                            /**/                                                                 0  ),

     // Selects which of the two BRAM generate branches is used.
     localparam  DELAY_BRAM_IMPLEMENTATION_TYPE = "XILINX_OPTIMIZED"
     //localparam  DELAY_BRAM_IMPLEMENTATION_TYPE = "INFERENCE"
   )
   (
     input                         clk,
     input  [BIT_WIDTH-1:0]        in_i,
     output [BIT_WIDTH-1:0]        out_o
   );

   //initial $display("DELAY used with BIT_WIDTH=%0d and DELAY=%0d", BIT_WIDTH, DELAY);

   //initial $display("BIT_WIDTH=%0d, DELAY=%0d, LUTS_USED=%0d, BRAMS_USED=%0d, USE_BRAM=%0d",
   //               BIT_WIDTH, DELAY, LUTS_USED, BRAMS_USED, USE_BRAM);

   // Report RAM-based instances.  This runs for every instance whose USE_*
   // parameter is set, including instances that end up in gen_blk_split.
   initial
     begin
        if(USE_ULTRA_RAM)
          $display("delay instantiated with DELAY=%0d, BIT_WIDTH=%0d in ULTRARAM", DELAY, BIT_WIDTH);
        else if(USE_BRAM)
          $display("delay instantiated with DELAY=%0d, BIT_WIDTH=%0d in BRAM", DELAY, BIT_WIDTH);
        //else
         // $display("delay instantiated with size RAM_ITEM_WIDTH=%0d, RAM_ITEM_NUMBER=%0d, RAM_LENGTH=%0d in DISTRIBUTED LUT MEMORY", RAM_ITEM_WIDTH, RAM_ITEM_NUMBER, RAM_LENGTH);
     end

   //
   // If delay is 1.75 times the next lower power of 2, then we can break it up into
   // a delay of 1, 0.5, and 0.25 for a memory savings over implementing a full 2.
   // If it's any higher than that, the memory savings starts to become marginal,
   // and this uses extra logic.
   //
   // Modified because 1.75 often gave higher power without immense BRAM savings.
   // Instead, switch to 1.5 to make sure it is only done when BRAM savings is larger.
   //
   // Also, require BRAMS_USED>20
   //
   // NEXT_LOWER_POWER_OF_2_DELAY is 2**($clog2(DELAY-2)-1), i.e. the largest
   // power of two strictly below DELAY-2 (1 when DELAY<4).
   // BETTER_TO_SPLIT_DELAY is true when DELAY-1 is at most 1.5 times that value.
   //
   localparam NEXT_LOWER_POWER_OF_2_DELAY = (DELAY<4) ? 1 : 1 << ( $clog2(DELAY-2) - 1 );
   localparam BETTER_TO_SPLIT_DELAY       = (BRAMS_USED>20) && (DELAY>513) && ( (DELAY-1) * 2 <= NEXT_LOWER_POWER_OF_2_DELAY * 3 );

   generate
      if(DELAY<0)
        begin : gen_blk_delay_negative
           error_module error_negative_delay_value (clk); // Negative delay value
        end
      else if(DELAY==0)
        begin : gen_blk_delay_zero
           // Zero latency: combinational pass-through.
           `w out_o = in_i;
        end
      else if(DELAY==1)
        begin : gen_blk_delay_1
           // One cycle of latency: a single register.
           reg [BIT_WIDTH-1:0] buffer_r = 0;
           `r buffer_r <= in_i;
           `w out_o = buffer_r;
        end
      else if( ((USE_ULTRA_RAM || USE_BRAM) && BETTER_TO_SPLIT_DELAY ) || ((!USE_ULTRA_RAM && !USE_BRAM) && DELAY>150 ) )
        begin : gen_blk_split
           // Break delay up into a first power-of-2 delay and a second non-power-of-2 delay.
           // The two DELAY values, (NEXT_LOWER_POWER_OF_2_DELAY+1) and
           // (DELAY-NEXT_LOWER_POWER_OF_2_DELAY-1), sum to DELAY.  Each child
           // re-evaluates its own implementation choice; only the desirability
           // knobs and BIT_WIDTH are passed down.
           wire [BIT_WIDTH-1:0] mid_delay_w;

           delay          #( .BRAM_DESIRABILITY_PERCENT  (  BRAM_DESIRABILITY_PERCENT            ),
                             .URAM_DESIRABILITY_PERCENT  (  URAM_DESIRABILITY_PERCENT            ),
                             .DELAY                      (  NEXT_LOWER_POWER_OF_2_DELAY+1        ),
                             .BIT_WIDTH                  (  BIT_WIDTH                            ))
           first_delay     ( .clk                        (  clk                                  ),
                             .in_i                       (  in_i                                 ),
                             .out_o                      (  mid_delay_w                          ));

           delay          #( .BRAM_DESIRABILITY_PERCENT  (  BRAM_DESIRABILITY_PERCENT            ),
                             .URAM_DESIRABILITY_PERCENT  (  URAM_DESIRABILITY_PERCENT            ),
                             .DELAY                      (  DELAY-NEXT_LOWER_POWER_OF_2_DELAY-1  ),
                             .BIT_WIDTH                  (  BIT_WIDTH                            ))
           second_delay    ( .clk                        (  clk                                  ),
                             .in_i                       (  mid_delay_w                          ),
                             .out_o                      (  out_o                                ));

        end
      else if(USE_ULTRA_RAM)
        begin : gen_blk_use_ultra_ram
           // This doesn't work, for unknown reasons.
           //(* srl_style = "block" *) reg [TOTAL_BITS-1:0] buffer_r = 0;
           //`r buffer_r <= { buffer_r[SHIFTED_BITS-1:0], in_i };
           //`w out_o = buffer_r[SHIFTED_BITS +: BIT_WIDTH];

           initial $display("Using Xilinx-Optimized URAM DELAY for DELAY=%0d", DELAY);

           //
           // Circular buffer: one free-running address counter (0..MAX_COUNT)
           // is used for both the write and the read, and the read samples
           // the old contents of the word that is being overwritten.
           //
           // Latency accounting, in clk edges:
           //     2               in_r, in_rr
           //   + 1               write into uram_t
           //   + (MAX_COUNT+1)   until the address recurs and the word is
           //                     captured in read_data_r
           //   + 5               read_data_rr .. read_data_rrrrrr
           //   = MAX_COUNT + 9 = DELAY
           //
           // The counter_rr / counter_rrr stages only pipeline the address;
           // they do not change the latency.
           //
           localparam MAX_COUNT    = DELAY-9;
           localparam COUNTER_BITS = $clog2(MAX_COUNT+1);

           reg [COUNTER_BITS-1:0] counter_r  = 0;
           reg [COUNTER_BITS-1:0] counter_rr = 0;
           reg [COUNTER_BITS-1:0] counter_rrr = 0;

           // Declared one word deeper than the counter ever addresses
           // (highest address used is MAX_COUNT).
           (* RAM_STYLE = "ultra" *)
           reg [BIT_WIDTH-1:0]    uram_t[0:MAX_COUNT+1];

           // Clear every word the counter can address.
           initial
             begin
                integer j;
                for(j=0; j<=MAX_COUNT; j=j+1)
                  uram_t[j] = 0;
             end

           reg [BIT_WIDTH-1:0]    in_r             = 0;
           (* DONT_TOUCH = "YES" *) reg [BIT_WIDTH-1:0]    in_rr            = 0;
           reg [BIT_WIDTH-1:0]    read_data_r      = 0;
           reg [BIT_WIDTH-1:0]    read_data_rr     = 0;
           reg [BIT_WIDTH-1:0]    read_data_rrr    = 0;
           reg [BIT_WIDTH-1:0]    read_data_rrrr   = 0;

           // Note:  Must put don't touch on second-to-last, not last, because
           // the DONT_TOUCH doesn't prevent it from being turned into a wire.
           (* DONT_TOUCH = "YES" *) reg [BIT_WIDTH-1:0]    read_data_rrrrr  = 0;
           reg [BIT_WIDTH-1:0]    read_data_rrrrrr = 0;

           // Input pipeline
           `r in_r <= in_i;
           `r in_rr <= in_r;

           // Address counter, wrapping after MAX_COUNT, and its pipeline
           `r counter_r     <= (counter_r==MAX_COUNT) ? 0 : counter_r+1;
           `r counter_rr    <= counter_r;
           `r counter_rrr   <= counter_rr;

           // Write the new sample ...
           `r uram_t[counter_rrr] <= in_rr;

           // ... while reading the sample previously stored at the same address
           `r read_data_r      <= uram_t[counter_rrr];

           // Output pipeline
           `r read_data_rr     <= read_data_r;
           `r read_data_rrr    <= read_data_rr;
           `r read_data_rrrr   <= read_data_rrr;
           `r read_data_rrrrr  <= read_data_rrrr;
           `r read_data_rrrrrr <= read_data_rrrrr;
           `w out_o = read_data_rrrrrr;
        end
      else if(USE_BRAM && DELAY_BRAM_IMPLEMENTATION_TYPE=="INFERENCE")
        begin : gen_blk_use_bram_inference
           // This doesn't work, for unknown reasons.
           //(* srl_style = "block" *) reg [TOTAL_BITS-1:0] buffer_r = 0;
           //`r buffer_r <= { buffer_r[SHIFTED_BITS-1:0], in_i };
           //`w out_o = buffer_r[SHIFTED_BITS +: BIT_WIDTH];

           //
           // Same circular-buffer scheme as gen_blk_use_ultra_ram, with a
           // shorter pipeline.  Latency accounting, in clk edges:
           //     1               in_r
           //   + 1               write into bram_t
           //   + (MAX_COUNT+1)   until the address recurs and the word is
           //                     captured in read_data_r
           //   + 3               read_data_rr .. read_data_rrrr
           //   = MAX_COUNT + 6 = DELAY
           //
           localparam MAX_COUNT    = DELAY-6;
           localparam COUNTER_BITS = $clog2(MAX_COUNT+1);

           reg [COUNTER_BITS-1:0] counter_r  = 0;
           reg [COUNTER_BITS-1:0] counter_rr = 0;

           // Declared one word deeper than the counter ever addresses
           // (highest address used is MAX_COUNT).
           (* RAM_STYLE = "block" *)  (* cascade_height = 2 *)
           reg [BIT_WIDTH-1:0]    bram_t[0:MAX_COUNT+1];

           // Clear every word the counter can address.
           initial
             begin
                integer j;
                for(j=0; j<=MAX_COUNT; j=j+1)
                  bram_t[j] = 0;
             end

           reg [BIT_WIDTH-1:0]    in_r          = 0;
           reg [BIT_WIDTH-1:0]    read_data_r   = 0;
           reg [BIT_WIDTH-1:0]    read_data_rr  = 0;

           // Note:  Must put don't touch on second-to-last, not last, because
           // the DONT_TOUCH doesn't prevent it from being turned into a wire.
           (* DONT_TOUCH = "YES" *) reg [BIT_WIDTH-1:0] read_data_rrr = 0;
           reg [BIT_WIDTH-1:0]    read_data_rrrr = 0;

           // Input pipeline
           `r in_r <= in_i;

           // Address counter, wrapping after MAX_COUNT, and its pipeline
           `r counter_r  <= (counter_r==MAX_COUNT) ? 0 : counter_r+1;
           `r counter_rr <= counter_r;

           // Write the new sample ...
           `r bram_t[counter_rr] <= in_r;

           // ... while reading the sample previously stored at the same address
           `r read_data_r    <= bram_t[counter_rr];

           // Output pipeline
           `r read_data_rr   <= read_data_r;
           `r read_data_rrr  <= read_data_rr;
           `r read_data_rrrr <= read_data_rrr;
           `w out_o = read_data_rrrr;
        end
      else if(USE_BRAM && DELAY_BRAM_IMPLEMENTATION_TYPE=="XILINX_OPTIMIZED")
        begin : gen_blk_use_bram_xilinx_optimized

           initial $display("Using Xilinx-Optimized BRAM DELAY for DELAY=%0d", DELAY);

           //
           // Circular buffer built on the bram_based_ram wrapper, which is
           // not defined in this file.  The same address drives the read and
           // the write port, the RAM is configured "READ_FIRST", and the
           // counter period is shortened by
           // bram_based_ram_register_delay(BRAM_LENGTH) -- presumably the
           // wrapper's internal pipeline depth, so that RAM latency plus the
           // out_r register adds up to DELAY (confirm against bram_based_ram).
           //
           localparam EXTRA_ADDRESS_REGISTERS  =  1;
           localparam BRAM_LENGTH              =  DELAY-1;
           localparam COUNTER_BITS             =  $clog2(BRAM_LENGTH);

           reg [COUNTER_BITS-1:0] counter_r    =  0;
           reg [COUNTER_BITS-1:0] counter_rr   =  0;

           wire [BIT_WIDTH-1:0]   out_w;
           // Initialised to 0 like every other register in this module, so
           // out_o is not X before the first clock edge in simulation.
           reg [BIT_WIDTH-1:0]   out_r = 0;

           // Address counter and its one-stage pipeline
           `r counter_r  <= (counter_r==DELAY-2-bram_based_ram_register_delay(BRAM_LENGTH)) ? 0 : counter_r+1;
           `r counter_rr <= counter_r;

           bram_based_ram
             #(
                .LENGTH                   (  BRAM_LENGTH              ),
                .WIDTH                    (  BIT_WIDTH                ),
                .EXTRA_ADDRESS_REGISTERS  (  EXTRA_ADDRESS_REGISTERS  ),
                .READ_WRITE_MODE          (  "READ_FIRST"             )
              )
           bram_array
             (
              .clk             (  clk             ),
              .resetn          (  1'b1            ),   // never reset
              .write_enable_i  (  1'b1            ),   // write every cycle
              .write_addr_i    (  counter_rr      ),
              .write_data_i    (  in_i            ),
              .read_addr_i     (  counter_rr      ),
              .read_data_ro    (  out_w           )
              );

           // Final output register
           `r out_r <= out_w;
           `w out_o = out_r;

        end
      else if(USE_BRAM)
        begin : gen_blk_no_bram_implementation
           // DELAY_BRAM_IMPLEMENTATION_TYPE matched neither supported value.
           error_module no_bram_implementation_type_defined(clk);
        end
      else
        begin : gen_blk_default_delay_implementation

           //
           // bit-bit-bit implementation, commented out.  This implementation is great
           // for synthesis, but is very slow in simulation.
           //
           //for(i=0; i<BIT_WIDTH; i=i+1)
           //  begin
           //  reg [DELAY-1:0] buffer_r = 0;
           //   `r buffer_r <= { buffer_r[DELAY-2:0], in_i[i] };
           //   `w out_o[i] = buffer_r[DELAY-1];
           //  end


           //
           // Register implementation.  This implementation is great for simulation,
           // but doesn't synthesize as delay lines, and seems to lock up synthesis.
           // Thus, commented out.
           //
           //reg [BIT_WIDTH-1:0] buffer_r [DELAY-1:0];
           //
           //initial begin
           //   integer j;
           //   for(j=DELAY-1; j>=0; j=j-1)
           //     buffer_r[j] = 0;
           //end
           //
           //for(i=DELAY-1; i>0; i=i-1)
           //  `r buffer_r[i] <= buffer_r[i-1];
           //
           //`r buffer_r[0] <= in_i;
           //
           //`w out_o = buffer_r[DELAY-1];

           //
           // This (original) code shifts by BIT_WIDTH bits in large bit array.
           // It exceeds allowable Vivado register sizes for large BxBFFTs.
           // This prompted me to try the two implementations above, although
           // this implementation works well for both simulation and synthesis.
           // The ultimate solution is to increase the bit width limit of
           // Vivado.  This is done with this TCL command:
           //
           //     set_param synth.elaboration.rodinMoreOptions "rt::set_parameter var_size_limit 16777216"
           //
           // Repeat this below, since the // comments may be removed from customer releases.
           //


           //
           // This is almost the best, but there is a verilator bug that causes it not to work,
           // apparently when the high bit is set on the input.  To solve this, shift down
           // rather than up.
           //
           //reg [TOTAL_BITS-1:0] buffer_r = 0;
           //`r buffer_r <= { buffer_r[SHIFTED_BITS-1:0], in_i };
           //`w out_o = buffer_r[SHIFTED_BITS +: BIT_WIDTH];


           /* 
            * If Vivado won't compile the next statement, use this TCL command in Vivado:
            *
            *   set_param synth.elaboration.rodinMoreOptions "rt::set_parameter var_size_limit 16777216"
            * 
            */
           // One TOTAL_BITS-wide register holding DELAY samples.  Each clock
           // the contents move down by BIT_WIDTH bits, the new sample enters
           // at the top, and the oldest sample is read from the bottom.
           reg [TOTAL_BITS-1:0] buffer_r = 0;
           `r buffer_r <= { in_i, buffer_r[TOTAL_BITS-1:BIT_WIDTH] };
           `w out_o = buffer_r[0 +: BIT_WIDTH];

        end

   endgenerate

endmodule // delay

/* verilator lint_on WIDTHEXPAND  */
/* verilator lint_on WIDTHTRUNC  */
/* verilator lint_on LITENDIAN  */
