//
// Copyright (C) 2021 Ross Martin
//
// Versions
//
// 2021.01.18  Coding begun

`timescale 1ps / 1ps

//
// Fallback macro values.  These defaults are deliberately invalid ("bad.dat", 0) and should
// never actually be used.  FFT must be defined externally.  The rest must either be defined
// externally (e.g. by changing the source with sed) or passed as parameters.
//
// Note: in the portion of the file visible here, the spectrum_pfb parameter list carries its
// own literal defaults and does not reference these macros.
//
//`define FFT_MODULE      "bad_fft.v"
`ifndef FILTER_FILENAME
 `define FILTER_FILENAME  "bad.dat"
`endif

`ifndef NUM_TAPS
 `define NUM_TAPS         0
`endif

`ifndef COMPLEX_FFT_SIZE
 `define COMPLEX_FFT_SIZE 0
`endif

`ifndef COMPLEX_PPC
 `define COMPLEX_PPC      0
`endif

// Verilator lint pragmas: the named warnings are switched off from here on.
/* verilator lint_off WIDTHTRUNC      */
/* verilator lint_off WIDTHEXPAND     */
/* verilator lint_off STMTDLY         */
/* verilator lint_off REALCVT         */
/* verilator lint_off PINCONNECTEMPTY */



// Shorthand used throughout this file:
//   `w  <lhs> = <rhs>;     expands to a continuous assignment  (assign ...)
//   `r  <lhs> <= <rhs>;    expands to a single statement clocked on posedge clk
`define w assign
`define r always@(posedge clk)

//
//  This module is similar to pfb_fir.v, in that both perform a PFB.  One difference is that pfb_fir provides a
//  continuous streaming PFB, while this provides only a single time slice of the PFB output.  Thus this is great
//  for computing a spectrum, and it uses much less memory.  However, it is more limited in that it can't do any
//  time processing of the outputs.  This spectrum_pfb also is the full deal, where pfb_fir leaves out the FFT.
//  This is so that the capture memories can be reused to hold the FFT output, saving memory, for larger possible
//  spectrums.  This module also accepts multiple input streams.  This allows the PFB fir table to be stored only
//  once, saving memory.  It also allows spectrums to be captured simultaneously or with a programmable delay
//  between them, which allows for computation of transfer functions.  Note that the PFB fir table has two ports
//  and thus can be read at different positions a maximum of 2 times.  Thus there are two groups of input signals,
//  the EARLY group and the LATE group.  The EARLY group begins computing a spectrum immediately after reset is
//  released.  The LATE group holds off by the count of the early_to_late_delay counter.
//
//  All EARLY input data streams share a single sync, as do all LATE data streams.  There is an early-to-late delay
//  input (port late_capture_delay_i), which tells how many clocks after the sync+RESET_ADVANCEMENT to delay before
//  capture begins on the LATE data streams.  Processing proceeds, and when the complex spectrum is calculated it is
//  stored back in the original capture memory.  As processing of each stream completes, that stream's done flag is
//  asserted (one bit per stream in spectrum_done_o, EARLY streams first, then LATE).  After the done flag is
//  asserted, data can be read from the associated capture memories by supplying an external address (read_stream_i,
//  read_address_i, read_ppc_i).  The address consists of data stream number plus address plus
//  PPC index.   Since output is complex, imaginary parts have odd PPC indexes.  Output goes through a reordering
//  buffer, so it's in normal FFT order.  That is, PPCindex=0 in the first memory address has the real part of the 
//  first sample  (sample zero).  PPCindex=0 in the second memory address has the real part of the second sample 
//  (sample 1).  PPC=2 in the first memory address (address 0) has the real part of frequency bin
//  COMPLEX_FFT_SIZE/COMPLEX_PPC.
//
//  The process begins again when the sync is again asserted, which starts a new capture.  The data stream number
//  starts at 0 up to NUM_STREAMS_EARLY-1, and then begins with the LATE streams.
//
//  Issues:  Currently only tested on REAL input data streams.  The FFT speed is currently tied to the input speed,
//  and there is no reason this should be the case.
//
//  Note that the read input may be on a lower rate clock.  Nothing is done here to synchronize clocks.  If the
//  lower rate clock is slow enough, this should just work, since any oscillations from incorrectly timed signals
//  hitting a register should die out in two clock periods of the higher rate clock, which may be before the
//  data is ever clocked in on the next lower-rate clock.  In this case the read_sync_in and read_sync_out that
//  are used to match read delay would be unneeded.  Also note that registers on the read path are kept to a
//  minimum to facilitate access with a lower-rate clock.  Thus additional registers might be needed at higher
//  rate clocks, and if there is asynchronous clock access then the registers should be marked asynchronous.
//
    
module spectrum_pfb
  #(
    // These parameters are independent of the filter and FFT, and thus can be defined anytime
    parameter   REAL_WIDTH             = 27,              // bit width of each signed input sample and of read_data_o
    parameter   NUM_STREAMS_EARLY      = 1,               // number of streams in in_data_early_i
    parameter   NUM_STREAMS_LATE       = 1,               // number of streams in in_data_late_i
    parameter   STATIC_SHIFTS          = 64'hAAAAAAAAA,   // not referenced in the visible part of the file; presumably FFT scaling -- TODO confirm
    parameter   NUM_SYNC_READ          = 1,               // width of read_sync_in_i / read_sync_out_o
    parameter   RESET_ADVANCEMENT      = 16,              // length (clocks) of the state / address / sync delay lines below

    // These parameters are tied to the filter, and thus can only be changed as a group from the top level
    // They may also be predefined at this level.  Note that the filter PPC must agree with the FFT PPC,
    // So the filter PPC can't be changed without changing the FFT.
    parameter   FILTER_FILENAME        = "filter__96000fft__48000in_per_fft__7panels__18bits__0dB__12fppc.dat",
    parameter   NUM_TAPS               = 7,

    // These parameters are tied to the FFT, and can only be changed if the FFT is changed, which
    // cannot be changed from the parent but only via a `define.
    parameter   COMPLEX_FFT_SIZE       = 48000,
    parameter   COMPLEX_PPC            = 6,
    parameter   REAL                   = 1,               // nonzero selects REAL_PPC coefficients per clock (see COEF_PPC)
    
    // Everything from here down is either derived from the values above or is a pipeline-latency
    // constant.  They are declared as parameters, so a parent can technically override them;
    // NOTE(review): overriding a derived value inconsistently is presumably unsupported -- confirm.
    parameter   FILTER_COEF_WIDTH      = 18,
    parameter   NUM_STREAMS            = NUM_STREAMS_EARLY + NUM_STREAMS_LATE,
    parameter   REAL_FFT_SIZE          = 2*COMPLEX_FFT_SIZE,
    parameter   FILTER_COEF_FRAC_BITS  = FILTER_COEF_WIDTH-2,
    parameter   REAL_PPC               = 2*COMPLEX_PPC,
    parameter   COEF_PPC               = REAL ? REAL_PPC : COMPLEX_PPC,
    parameter   CLOCKS_PER_FFT         = COMPLEX_FFT_SIZE / COMPLEX_PPC,
    parameter   RAM_LENGTH             = CLOCKS_PER_FFT,
    parameter   FILTER_LENGTH          = NUM_TAPS * CLOCKS_PER_FFT * COEF_PPC,
    parameter   FILTER_CLOCKS          = NUM_TAPS * CLOCKS_PER_FFT,
    // NOTE(review): $clog2(1) is 0, so a single-stream build (NUM_STREAMS==1) gives a
    // zero-width stream select and a [-1:0] read_stream_i port.
    parameter   STREAM_SELECT_WIDTH    = $clog2(NUM_STREAMS),
    parameter   PPC_SELECT_WIDTH       = $clog2(REAL_PPC),
    parameter   FILTER_CLOCK_WIDTH     = $clog2(FILTER_CLOCKS),
    parameter   FILTER_ADDR_WIDTH      = $clog2(FILTER_LENGTH),
    parameter   RAM_ADDR_WIDTH         = $clog2(RAM_LENGTH),
    parameter   DATA_DELAY             = 9,
    parameter   READ_EXTRA_DELAY       = 5,
    parameter   ACCUM_LOOP_DELAY       = 14,              // clocks between accumulator read address and matching write address
    parameter   COEF_DELAY             = 1,
    parameter   ACCUM_DELAY            = 2,
    parameter   ZERO_ACCUM_DELAY       = 8,               // delay applied to the zero-accumulator flags

    parameter  OUTPUT_URAM_DESIRABILITY_PERCENT  = 100,  // Scale of 0 to 100
    parameter  OUTPUT_BRAM_DESIRABILITY_PERCENT  = 50    // Scale of 0 to 100
   )
   (
    input 			    clk,
    input 			    resetn,                // active low: resets the state machines and counters below

    input 			    run_i,                 // low has the same effect as resetn low on the state machines and counters

    // One REAL_PPC-wide vector of samples per stream per clock.
    input signed [REAL_WIDTH-1:0]   in_data_early_i [0:NUM_STREAMS_EARLY-1][0:REAL_PPC-1],
    input signed [REAL_WIDTH-1:0]   in_data_late_i [0:NUM_STREAMS_LATE-1][0:REAL_PPC-1],

    input [31:0] 		    late_capture_delay_i,  // registered, then loaded into the LATE delay counter while in reset

    output [NUM_STREAMS-1:0] 	    spectrum_done_o,       // one done flag per stream

    // Read-back address: stream number, RAM address, and index within the REAL_PPC-wide word.
    input [STREAM_SELECT_WIDTH-1:0] read_stream_i,
    input [RAM_ADDR_WIDTH-1:0] 	    read_address_i,
    input [PPC_SELECT_WIDTH-1:0]    read_ppc_i,
    output signed [REAL_WIDTH-1:0]  read_data_o,
    input [NUM_SYNC_READ-1:0] 	    read_sync_in_i, // Delay between read_sync_in_i and read_sync_out_o matches read delay
    output [NUM_SYNC_READ-1:0] 	    read_sync_out_o
   );

   genvar 			    m;   // generate index over streams

   //
   // Input data is assumed to be continuously streaming time.  No reset is necessary more than resetn.  A separate run_i
   // signal is included in addition to resetn.  It has the same effect.  resetn low resets everything, but so does run_i low.
   // bring both run_i high and resetn high to run.
   //
   
   //
   // States:  EARLY and LATE each have their own states.  They interact some when taking FFTs.
   //
   //    resetn or run_i low:                             reset everything; both state registers are held in STATE_DELAY
   //                                                     until resetn and run_i are both high.
   //
   //    STATE_DELAY:    Entered while resetn or run_i is low.  EARLY leaves on the first clock after release.  LATE leaves
   //                    when late_delay_count_r has gone to zero (late_delay_done_r), i.e. it waits for the programmed
   //                    delay to run out before beginning capture.
   //
   //    STATE_CAPTURE:  (EARLY) Enter on the first clock after release.  (LATE) Enter when late_delay_count_r goes zero.  Exit when 
   //                    capture_count_r reaches its end.  Capture signals, multiply by filter coefficients, accumulate into buffer, 
   //                    store modulo FFT_LENGTH.
   //
   //    STATE_TAKE_FFT: Also has substates counting off streams.  Enter when capture_count_r goes zero.  Start at stream 0.  Go to next
   //                    stream at FFT input as last input into FFT finishes.  Pause between EARLY and LATE until LATE capture_count_r is done.
   //                    exit state when all streams have had their FFTs.  Set output spectrum_done_o for each stream as FFT completes.  Store
   //                    FFT output data back in the buffer the input data came from.  This isn't a problem, since the FFT used has an
   //                    output buffer to do a bit reverse, and this avoids read/write conflicts on the input memory.
   //
   //    STATE_IDLE:     All FFTs complete.  The state registers stay here until resetn or run_i is taken low and released
   //                    again, which restarts capture.
   //
   
   localparam STATE_IDLE     = 0;
   localparam STATE_DELAY    = 1;
   localparam STATE_CAPTURE  = 2;
   localparam STATE_TAKE_FFT = 3;

   genvar                           i;   // generate index over PPC lanes

   // One done flag per stream; drives spectrum_done_o.
   reg [NUM_STREAMS-1:0] 	    spectrum_done_r            = 0;

   // Registered (late_capture_delay_r==0).  Not consumed in the part of the file visible here.
   reg                              late_capture_delay_zero_r  = 0;
   
   reg [1:0]                        early_state_r              = STATE_IDLE;
   reg [1:0]                        late_state_r               = STATE_IDLE;

   // Copies of the state registers delayed by RESET_ADVANCEMENT clocks.
   wire [1:0] 			    early_state_d;
   wire [1:0] 			    late_state_d;

   // *_capture_done_r  : registered compare of the capture count against FILTER_CLOCKS-2 (wraps the counter)
   // *_capture_done_rr : registered compare of the capture count against FILTER_CLOCKS-1 (advances the state)
   // *_fft_done_r      : registered "last stream of this group is one clock from its last FFT input sample"
   reg                              early_capture_done_r       = 0;
   reg                              early_capture_done_rr      = 0;
   reg                              early_fft_done_r           = 0;
   
   reg                              late_capture_done_r        = 0;
   reg                              late_capture_done_rr       = 0;
   reg                              late_delay_done_r          = 0;
   reg                              late_fft_done_r            = 0;


   `w spectrum_done_o = spectrum_done_r;
   
   //
   // EARLY and LATE state machines.  Each is a priority chain (first matching line wins):
   //
   //    resetn or run_i low            -> STATE_DELAY
   //    in STATE_DELAY   (+ condition) -> STATE_CAPTURE
   //    in STATE_CAPTURE and capture finished (…_capture_done_rr) -> STATE_TAKE_FFT
   //    in STATE_TAKE_FFT and group's last FFT fed (…_fft_done_r) -> STATE_IDLE
   //    otherwise hold.
   //
   // EARLY has no extra condition for leaving STATE_DELAY, so it starts capturing on the first
   // clock after release.  LATE additionally waits for late_delay_done_r.
   //
   // The EARLY DELAY->CAPTURE term tests early_state_r.  It previously tested late_state_r, and
   // because this term has priority over the CAPTURE->TAKE_FFT term, EARLY was forced back into
   // STATE_CAPTURE on every clock for as long as LATE was still counting down its delay.  With a
   // LATE delay longer than one capture, the EARLY capture counter wrapped and re-captured instead
   // of proceeding to its FFT.  Start-up timing is unchanged: both machines are in STATE_DELAY on
   // the first clock after release, so EARLY enters STATE_CAPTURE on the same clock as before.
   //
   `r early_state_r  <=  (!resetn) || (!run_i)                                   ?  STATE_DELAY     :
                         (early_state_r==STATE_DELAY)                            ?  STATE_CAPTURE   :
                         (early_state_r==STATE_CAPTURE) && early_capture_done_rr ?  STATE_TAKE_FFT  :
                         (early_state_r==STATE_TAKE_FFT) && early_fft_done_r     ?  STATE_IDLE      :
                         /**/                                                       early_state_r   ;

   `r late_state_r   <=  (!resetn) || (!run_i)                                   ?  STATE_DELAY     :
                         (late_state_r==STATE_DELAY) && late_delay_done_r        ?  STATE_CAPTURE   :
                         (late_state_r==STATE_CAPTURE) && late_capture_done_rr   ?  STATE_TAKE_FFT  :
                         (late_state_r==STATE_TAKE_FFT) && late_fft_done_r       ?  STATE_IDLE      :
                         /**/                                                       late_state_r    ;

   // State copies delayed by RESET_ADVANCEMENT clocks (early_state_d / late_state_d).
   delay #(.DELAY(RESET_ADVANCEMENT), .BIT_WIDTH(2)) early_state_delay  (.clk(clk), .in_i(early_state_r),  .out_o(early_state_d));
   delay #(.DELAY(RESET_ADVANCEMENT), .BIT_WIDTH(2))  late_state_delay  (.clk(clk), .in_i(late_state_r),   .out_o(late_state_d));

   //
   // Stuff for STATE_DELAY
   //
   // late_delay_count_r is loaded with the (registered) late_capture_delay_i for as long as
   // resetn or run_i is low.  After release it counts down by one per clock and sticks at 0.
   // late_delay_done_r is the registered "count is zero" flag that lets the LATE state machine
   // leave STATE_DELAY.
   //
   reg [31:0]                       late_capture_delay_r = 0;
   reg [31:0]                       late_delay_count_r   = 0;

   // Input register for the programmed delay.
   `r late_capture_delay_r <= late_capture_delay_i;

   // Registered "programmed delay is zero" flag (no consumer in the visible part of the file).
   `r late_capture_delay_zero_r <= (late_capture_delay_r==0);
   
   `r late_delay_count_r  <=  (!resetn) || (!run_i)    ?  late_capture_delay_r    :
                              (late_delay_count_r==0)  ?  0                       :
                              /**/                        late_delay_count_r - 1  ;

   // One clock behind the counter: high on the clock after the counter reads 0.
   `r late_delay_done_r <=  (late_delay_count_r==0);
   

   //
   // Stuff for STATE_CAPTURE
   //
   // EARLY and LATE each have an identical set of counters and flags:
   //
   //   *_capture_count_r          counts clocks of the capture (0 .. FILTER_CLOCKS-1); also the coefficient ROM address
   //   *_accum_address_r          address into the capture accumulator RAM, wraps every RAM_LENGTH clocks
   //   *_accum_address_d          the same address ACCUM_LOOP_DELAY clocks later, used for the write-back
   //   *_accum_valid_r            registered "in STATE_CAPTURE"
   //   *_accum_write_enable_w     *_accum_valid_r delayed by ACCUM_LOOP_DELAY-2 clocks
   //   *_zero_accumulator_data_r  registered "capture count is below CLOCKS_PER_FFT" (first pass through the RAM)
   //   *_zero_accumulator_data_d  the same flag ZERO_ACCUM_DELAY clocks later
   //                              (presumably tells the accumulator to start from zero on the first pass;
   //                               the consumer is not in the visible part of the file -- TODO confirm)
   //

   reg [FILTER_CLOCK_WIDTH-1:0]     early_capture_count_r          = 0;  // The capture count is also the filter address
   reg [FILTER_CLOCK_WIDTH-1:0]     late_capture_count_r           = 0;
   reg                              early_zero_accumulator_data_r  = 0;
   reg                              late_zero_accumulator_data_r   = 0;
   reg [RAM_ADDR_WIDTH-1:0]         early_accum_address_r          = 0;  // accum_address is the address into capture accumulator ram
   reg [RAM_ADDR_WIDTH-1:0]         late_accum_address_r           = 0;  // same address is read, then N clocks later written
   reg                              early_accum_address_last_r     = 0;
   reg                              late_accum_address_last_r      = 0;
   wire [RAM_ADDR_WIDTH-1:0]        early_accum_address_d;            // Delayed version for writing.
   wire [RAM_ADDR_WIDTH-1:0]        late_accum_address_d;
   reg                              early_accum_valid_r            = 0;
   wire                             early_accum_write_enable_w;
   reg                              late_accum_valid_r             = 0;
   wire                             late_accum_write_enable_w;
   wire                             early_zero_accumulator_data_d;
   wire                             late_zero_accumulator_data_d;
   reg 				    early_accum_start_r            = 0;
   reg 				    late_accum_start_r             = 0;
   
   // The compares are registered, so each flag is seen one clock after the count value it tests:
   //   *_capture_done_r  (tests FILTER_CLOCKS-2) is high while the count reads FILTER_CLOCKS-1 and wraps the counter;
   //   *_capture_done_rr (tests FILTER_CLOCKS-1) is high one clock later and advances the state machine.
   `r early_capture_done_r   <= (early_capture_count_r==FILTER_CLOCKS-2);
   `r early_capture_done_rr  <= (early_capture_count_r==FILTER_CLOCKS-1);
   `r late_capture_done_r    <= (late_capture_count_r==FILTER_CLOCKS-2);
   `r late_capture_done_rr   <= (late_capture_count_r==FILTER_CLOCKS-1);
   
   // Capture counters: held at 0 outside STATE_CAPTURE, otherwise count up and wrap on *_capture_done_r.
   `r early_capture_count_r <= (!resetn) || (!run_i)          ?  0                          :
                               (early_state_r!=STATE_CAPTURE) ?  0                          :
			       early_capture_done_r           ?  0                          :
                               /**/                              early_capture_count_r + 1  ;

   `r late_capture_count_r  <= (!resetn) || (!run_i)          ?  0                          :
                               (late_state_r!=STATE_CAPTURE)  ?  0                          :
			       late_capture_done_r            ?  0                          :
                               /**/                              late_capture_count_r + 1   ;

   // input is continuous, so checking for address before last will be right after one register delay.
   `r early_accum_address_last_r <= (early_accum_address_r==RAM_LENGTH-2);

   // High (registered) whenever EARLY is not in STATE_CAPTURE; holds the accumulator address at 0.
   `r early_accum_start_r   <= (!resetn) || (!run_i)          ?   1                         :
                               (early_state_r!=STATE_CAPTURE) ?   1                         :
			       /**/                               0                         ;

   // Accumulator RAM address: 0 while starting, otherwise counts up and wraps after RAM_LENGTH-1.
   `r early_accum_address_r <= (!resetn) || (!run_i)          ?  0                          :
			       early_accum_start_r            ?  0                          :
                               early_accum_address_last_r     ?  0                          :
                               /**/                              early_accum_address_r + 1  ;

   `r early_accum_valid_r   <= (early_state_r==STATE_CAPTURE);

   `r early_zero_accumulator_data_r <= (early_capture_count_r<CLOCKS_PER_FFT);

   // input is continuous, so checking for address before last will be right after one register delay.
   `r late_accum_address_last_r <= (late_accum_address_r==RAM_LENGTH-2);

   // LATE copies of the start flag and accumulator address; same structure as EARLY above.
   `r late_accum_start_r    <= (!resetn) || (!run_i)          ?   1                         :
                               (late_state_r!=STATE_CAPTURE)  ?   1                         :
			       /**/                               0                         ;

   `r late_accum_address_r <= (!resetn) || (!run_i)           ?  0                         :
			       late_accum_start_r             ?  0                          :
                               late_accum_address_last_r      ?  0                          :
                               /**/                              late_accum_address_r + 1   ;

   `r late_accum_valid_r   <= (late_state_r==STATE_CAPTURE);
   
   `r late_zero_accumulator_data_r <= (late_capture_count_r<CLOCKS_PER_FFT);

   // Write-side copies: the address is delayed by the full accumulate loop, the write enable by two clocks less.
   delay #(.DELAY(ACCUM_LOOP_DELAY),   .BIT_WIDTH(RAM_ADDR_WIDTH)) early_accum_addr_delay (.clk(clk), .in_i(early_accum_address_r), .out_o(early_accum_address_d));
   delay #(.DELAY(ACCUM_LOOP_DELAY),   .BIT_WIDTH(RAM_ADDR_WIDTH)) late_accum_addr_delay  (.clk(clk), .in_i(late_accum_address_r),  .out_o(late_accum_address_d));
   delay #(.DELAY(ACCUM_LOOP_DELAY-2), .BIT_WIDTH(1))              early_accum_we         (.clk(clk), .in_i(early_accum_valid_r),   .out_o(early_accum_write_enable_w));
   delay #(.DELAY(ACCUM_LOOP_DELAY-2), .BIT_WIDTH(1))              late_accum_we          (.clk(clk), .in_i(late_accum_valid_r),    .out_o(late_accum_write_enable_w));

   delay #(.DELAY(ZERO_ACCUM_DELAY),   .BIT_WIDTH(1))              early_zero_accum_delay (.clk(clk), .in_i(early_zero_accumulator_data_r), .out_o(early_zero_accumulator_data_d));
   delay #(.DELAY(ZERO_ACCUM_DELAY),   .BIT_WIDTH(1))              late_zero_accum_delay  (.clk(clk), .in_i(late_zero_accumulator_data_r),  .out_o(late_zero_accumulator_data_d));

   
   //
   // Stuff for STATE_TAKE_FFT
   //
   // FFT input side: fft_count_r walks the capture RAM (0 .. CLOCKS_PER_FFT-1) once per stream while
   // either group is in STATE_TAKE_FFT, and input_stream_r selects which stream's RAM is being fed
   // to the FFT.
   //
   
   reg [STREAM_SELECT_WIDTH-1:0]    input_stream_r            = 0;   // stream currently being read into the FFT
   wire [STREAM_SELECT_WIDTH-1:0]   input_stream_d;                  // input_stream_r delayed by RESET_ADVANCEMENT

   reg                              in_fft_state_r            = 0;
   reg                              fft_last_sample_r         = 0;

   reg [RAM_ADDR_WIDTH-1:0]         fft_count_r               = 0;
   wire [RAM_ADDR_WIDTH-1:0]        fft_read_address_w;              // fft_count_r delayed by RESET_ADVANCEMENT

   reg                              fft_in_sync_r             = 0;
   wire 			    fft_in_sync_w; 			    
   wire 			    fft_in_sync_d; 			    

   // FFT Input

   // input_stream_r is synchronous with fft_count_r, so when fft_count_r wraps to 0 input_stream_r increments simultaneously
   // It is cleared by reset/run_i low and whenever EARLY is capturing.
   `r input_stream_r <= (!resetn) || (!run_i)           ?  0                          :
                        (early_state_r==STATE_CAPTURE)  ?  0                          :
                        fft_last_sample_r               ?  input_stream_r + 1         :
                        /**/                               input_stream_r             ;

   `r fft_last_sample_r  <= (fft_count_r==CLOCKS_PER_FFT-2);         // 1 on sample CLOCKS_PER_FFT-1, or when no FFT in progress

   // in_fft_state_r is 1 for fft_count_r 0 through CLOCKS_PER_FFT-1.  On the next fft_count_r 0, it goes low, which forces
   // fft_count_r to stay 0 (see the final branch of the fft_count_r assignment), which holds fft_in_sync_r high.
   `r in_fft_state_r <= ((early_state_r==STATE_TAKE_FFT) && !early_fft_done_r) || ((late_state_r==STATE_TAKE_FFT) && !late_fft_done_r);

   // These done flags are 1 on the last sample of the associated FFT.  Then the FFT state changes on the first sample
   // of what would be the next FFT.  sync is asserted on this sample because it's the first sample.  in_fft_state_r changes on
   // the first sample.  Then on the second sample fft_count_r changes to CLOCKS_PER_FFT-2.
   // EARLY is done after stream NUM_STREAMS_EARLY-1, LATE after the final stream overall.
   `r early_fft_done_r <= (fft_count_r==CLOCKS_PER_FFT-2) && (input_stream_r==NUM_STREAMS_EARLY-1);
   `r late_fft_done_r  <= (fft_count_r==CLOCKS_PER_FFT-2) && (input_stream_r==NUM_STREAMS_EARLY+NUM_STREAMS_LATE-1);
   
   // Counts up while in_fft_state_r is high, wraps after the last sample, and is held at 0 otherwise.
   `r fft_count_r <= (!resetn) || (!run_i)                ?  0                        :
                     in_fft_state_r && fft_last_sample_r  ?  0                        :
                     in_fft_state_r                       ?  fft_count_r + 1          :
                     /**/                                    0                        ;

   `w fft_in_sync_w = (fft_count_r==0);  // Synchronous with fft_count_r and input_stream_r
   `r fft_in_sync_r <= fft_in_sync_w;    // This is further delayed by the MEM_READ_DELAY before going to the FFT

   // These read signals go to the RAM.  They are valid RESET_ADVANCEMENT clocks after fft_in_sync_r is high.  Delay is
   // RESET_ADVANCEMENT rather than RESET_ADVANCEMENT-1 because fft_in_sync_r lags behind them one clock.
   delay #(.DELAY(RESET_ADVANCEMENT), .BIT_WIDTH(STREAM_SELECT_WIDTH)) fft_stream_delay (.clk(clk), .in_i(input_stream_r), .out_o(input_stream_d));
   delay #(.DELAY(RESET_ADVANCEMENT), .BIT_WIDTH(RAM_ADDR_WIDTH))      fft_sync_delay   (.clk(clk), .in_i(fft_count_r),    .out_o(fft_read_address_w));
   bxb_sync_delay_2 #(.DELAY(2), .BIT_WIDTH(1))      fft_sync_align_delay   (.clk(clk), .resetn(resetn), .in_i(fft_in_sync_r),    .out_o(fft_in_sync_d));

   // FFT Output
   //
   // Write-back side: the FFT's output sync (fft_out_sync_w, driven outside the part of the file
   // visible here) is delayed and used to run fft_write_count_r, the write address into the capture
   // RAM.  output_stream_r tracks which stream's RAM is being written, and fft_write_r carries one
   // write-enable bit per stream.

   reg [NUM_STREAMS-1:0]            fft_write_r               = 0;
   wire [NUM_STREAMS-1:0] 	    fft_write_d;
   reg                              late_fft_write_r          = 0;   // not used in the visible part of the file
   reg [STREAM_SELECT_WIDTH-1:0]    output_stream_r           = 0;
   reg [RAM_ADDR_WIDTH-1:0]         fft_write_count_r         = 0;
   wire [RAM_ADDR_WIDTH-1:0]        fft_out_address_w;
   wire                             fft_out_sync_w;
   wire                             fft_out_sync_d;
   reg                              fft_out_sync_dr           = 1;
   reg                              fft_write_last_sample_r   = 0;
   

   bxb_sync_delay_2 #(.DELAY(RESET_ADVANCEMENT-1), .BIT_WIDTH(1)) rb_reset (.clk(clk), .resetn(resetn), .in_i(fft_out_sync_w), .out_o(fft_out_sync_d));

   `r fft_out_sync_dr <= fft_out_sync_d;

   // Per-stream write enable.  Writing is suppressed only while the delayed sync and its registered
   // copy are both high.  Stream m is selected either because it is the current output stream and
   // this is not the last sample, or because the current output stream is m-1 on its last sample
   // (output_stream_r is about to increment to m).
   generate
      for(m=0; m<NUM_STREAMS; m=m+1)
        begin
           `r fft_write_r[m] <= ((!fft_out_sync_d) || (!fft_out_sync_dr)) && (((output_stream_r==m)&&!fft_write_last_sample_r) || ((output_stream_r==m-1)&&fft_write_last_sample_r));
        end
   endgenerate

   // Zero-length alignment delay: a placeholder for adjusting write-enable timing.
   delay #(.DELAY(0), .BIT_WIDTH(NUM_STREAMS)) fft_write_align_delay  (.clk(clk), .in_i(fft_write_r), .out_o(fft_write_d));

   //
   // output_stream_r, fft_write_count_r, and fft_write_r are all synchronous
   //
   // The write address is held at 0 while fft_out_sync_dr is high and increments otherwise.
   `r fft_write_count_r <= fft_out_sync_dr ?  0 : fft_write_count_r + 1;

   `r fft_write_last_sample_r <= (fft_write_count_r==CLOCKS_PER_FFT-2);
   
   // Cleared by reset/run_i low and whenever EARLY is capturing; advances after each stream's last sample.
   `r output_stream_r <= (!resetn) || (!run_i)           ?  0                           :
                         (early_state_r==STATE_CAPTURE)  ?  0                           :
                         fft_write_last_sample_r         ?  output_stream_r + 1         :
                         /**/                               output_stream_r             ;

   // Done flags are sticky: set when the stream's last FFT output sample is flagged, cleared only by
   // resetn or run_i low.
   generate
      for(m=0; m<NUM_STREAMS; m=m+1)
	begin
	   `r spectrum_done_r[m] <= ( (!resetn) || (!run_i)                            ?  0                   :
				      fft_write_last_sample_r && (m==output_stream_r)  ?  1                   :
				      /**/                                                spectrum_done_r[m]  );
	end
   endgenerate

   /////////////////////////////////////////////////////////////////////////
   //
   //  PFB Coefficient memories
   //
   //    Get the coefficients from the tables.  Use the fact we
   //    can read twice simultaneously from the same ROM to give us 
   //    two different delays in the coefficients.
   //
   //    Port 0 is addressed by the EARLY capture count and port 1 by
   //    the LATE capture count, each through two pipeline registers.
   //    The ROM outputs are registered once more into coef_*_r.
   //
   /////////////////////////////////////////////////////////////////////////

   reg [FILTER_CLOCK_WIDTH-1:0]    coef_rom_addr_0_r       = 0;
   reg [FILTER_CLOCK_WIDTH-1:0]    coef_rom_addr_0_rr      = 0;
   //reg [FILTER_CLOCK_WIDTH-1:0]    coef_rom_addr_0_rrr     = 0;

   reg [FILTER_CLOCK_WIDTH-1:0]    coef_rom_addr_1_r       = 0;
   reg [FILTER_CLOCK_WIDTH-1:0]    coef_rom_addr_1_rr      = 0;
   //reg [FILTER_CLOCK_WIDTH-1:0]    coef_rom_addr_1_rrr     = 0;

   // NOTE(review): these arrays have COEF_PPC+1 entries (indices 0..COEF_PPC), while the ROM is
   // parameterized with ROM_ITEM_NUMBER = COEF_PPC and the register loop below only covers
   // indices 0..COEF_PPC-1.  The purpose of the extra entry is not visible here -- confirm.
   wire [FILTER_COEF_WIDTH-1:0]    coef_early_w [0:COEF_PPC-1+1];
   wire [FILTER_COEF_WIDTH-1:0]    coef_late_w [0:COEF_PPC-1+1];

   reg [FILTER_COEF_WIDTH-1:0]     coef_early_r [0:COEF_PPC-1+1];
   reg [FILTER_COEF_WIDTH-1:0]     coef_late_r [0:COEF_PPC-1+1];

   // Add sufficient registers to pipeline the read address all across the chip.

   `r coef_rom_addr_0_r      <= early_capture_count_r;
   `r coef_rom_addr_0_rr     <= coef_rom_addr_0_r;
   //`r coef_rom_addr_0_rrr    <= coef_rom_addr_0_rr;

   `r coef_rom_addr_1_r      <= late_capture_count_r;
   `r coef_rom_addr_1_rr     <= coef_rom_addr_1_r;
   //`r coef_rom_addr_1_rrr    <= coef_rom_addr_1_rr;

   //initial begin $display("%s: clocks=%0d ppc=%0d", FILTER_FILENAME, FILTER_CLOCKS, COEF_PPC); $finish; end

   // Two-port coefficient ROM: FILTER_CLOCKS rows of COEF_PPC coefficients, loaded from FILTER_FILENAME.
   spectrum_pfb_two_port_rom #( .ROM_FILENAME       ( FILTER_FILENAME          ),
                                .ROM_FILE_OFFSET    ( 0                        ),
                                .ROM_FILE_LENGTH    ( FILTER_CLOCKS            ),
                                .ROM_ITEM_WIDTH     ( FILTER_COEF_WIDTH        ),
                                .ROM_ITEM_NUMBER    ( COEF_PPC                 ),
                                .ROM_LENGTH         ( FILTER_CLOCKS            ))
   coef_rom     ( .clk                ( clk                      ),
                  .addr_0_i           ( coef_rom_addr_0_rr       ),
                  .addr_1_i           ( coef_rom_addr_1_rr       ),
                  .data_0_ro          ( coef_early_w             ),
                  .data_1_ro          ( coef_late_w              ));

   // Output register stage for each coefficient lane.
   generate
      for(i=0; i<COEF_PPC; i=i+1)
        begin
           `r coef_early_r[i] <= coef_early_w[i];
           `r coef_late_r[i]  <= coef_late_w[i];
        end
   endgenerate
   
`ifdef DEBUG
   //
   // Debug-only scalar taps on the first two coefficient lanes of each ROM port.
   // Each tap is declared and driven in one net declaration assignment.
   //
   wire [FILTER_COEF_WIDTH-1:0]      coef_early_0_w = coef_early_w[0];
   wire [FILTER_COEF_WIDTH-1:0]      coef_early_1_w = coef_early_w[1];
   wire [FILTER_COEF_WIDTH-1:0]      coef_late_0_w  = coef_late_w[0];
   wire [FILTER_COEF_WIDTH-1:0]      coef_late_1_w  = coef_late_w[1];
`endif




   
   /////////////////////////////////////////////////////////////////////////
   //
   //  Data memories.  Separate groups for EARLY vs LATE, and separate
   //                  for each STREAM.
   //
   //           In STATE_CAPTURE, these act as accumulators of the
   //           product of filter_coefs * input_data.
   //
   //           In STATE_TAKE_FFT, data is streamed out into the FFT
   //           and then streamed back in.
   //
   //           In STATE_IDLE, these are read from an external address
   //           with write disabled.
   //
   /////////////////////////////////////////////////////////////////////////

   // Registered copies of the external read-back address.
   reg [STREAM_SELECT_WIDTH-1:0]     read_stream_r                  = 0;
   reg [RAM_ADDR_WIDTH-1:0]          read_address_r                 = 0;
   reg [PPC_SELECT_WIDTH-1:0] 	     read_ppc_r                     = 0;
   // Registered read data from every stream's RAM, and the accumulator result to be written back.
   reg signed [REAL_WIDTH-1:0]       dpr_read_data_each_stream_r [0:NUM_STREAMS-1][0:REAL_PPC-1];
   reg signed [REAL_WIDTH-1:0]       accumulator_out_r  [0:NUM_STREAMS-1][0:REAL_PPC-1];
   // Stream select for the memory-to-FFT path: registered, then delayed to line up with the RAM read data.
   reg [STREAM_SELECT_WIDTH-1:0]     fft_mem_select_r               = 0;
   wire [STREAM_SELECT_WIDTH-1:0]    fft_mem_select_d_pre;
   wire [STREAM_SELECT_WIDTH-1:0]    fft_mem_select_d;
   reg signed [REAL_WIDTH-1:0]       mem_to_fft_r  [0:REAL_PPC-1];
   reg signed [REAL_WIDTH-1:0]       fft_out_data_r  [0:REAL_PPC-1];
   wire signed [REAL_WIDTH-1:0]      fft_out_data_w  [0:REAL_PPC-1];
   reg 				     fft_in_sync_mem_delayed_r      = 1;
   // read_sync_in_i is registered twice before being passed through the RAM's sync path.
   reg [NUM_SYNC_READ-1:0] 	     read_sync_in_r                 = 0;
   reg [NUM_SYNC_READ-1:0] 	     read_sync_in_rr                = 0;
   reg [NUM_SYNC_READ-1:0] 	     read_sync_out_r                = 0;

   `r read_sync_in_r  <= read_sync_in_i;
   `r read_sync_in_rr <= read_sync_in_r;
   `r read_ppc_r      <= read_ppc_i;
   `r read_stream_r   <= read_stream_i;
   `r read_address_r  <= read_address_i;

   // Delay the stream select by RESET_ADVANCEMENT+2 clocks.  The delay line must be as wide as the
   // select itself: with BIT_WIDTH(1) only bit 0 was carried through, so any configuration with
   // more than two streams lost the upper select bits.  With the default two streams
   // (STREAM_SELECT_WIDTH == 1) the behaviour is unchanged.
   delay #(.DELAY(RESET_ADVANCEMENT+2), .BIT_WIDTH(STREAM_SELECT_WIDTH))  fft_memsel_align_delay (.clk(clk), .in_i(fft_mem_select_r), .out_o(fft_mem_select_d_pre));

   generate

      // Power-up values for the per-stream accumulator and read-data register arrays.
      // (Unpacked arrays cannot take a declaration initializer the way the scalar regs above do.)
      initial
	begin
	   integer ii;
	   integer mm;
	   for(mm=0; mm<NUM_STREAMS; mm=mm+1)
	     for(ii=0; ii<REAL_PPC; ii=ii+1)
	       begin
		  accumulator_out_r[mm][ii]           = 0;
		  dpr_read_data_each_stream_r[mm][ii] = 0;
	       end
	end

      // Power-up value for the memory-to-FFT register array.
      initial
	begin
	   integer ii;
	   for(ii=0; ii<REAL_PPC; ii=ii+1)
	     mem_to_fft_r[ii] = 0;
	end
      
      // Register the FFT output, one register per PPC lane (fft_out_data_w is driven outside the
      // part of the file visible here).
      for(i=0; i<REAL_PPC; i=i+1)
        `r fft_out_data_r[i] <= fft_out_data_w[i];

      //
      // One capture/spectrum RAM per EARLY stream, with its address, enable and data pipelines.
      //
      //   Read address  : accumulator address while capturing, FFT read address while the delayed
      //                   state is STATE_TAKE_FFT, otherwise the external read address.
      //   Write address : delayed accumulator address while the accumulator write enable is high,
      //                   otherwise the FFT write-back counter.
      //   Write data    : accumulator output while the accumulator write enable is high,
      //                   otherwise the registered FFT output.
      //
      // Each of these is registered twice (_r, _rr) before reaching the RAM.
      //
      // Iteration m==0 additionally owns the signals that are shared by all streams: the
      // fft_mem_select_r register and the sync/select bundle that rides through the RAM's sync
      // path so that it comes out aligned with the read data.
      //
      for(m=0; m<NUM_STREAMS_EARLY; m=m+1)
        begin

           reg [RAM_ADDR_WIDTH-1:0]         dpr_read_address_r                  = 0;
           reg [RAM_ADDR_WIDTH-1:0]         dpr_read_address_rr                 = 0;

           reg [RAM_ADDR_WIDTH-1:0]         dpr_write_address_r                 = 0;
           reg [RAM_ADDR_WIDTH-1:0]         dpr_write_address_rr                = 0;
           
           reg                              dpr_write_enable_r                  = 0;
           reg                              dpr_write_enable_rr                 = 0;
           
           reg                              early_accum_write_enable_r          = 0;

           reg [REAL_WIDTH-1:0]             dpr_write_data_r[0:REAL_PPC-1];
           reg signed [REAL_WIDTH-1:0]      dpr_write_data_rr[0:REAL_PPC-1];

           // Power-up values for the write-data pipeline arrays.
	   initial
	     begin
		integer ii;

		for(ii=0; ii<REAL_PPC; ii=ii+1)
		  begin
                     dpr_write_data_r[ii]  = 0;
                     dpr_write_data_rr[ii] = 0;
		  end
	     end

           
           `r early_accum_write_enable_r <= early_accum_write_enable_w;
           
           // Capture uses the undelayed state; the FFT read uses the RESET_ADVANCEMENT-delayed state
           // to match fft_read_address_w, which is delayed by the same amount.
           `r dpr_read_address_r   <=  (early_state_r==STATE_CAPTURE)   ?  early_accum_address_r  :
                                       (early_state_d==STATE_TAKE_FFT)  ?  fft_read_address_w     :
                                       /**/                                read_address_r         ;           
           `r dpr_read_address_rr  <=  dpr_read_address_r;

           `r dpr_write_address_r  <=  early_accum_write_enable_r ?  early_accum_address_d : fft_write_count_r;
           `r dpr_write_address_rr <=  dpr_write_address_r;
           
           // FFT write-back is accepted in STATE_TAKE_FFT and STATE_IDLE, gated by this stream's fft_write_d bit.
           `r dpr_write_enable_r   <=  early_accum_write_enable_r || (((early_state_r==STATE_TAKE_FFT) || (early_state_r==STATE_IDLE)) && fft_write_d[m]);
           `r dpr_write_enable_rr  <=  dpr_write_enable_r;

           for(i=0; i<REAL_PPC; i=i+1)
             begin
                `r dpr_write_data_r[i]     <= early_accum_write_enable_r ? accumulator_out_r[m][i] : fft_out_data_r[i];
                `r dpr_write_data_rr[i]    <= dpr_write_data_r[i];
             end
           
           wire signed [REAL_WIDTH-1:0] dpr_read_data_this_ram_w [0:REAL_PPC-1];

           // Sync bundle carried through the RAM: { read sync, stream select, FFT input sync }.
           // Only stream 0 carries real data; the other streams feed zeros.
           wire [NUM_SYNC_READ+STREAM_SELECT_WIDTH:0]  fft_sync_data_in_w;
           wire [NUM_SYNC_READ+STREAM_SELECT_WIDTH:0]  fft_sync_data_out_w;

           `w fft_sync_data_in_w = (m==0) ? { read_sync_in_rr, fft_mem_select_d_pre, fft_in_sync_d } : 0;
           if(m==0)
             begin : label_0
                // fft_mem_select_r is a single module-level register, so it is driven from this
                // one iteration only.  It used to be assigned in the loop body, which produced one
                // driver per EARLY stream (multiply-driven register when NUM_STREAMS_EARLY > 1).
                // NOTE(review): if the LATE loop further down also assigns fft_mem_select_r, that
                // assignment needs the same treatment -- confirm.
                `r fft_mem_select_r           <= input_stream_r;

                `w fft_mem_select_d            = fft_sync_data_out_w[STREAM_SELECT_WIDTH:1];
                `r fft_in_sync_mem_delayed_r  <= !resetn ? 1 : fft_sync_data_out_w[0];
		`r read_sync_out_r            <= fft_sync_data_out_w[STREAM_SELECT_WIDTH+NUM_SYNC_READ:STREAM_SELECT_WIDTH+1];
             end
           
           dual_port_ram_async_s #( .RAM_ITEM_WIDTH  ( REAL_WIDTH                           ),
                                  .RAM_ITEM_NUMBER ( REAL_PPC                             ),
                                  .RAM_LENGTH      ( RAM_LENGTH                           ),
                                  .NUM_SYNC        ( 1+STREAM_SELECT_WIDTH+NUM_SYNC_READ  ),
				  .USE_ULTRA_RAM   ( 1                                    ))
           dpr                  ( .clk_w           ( clk                                  ),
                                  .write_enable_i  ( dpr_write_enable_rr                  ),
                                  .write_addr_i    ( dpr_write_address_rr                 ),
                                  .write_data_i    ( dpr_write_data_rr                    ),
                                  .clk_r           ( clk                                  ),
                                  .resetn_r        ( resetn                               ),
                                  .read_addr_i     ( dpr_read_address_rr                  ),
                                  .read_data_ro    ( dpr_read_data_this_ram_w             ),
                                  .in_sync_i       ( fft_sync_data_in_w                   ),
                                  .out_sync_o      ( fft_sync_data_out_w                  ));

           // Register this RAM's read data into the shared per-stream array.
           for(i=0; i<REAL_PPC; i=i+1)
             begin
                `r dpr_read_data_each_stream_r[m][i] <= dpr_read_data_this_ram_w[i];
             end
           
`ifdef DEBUG
           // Debug-only scalar aliases of this stream's RAM ports.
           wire                              write_enable;
           wire [RAM_ADDR_WIDTH-1:0]         write_addr;
           wire signed [REAL_WIDTH-1:0]      value_write_ppc_0;
           wire signed [REAL_WIDTH-1:0]      value_write_ppc_1;
           wire [RAM_ADDR_WIDTH-1:0]         read_addr;
           wire signed [REAL_WIDTH-1:0]      value_read_ppc_0;

           `w write_enable      = dpr_write_enable_rr;
           `w write_addr        = dpr_write_address_rr;
           `w value_write_ppc_0 = dpr_write_data_rr[0];
           `w value_write_ppc_1 = dpr_write_data_rr[1];
           `w read_addr         = dpr_read_address_rr;
           `w value_read_ppc_0  = dpr_read_data_this_ram_w[0];
`endif

        end // for (m=0; m<NUM_STREAMS_EARLY; m=m+1)
      
      
      // Capture/FFT memory for the LATE stream group: one dual-port RAM per late stream.
      //
      // Write port: selects between the accumulator result for this stream (while
      // late_accum_write_enable_r is set) and the FFT output data (fft_out_data_r).
      // Read port:  selects between the accumulate address, the FFT read address and
      // the external read address, depending on the late state machine.
      // Address, enable and data are each registered twice (_r, _rr) before the RAM.
      //
      // Late streams occupy indices NUM_STREAMS_EARLY+m in the per-stream arrays
      // (fft_write_d, accumulator_out_r, dpr_read_data_each_stream_r).
      for(m=0; m < NUM_STREAMS_LATE; m = m + 1)
        begin

           reg [RAM_ADDR_WIDTH-1:0]         dpr_read_address_r                  = 0;
           reg [RAM_ADDR_WIDTH-1:0]         dpr_read_address_rr                 = 0;

           reg [RAM_ADDR_WIDTH-1:0]         dpr_write_address_r                 = 0;
           reg [RAM_ADDR_WIDTH-1:0]         dpr_write_address_rr                = 0;
           
           reg                              dpr_write_enable_r                  = 0;
           reg                              dpr_write_enable_rr                 = 0;
           
           reg                              late_accum_write_enable_r           = 0;

           // NOTE(review): dpr_write_data_r is declared unsigned while dpr_write_data_rr is
           // signed.  Both are REAL_WIDTH wide, so the copy between them moves the same bits.
           reg [REAL_WIDTH-1:0]             dpr_write_data_r[0:REAL_PPC-1];
           reg signed [REAL_WIDTH-1:0]      dpr_write_data_rr[0:REAL_PPC-1];

           // NOTE(review): the LATE write enable is registered from early_accum_write_enable_w.
           // Every other control in this loop uses late_* signals; confirm that reusing the
           // early enable here is intended and not a copy/paste leftover.
           `r late_accum_write_enable_r <= early_accum_write_enable_w;
           
           // Read address priority: capture (accumulate) address first, then the FFT read
           // address, otherwise the external read address.  Note the first test uses
           // late_state_r and the second uses late_state_d.
           `r dpr_read_address_r   <=  (late_state_r==STATE_CAPTURE)   ?  late_accum_address_r   :
                                       (late_state_d==STATE_TAKE_FFT)  ?  fft_read_address_w     :
                                       /**/                               read_address_r         ;           
           `r dpr_read_address_rr  <=  dpr_read_address_r;

           // Write address: accumulate address while accumulating, else the FFT write counter.
           `r dpr_write_address_r  <=  late_accum_write_enable_r ?  late_accum_address_d : fft_write_count_r;
           `r dpr_write_address_rr <=  dpr_write_address_r;
           
           // Write when accumulating, or when in TAKE_FFT/IDLE and the FFT write strobe for
           // this stream's slot (m+NUM_STREAMS_EARLY) is set.
           `r dpr_write_enable_r   <=  late_accum_write_enable_r || (((late_state_r==STATE_TAKE_FFT) || (late_state_r==STATE_IDLE)) && fft_write_d[m+NUM_STREAMS_EARLY]);
           `r dpr_write_enable_rr  <=  dpr_write_enable_r;

           // Per-lane write data mux, followed by one more register stage.
           for(i=0; i<REAL_PPC; i=i+1)
             begin
                `r dpr_write_data_r[i]     <= late_accum_write_enable_r ? accumulator_out_r[m+NUM_STREAMS_EARLY][i] : fft_out_data_r[i];
                `r dpr_write_data_rr[i]    <= dpr_write_data_r[i];
             end
           
           wire signed [REAL_WIDTH-1:0] dpr_read_data_this_ram_w [0:REAL_PPC-1];

           // The sync side channel is unused for late streams: input tied to 0, output left open.
           // (In the early loop only the m==0 RAM carries the sync bits.)
           dual_port_ram_async_s #( .RAM_ITEM_WIDTH  ( REAL_WIDTH                  ),
                                  .RAM_ITEM_NUMBER ( REAL_PPC                    ),
                                  .RAM_LENGTH      ( RAM_LENGTH                  ),
                                  .NUM_SYNC        ( 1                           ),
				  .USE_ULTRA_RAM   ( 1                           ))
           dpr                  ( .clk_w           ( clk                         ),
                                  .write_enable_i  ( dpr_write_enable_rr         ),
                                  .write_addr_i    ( dpr_write_address_rr        ),
                                  .write_data_i    ( dpr_write_data_rr           ),
                                  .clk_r           ( clk                         ),
                                  .resetn_r        ( resetn                      ),
                                  .read_addr_i     ( dpr_read_address_rr         ),
                                  .read_data_ro    ( dpr_read_data_this_ram_w    ),
                                  .in_sync_i       ( 1'b0                        ),
                                  .out_sync_o      (                             ));

           // Register the RAM read data into the shared per-stream read array.
           for(i=0; i<REAL_PPC; i=i+1)
             begin
                `r dpr_read_data_each_stream_r[NUM_STREAMS_EARLY+m][i] <= dpr_read_data_this_ram_w[i];
             end

`ifdef DEBUG
           // Scalar aliases of the RAM port signals, convenient for waveform viewing.
           // NOTE(review): value2_write_ppc_1 reads lane 1, so this assumes REAL_PPC >= 2.
           wire                              write2_enable;
           wire [RAM_ADDR_WIDTH-1:0]         write2_addr;
           wire signed [REAL_WIDTH-1:0]      value2_write_ppc_0;
           wire signed [REAL_WIDTH-1:0]      value2_write_ppc_1;
           wire [RAM_ADDR_WIDTH-1:0]         read2_addr;
           wire signed [REAL_WIDTH-1:0]      value2_read_ppc_0;

           `w write2_enable      = dpr_write_enable_rr;
           `w write2_addr        = dpr_write_address_rr;
           `w value2_write_ppc_0 = dpr_write_data_rr[0];
           `w value2_write_ppc_1 = dpr_write_data_rr[1];
           `w read2_addr         = dpr_read_address_rr;
           `w value2_read_ppc_0  = dpr_read_data_this_ram_w[0];
`endif
           
        end
      

      // Feed the FFT: every clock, each lane i takes the registered RAM read data of the
      // stream currently selected by fft_mem_select_d.
      for(i=0; i<REAL_PPC; i=i+1)
        begin
           `r mem_to_fft_r[i] <= dpr_read_data_each_stream_r[fft_mem_select_d][i];
        end

      // External read port: one word chosen by stream (read_stream_r) and lane (read_ppc_r).
      // This assignment does not depend on the lane index, so it lives outside the generate
      // loop; inside the loop it would be elaborated REAL_PPC times, giving read_data_o
      // REAL_PPC identical drivers.
      `w read_data_o = dpr_read_data_each_stream_r[read_stream_r][read_ppc_r];

      // Sync bits that travelled alongside the read data through the stream-0 RAM.
      `w read_sync_out_o = read_sync_out_r;

   endgenerate

   


      
   /////////////////////////////////////////////////////////////////////////
   //
   //  The Multiply/Accumulator
   //
   //           Multiply the data by the coefficients and add into the
   //           partial sum held by the buffer.
   //
   //           Sum of all taps is maximum 1.0, so zero overflow can be
   //           guaranteed in the add if the binary point position is
   //           maintained.
   //
   /////////////////////////////////////////////////////////////////////////

   // Bit positions inside the 48-bit multiply/accumulate sum.
   //   FIRST_NONFRACTIONAL_BIT : the accumulator value is shifted left by this many bits
   //                             before the add, and the REAL_WIDTH result is sliced from
   //                             the sum starting at this bit.
   //   ONE_HALF_BIT            : the bit directly below it; (1<<ONE_HALF_BIT) is added to
   //                             the sum as the rounding constant.
   localparam FIRST_NONFRACTIONAL_BIT  = FILTER_COEF_FRAC_BITS;
   localparam ONE_HALF_BIT             = FIRST_NONFRACTIONAL_BIT - 1;

   // Partial sums read back from the capture RAMs, indexed [stream][lane].  This is the
   // value fed (after a delay line) into the accumulate input of each MAC lane.
   reg signed [REAL_WIDTH-1:0]       accumulator_in_r   [0:NUM_STREAMS-1][0:REAL_PPC-1];

   generate


      // Load the accumulator inputs from the registered RAM read data.  While
      // early_zero_accumulator_data_d is set the partial sum is forced to zero instead.

      // Early streams occupy indices 0 .. NUM_STREAMS_EARLY-1.
      for(m=0; m<NUM_STREAMS_EARLY; m=m+1)
        begin
           for(i=0; i<REAL_PPC; i=i+1)
             begin
                always @(posedge clk)
                  accumulator_in_r[m][i] <= early_zero_accumulator_data_d ? 0 : dpr_read_data_each_stream_r[m][i];
             end
        end

      // Late streams occupy indices NUM_STREAMS_EARLY .. NUM_STREAMS_EARLY+NUM_STREAMS_LATE-1.
      // NOTE(review): the late streams are zeroed by the EARLY control signal
      // (early_zero_accumulator_data_d).  Confirm this is intended, since the late group
      // otherwise runs from its own late_* state.
      for(m=0; m<NUM_STREAMS_LATE; m=m+1)
        begin
           for(i=0; i<REAL_PPC; i=i+1)
             begin
                always @(posedge clk)
                  accumulator_in_r[m+NUM_STREAMS_EARLY][i] <= early_zero_accumulator_data_d ? 0 : dpr_read_data_each_stream_r[m+NUM_STREAMS_EARLY][i];
             end
        end
      
      
      // Multiply/accumulate lanes for the EARLY stream group: one MAC per stream m and lane i.
      //
      // Pipeline per lane:
      //   in_data_early_r   registered copy of the input sample
      //   A_w / B_w / accumulator_in_d
      //                     sample, coefficient and partial sum after their alignment delays
      //                     (DATA_DELAY, COEF_DELAY, ACCUM_DELAY)
      //   A_r, B_r          multiplier operand registers
      //   M_r               product A_r * B_r
      //   P_r               partial sum shifted left by FIRST_NONFRACTIONAL_BIT
      //   sum_r             M_r + P_r + rounding constant
      //   accumulator_out_r REAL_WIDTH slice of sum_r starting at FIRST_NONFRACTIONAL_BIT
      for(m=0; m<NUM_STREAMS_EARLY; m=m+1)
        begin
           
           for(i=0; i<REAL_PPC; i=i+1)
             begin

                // Coefficient used by this lane: one per lane when REAL is set, otherwise
                // lanes 2k and 2k+1 share coefficient k.
                localparam coef_index = REAL ? i : i/2;
           
           
                wire signed [REAL_WIDTH-1:0]          A_w;
                wire signed [FILTER_COEF_WIDTH-1:0]   B_w;
                reg signed [REAL_WIDTH-1:0]           A_r = 0;
                reg signed [FILTER_COEF_WIDTH-1:0]    B_r = 0;
                reg signed [47:0]                     M_r;
                reg signed [47:0]                     P_r;
                reg signed [47:0]                     sum_r;
                wire signed [REAL_WIDTH-1:0]          accumulator_in_d;
                reg signed [REAL_WIDTH-1:0]           in_data_early_r;
                wire signed [REAL_WIDTH-1:0]          result_w;

                `r in_data_early_r <= in_data_early_i[m][i];

                               
                // Alignment delays so sample, coefficient and partial sum meet at the MAC.
                delay #(.DELAY(DATA_DELAY),   .BIT_WIDTH(REAL_WIDTH))        data_delay  (.clk(clk), .in_i(in_data_early_r),            .out_o(A_w));
                delay #(.DELAY(COEF_DELAY),   .BIT_WIDTH(FILTER_COEF_WIDTH)) coef_delay  (.clk(clk), .in_i(coef_early_r[coef_index]),   .out_o(B_w));
                delay #(.DELAY(ACCUM_DELAY),  .BIT_WIDTH(REAL_WIDTH))        accum_delay (.clk(clk), .in_i(accumulator_in_r[m][i]),     .out_o(accumulator_in_d));
                
                `r A_r <= A_w;
                `r B_r <= B_w;

                `r M_r <= A_r * B_r;
                // Line the partial sum up with the product's binary point.
                `r P_r <= accumulator_in_d << FIRST_NONFRACTIONAL_BIT;
                          
                
                `r sum_r <= M_r + P_r + (1<<ONE_HALF_BIT);  // Rounding constant of 1/2 in REAL_WIDTH bits

		`w result_w = sum_r[FIRST_NONFRACTIONAL_BIT +: REAL_WIDTH];
                `r accumulator_out_r[m][i] <= result_w;

`ifdef DEBUG
		//`w M_w_array[i][m] = M_r;  
		// Waveform probes for stream 0, lane 0 only.
		if(i==0 && m==0)
		  begin	
		     wire signed [REAL_WIDTH-1:0] 	  Accin_0_0;
		     wire signed [REAL_WIDTH-1:0] 	  A_r_0_0;
		     wire signed [FILTER_COEF_WIDTH-1:0]  B_r_0_0;
		     wire signed [47:0] 		  M_w_0_0; 
		     wire signed [REAL_WIDTH-1:0] 	  Accout_0_0;

		     `w Accin_0_0 = accumulator_in_d;
		     `w A_r_0_0 = A_r;
		     `w B_r_0_0 = B_r;
		     `w M_w_0_0 = M_r;
		     // Accumulator output goes to its own probe.  It was previously assigned to
		     // Accin_0_0 as well, which double-drove that wire and left Accout_0_0 undriven.
		     `w Accout_0_0 = accumulator_out_r[m][i];
		  end
`endif

                //`r
                //  if(m<NUM_TAPS/2)
                //    $display("ppc=%d, tap=%d, A_r=%d, B_r=%d B_w=%d coef_1_w[%d][%d]=%d this_coef_w=%d", i, m, A_r, B_r, B_w, m, i, coef_1_w[m][i], this_coef_w);
                //  else
                //    $display("ppc=%d, tap=%d, A_r=%d, B_r=%d B_w=%d coef_1_w[%d][%d]=%d this_coef_w=%d", i, m, A_r, B_r, B_w, m, i, coef_1_w[NUM_TAPS-1-m][i], this_coef_w);
                
                
             end // for (i=0; i<REAL_PPC; i=i+1)
        end // for (m=0; m<NUM_STREAMS_EARLY; m=m+1)


      // Multiply/accumulate lanes for the LATE stream group: one MAC per stream m and lane i.
      // Same pipeline as the early lanes, but fed from in_data_late_i / coef_late_r, and using
      // slot m+NUM_STREAMS_EARLY of accumulator_in_r / accumulator_out_r.
      for(m=0; m<NUM_STREAMS_LATE; m=m+1)
        begin
           
           for(i=0; i<REAL_PPC; i=i+1)
             begin

                // Coefficient used by this lane: one per lane when REAL is set, otherwise
                // lanes 2k and 2k+1 share coefficient k.
                localparam coef_index = REAL ? i : i/2;
           
           
                wire signed [REAL_WIDTH-1:0]          A_w;
                wire signed [FILTER_COEF_WIDTH-1:0]   B_w;
                reg signed [REAL_WIDTH-1:0]           A_r = 0;
                reg signed [FILTER_COEF_WIDTH-1:0]    B_r = 0;
                reg signed [47:0]                     M_r;
                reg signed [47:0]                     P_r;
                reg signed [47:0]                     sum_r;
                wire signed [REAL_WIDTH-1:0]          accumulator_in_d;
                reg signed [REAL_WIDTH-1:0]           in_data_late_r;
                wire signed [REAL_WIDTH-1:0]          result_w;

                `r in_data_late_r <= in_data_late_i[m][i];
                
                // Alignment delays so sample, coefficient and partial sum meet at the MAC.
                delay #(.DELAY(DATA_DELAY),   .BIT_WIDTH(REAL_WIDTH))        data_delay  (.clk(clk), .in_i(in_data_late_r),             .out_o(A_w));
                delay #(.DELAY(COEF_DELAY),   .BIT_WIDTH(FILTER_COEF_WIDTH)) coef_delay  (.clk(clk), .in_i(coef_late_r[coef_index]),    .out_o(B_w));
                delay #(.DELAY(ACCUM_DELAY),  .BIT_WIDTH(REAL_WIDTH))        accum_delay (.clk(clk), .in_i(accumulator_in_r[m+NUM_STREAMS_EARLY][i]),     .out_o(accumulator_in_d));
                
                `r A_r <= A_w;
                `r B_r <= B_w;

                `r M_r <= A_r * B_r;
                // Line the partial sum up with the product's binary point.
                `r P_r <= accumulator_in_d << FIRST_NONFRACTIONAL_BIT;
                          
`ifdef DEBUG2
		// NOTE(review): M_w_array, A_r_0_0, B_r_0_0 and M_w_0_0 are not declared in this
		// scope (the early-lane probes of the same name are local to that loop), so this
		// section likely will not elaborate as written if DEBUG2 is ever defined.
		`w M_w_array[i][m] = M_r;  
		if(i==0 && m==0)
		  begin
		     `w A_r_0_0 = A_r;
		     `w B_r_0_0 = B_r;
		     `w M_w_0_0 = M_r;
		  end
`endif
                
                `r sum_r <= M_r + P_r + (1<<ONE_HALF_BIT);  // Rounding constant of 1/2 in REAL_WIDTH bits

		// Drop the fractional bits: keep REAL_WIDTH bits starting at the binary point.
		`w result_w = sum_r[FIRST_NONFRACTIONAL_BIT +: REAL_WIDTH];
                `r accumulator_out_r[m+NUM_STREAMS_EARLY][i] <= result_w;

                //`r
                //  if(m<NUM_TAPS/2)
                //    $display("ppc=%d, tap=%d, A_r=%d, B_r=%d B_w=%d coef_1_w[%d][%d]=%d this_coef_w=%d", i, m, A_r, B_r, B_w, m, i, coef_1_w[m][i], this_coef_w);
                //  else
                //    $display("ppc=%d, tap=%d, A_r=%d, B_r=%d B_w=%d coef_1_w[%d][%d]=%d this_coef_w=%d", i, m, A_r, B_r, B_w, m, i, coef_1_w[NUM_TAPS-1-m][i], this_coef_w);
                
                
             end // for (i=0; i<REAL_PPC; i=i+1)
        end

   endgenerate

`ifdef DEBUG3
   // Scalar aliases of the first four elements of out_data_o for waveform viewing.
   // NOTE(review): out_data_o is not declared in the visible part of this module; confirm
   // it exists before enabling DEBUG3.
   wire [REAL_WIDTH-1:0]         out_data_o_0_w = out_data_o[0];
   wire [REAL_WIDTH-1:0]         out_data_o_1_w = out_data_o[1];
   wire [REAL_WIDTH-1:0]         out_data_o_2_w = out_data_o[2];
   wire [REAL_WIDTH-1:0]         out_data_o_3_w = out_data_o[3];
`endif






   /////////////////////////////////////////////////////////////////////////
   //
   //  The FFT
   //
   //           Must be an nplex FFT.  Time-multiplexed to perform
   //           all the FFTs in the system.  Currently works at the
   //           same PPC as everything else.  This keeps things simple,
   //           but it could run slower to save resources at the cost
   //           of more compute time.
   //
   /////////////////////////////////////////////////////////////////////////

   // Raw FFT-core outputs.  In this file they are only driven inside the NOTDEF section
   // further down, so they are undriven in a normal build.
   wire                             bxbfft_out_sync_w;
   wire signed [REAL_WIDTH-1:0]     bxbfft_out_data_w  [0:REAL_PPC-1];


//`define FFT_BYPASS

`ifdef FFT_BYPASS
   // Bypass: pass the memory-delayed sync and the FFT input data straight to the FFT outputs.
   `w fft_out_sync_w = fft_in_sync_mem_delayed_r;
   `w fft_out_data_w = mem_to_fft_r;
`else
   // Forward FFT.  Input: mem_to_fft_r, framed by fft_in_sync_mem_delayed_r, with data-valid
   // tied high.  Output: fft_out_sync_w / fft_out_data_w.  The shift, stage-enable and
   // overflow-reset inputs are tied to 0; the overflow and address outputs are left open.
   // NOTE(review): the core's module name is hard-coded here; the parameter meanings are
   // defined by the external FFT core and are not verifiable from this file.
   BxBFFT_48000_6_real_BxBIF_vector
                             #( .FORWARD                   ( 1                           ),
				.GAIN_CONTROL_STRATEGY     ( 10                          ),
				.GAIN_CONTROL_MARGIN_BITS  ( 0                           ),
				.USER_INITIAL_SHIFT        ( 0                           ),
                                .USER_STATIC_SHIFTS        ( 0                           ),
				.FULL_FORWARD_FLOW_CONTROL ( 0                           ),
				.FOLD_IN                   ( 0                           ),
				.FOLD_OUT                  ( 0                           ),
				.INPUT_ORDER               ( 0  /* full natural */       ),
				.OUTPUT_ORDER              ( 1  /* partially natural */  ),
                                .REAL_WIDTH                ( REAL_WIDTH                  ),
				.RESET_ADVANCEMENT         ( RESET_ADVANCEMENT           ),
                                .NUM_SYNC                  ( 1                           ),
                                //.STATIC_NUM_STAGES_ENABLED  Leave this parameter at default to enable all stages
                                .ENABLE_DYNAMIC_MVFFT      ( 0                           ),
                                .ENABLE_MONITORING         ( 0                           ))
   bxbfft_forward             ( .clk                       ( clk                         ),
                                .resetn                    ( resetn                      ),
                                .top_in_sync_i             ( fft_in_sync_mem_delayed_r   ),
                                .top_in_data_valid_i       ( 1'b1                        ),
                                .top_in_data_i             ( mem_to_fft_r                ),
                                .fft_shifts_i              ( 0                           ),
                                .num_stages_enabled_i      ( 0                           ),
                                .overflow_detect_reset_i   ( 0                           ),
                                .overflow_detect_o         (                             ),

				.top_in_address_sync_i     ( 1                           ),
				.top_in_address_o          (                             ),
				
                                .top_out_sync_o            ( fft_out_sync_w              ),
                                .top_out_address_o         (                             ),
                                .top_out_data_o            ( fft_out_data_w              ));
`endif
   
// Alternative FFT hookup: FFT core followed by a separate output_buffer stage.  It is only
// compiled when NOTDEF is defined.
// NOTE(review): this instance is also named bxbfft_forward, so it would collide with the
// instance above unless FFT_BYPASS is defined as well.  bxbfft_out_address_w is not declared
// in the visible part of this file.
`ifdef NOTDEF
   BxBFFT_48000_6_real_BxBIF_vector
                             #( .RESET_ADVANCEMENT         ( RESET_ADVANCEMENT           ),
                                .REAL_WIDTH                ( REAL_WIDTH                  ),
                                .NUM_SYNC                  ( 1                           ),
                                .STATIC_SHIFTS             ( STATIC_SHIFTS               ),
                                .ENABLE_DYNAMIC_SHIFTING   ( 0                           ),
                                .ENABLE_DYNAMIC_MVFFT      ( 0                           ),
                                //.STATIC_NUM_STAGES_ENABLED  Leave this parameter at default to enable all stages
                                .ENABLE_MONITORING         ( 0                           ))
   bxbfft_forward             ( .clk                       ( clk                         ),
                                .resetn                    ( resetn                      ),
                                .in_sync_i                 ( 1'b1                        ),
                                .in_address_o              (                             ),
                                .in_sync_data_i            ( fft_in_sync_mem_delayed_r   ),
                                .in_data_i                 ( mem_to_fft_r                ),
                                .fft_shifts_i              (                             ),
                                .num_stages_enabled_i      (                             ),
                                .overflow_detect_reset_i   (                             ),
                                .overflow_detect_o         (                             ),
                                .out_sync_o                ( bxbfft_out_sync_w           ),
                                .out_address_o             ( bxbfft_out_address_w        ),
                                .out_data_o                ( bxbfft_out_data_w           ));
   
   // Takes the raw FFT output (sync, address, data) and produces fft_out_sync_w / fft_out_data_w.
   output_buffer    #( .FFT_SIZE                   (  COMPLEX_FFT_SIZE                    ),
                       .POINTS_PER_CLOCK           (  COMPLEX_PPC                         ),
                       .RESET_ADVANCEMENT          (  RESET_ADVANCEMENT                   ),
                       .REAL_WIDTH                 (  REAL_WIDTH                          ),
                       .NUM_SYNC                   (  1                                   ),
		       .URAM_DESIRABILITY_PERCENT  (  OUTPUT_URAM_DESIRABILITY_PERCENT    ),
		       .BRAM_DESIRABILITY_PERCENT  (  OUTPUT_BRAM_DESIRABILITY_PERCENT    ))
   output_buffer     ( .clk                        (  clk                                 ),
                       .resetn                     (  resetn                              ),
                       .bxbfft_out_sync_i          (  bxbfft_out_sync_w                   ),
                       .bxbfft_out_address_i       (  bxbfft_out_address_w                ),
                       .bxbfft_out_data_i          (  bxbfft_out_data_w                   ),
                       .top_out_sync_o             (  fft_out_sync_w                      ),
                       .top_out_data_o             (  fft_out_data_w                      ));
`endif

   
`ifdef DEBUG
   // Scalar aliases of lanes 0 and 1 of the FFT input and output buses for waveform viewing.
   // NOTE(review): lane 1 is read unconditionally, so this assumes REAL_PPC >= 2.
   wire signed [REAL_WIDTH-1:0]  fft_in_data_0      = mem_to_fft_r[0];
   wire signed [REAL_WIDTH-1:0]  fft_in_data_1      = mem_to_fft_r[1];
   wire signed [REAL_WIDTH-1:0]  fft_out_data_0     = fft_out_data_w[0];
   wire signed [REAL_WIDTH-1:0]  fft_out_data_1     = fft_out_data_w[1];
   // bxbfft_out_data_w is only driven inside the NOTDEF section of this file.
   wire signed [REAL_WIDTH-1:0]  bxbfft_out_data_0  = bxbfft_out_data_w[0];
   wire signed [REAL_WIDTH-1:0]  bxbfft_out_data_1  = bxbfft_out_data_w[1];
`endif

   // Need example output buffer here


endmodule

/* verilator lint_on WIDTHTRUNC      */
/* verilator lint_on WIDTHEXPAND     */
/* verilator lint_on STMTDLY         */
/* verilator lint_on REALCVT         */
/* verilator lint_on PINCONNECTEMPTY */
