/*
 *  Quadbike 2
 *  Copyright (C) 2026 'Diminished'

 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.

 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.

 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/

#include "build.h"

#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2 || defined QB_VECTORS_MSVC_AVX512

#include "fir_vec2.h"

#include <string.h>
#include <stdio.h>

// Per-sample helpers used by qb_fir_vec_run(): qb_fir_vec_in() stores one
// input vector in the filter's history ring, qb_fir_vec_out() computes one
// output vector from the stored history and the tap vectors.
static void qb_fir_vec_out (qb_fir_vec_t *f, qb_vec_f_t *out);
static void qb_fir_vec_in (qb_fir_vec_t *f, qb_vec_f_t *input);

// Copies a scalar coefficient table into f->taps (one vector per tap) and
// records the tap count and delay in f.
static void populate_vector_taps (const double *taps,
                                  qb_fir_vec_t *f,
                                  s32_t num_taps,
                                  s32_t delay);

// Initialises a vector FIR filter for the given filter type and
// sample-rate index.
//
//   f     filter state; completely zeroed before anything else happens
//   type  one of QB_FIR_TYPE_BP_0K4_3K2 or QB_FIR_TYPE_BP_2K4
//   rate  one of QB_FIR_RATE_22K, QB_FIR_RATE_44K or QB_FIR_RATE_48K
//
// Returns QB_E_OK on success. Returns QB_E_BUG (after printing a message
// to QB_ERR) if the rate fails the QB_SANITY check, if the type is not
// recognised, or if the filter ends up with zero taps (e.g. no table
// exists for this type/rate pair).
qb_err_t qb_fir_vec_init (qb_fir_vec_t *f, u8_t type, u8_t rate) {

  const double *coeffs   = NULL; // scalar coefficient table chosen below
  s32_t         n_coeffs = 0;
  s32_t         latency  = 0;
  u32_t         tap;

  memset(f, 0, sizeof(*f));

#ifdef QB_SANITY
  if ( ! qb_validate_sample_rate_ix (rate) ) {
    fprintf(QB_ERR, "B: %s: bad sample rate ix %u\n", QB_FUNC_M, rate);
    return QB_E_BUG;
  }
#endif

  if (type == QB_FIR_TYPE_BP_0K4_3K2) {
    // bandpass 400->3200 is for the -f option
    if (rate == QB_FIR_RATE_22K) {
      coeffs   = qb_fir_taps_bp_0k4_3k2_22k;
      n_coeffs = QB_FIR_TAPS_BP_0K4_3K2_22K;
      latency  = QB_FIR_SMPSDELAY_BP_0K4_3K2_22K;
    } else if (rate == QB_FIR_RATE_44K) {
      coeffs   = qb_fir_taps_bp_0k4_3k2_44k;
      n_coeffs = QB_FIR_TAPS_BP_0K4_3K2_44K;
      latency  = QB_FIR_SMPSDELAY_BP_0K4_3K2_44K;
    } else if (rate == QB_FIR_RATE_48K) {
      coeffs   = qb_fir_taps_bp_0k4_3k2_48k;
      n_coeffs = QB_FIR_TAPS_BP_0K4_3K2_48K;
      latency  = QB_FIR_SMPSDELAY_BP_0K4_3K2_48K;
    }
  } else if (type == QB_FIR_TYPE_BP_2K4) {
    // bandpass-2400 is applied to the extracted carrier
    // before it is fed to the PLL
    if (rate == QB_FIR_RATE_22K) {
      coeffs   = qb_fir_taps_bp_2k4_22k;
      n_coeffs = QB_FIR_TAPS_BP_2K4_22K;
      latency  = QB_FIR_SMPSDELAY_BP_2K4_22K;
    } else if (rate == QB_FIR_RATE_44K) {
      coeffs   = qb_fir_taps_bp_2k4_44k;
      n_coeffs = QB_FIR_TAPS_BP_2K4_44K;
      latency  = QB_FIR_SMPSDELAY_BP_2K4_44K;
    } else if (rate == QB_FIR_RATE_48K) {
      coeffs   = qb_fir_taps_bp_2k4_48k;
      n_coeffs = QB_FIR_TAPS_BP_2K4_48K;
      latency  = QB_FIR_SMPSDELAY_BP_2K4_48K;
    }
  } else {
    fprintf(QB_ERR, "B: %s: unknown filter type %u\n", QB_FUNC_M, type);
    return QB_E_BUG;
  }

  // No table selected means the rate matched nothing above; in that case
  // f->num_taps is still zero from the memset and is rejected just below.
  if (NULL != coeffs) {
    populate_vector_taps(coeffs, f, n_coeffs, latency);
  }

  if (0 == f->num_taps) {
    fprintf (QB_ERR, "B: %s: bad FIR type %u for rate %u\n", QB_FUNC_M, type, rate);
    return QB_E_BUG;
  }

  // explicitly clear one history vector per tap
  for (tap = 0; tap < f->num_taps; tap++) {
#ifdef QB_VECTORS_GCC_CLANG
    s32_t lane;
    for (lane = 0; lane < QB_VECSIZE; lane++) {
      f->histories[tap][lane] = 0.0f;
    }
#elif defined QB_VECTORS_MSVC_AVX2
    f->histories[tap] = _mm256_setzero_ps();
#elif defined QB_VECTORS_MSVC_AVX512 // 2.0.4
    f->histories[tap] = _mm512_setzero_ps();
#endif // QB_VECTORS_MSVC_AVX512
  }

  // ring-buffer write position starts at the first slot
  f->last_index = 0;

  return QB_E_OK;

}


// Loads a scalar coefficient table into the filter's tap vectors.
//
//   taps      table of num_taps scalar coefficients
//   f         filter whose taps[], num_taps and delay fields are written
//   num_taps  number of coefficients to copy
//   delay     value stored in f->delay
//
// Each coefficient is replicated into every lane of its tap vector.
// No bounds check is made against the size of f->taps here.
static void populate_vector_taps (const double *taps,
                                  qb_fir_vec_t *f,
                                  s32_t num_taps,
                                  s32_t delay) {
  s32_t tap;

  f->delay    = delay;
  f->num_taps = num_taps;

  for (tap = 0; tap < num_taps; tap++) {
#ifdef QB_VECTORS_GCC_CLANG
    // write the same coefficient to each of the QB_VECSIZE lanes
    s32_t lane = 0;
    while (lane < QB_VECSIZE) {
      f->taps[tap][lane] = taps[tap];
      lane++;
    }
#elif defined QB_VECTORS_MSVC_AVX2
    f->taps[tap] = _mm256_set1_ps((float) taps[tap]);
#elif defined QB_VECTORS_MSVC_AVX512
    f->taps[tap] = _mm512_set1_ps((float) taps[tap]); // 2.0.4
#endif
  }
}



// FIXME: macro-ise?
// Pushes one input vector into the filter's history ring: *input is
// copied into the slot at f->last_index, then f->last_index advances by
// one and wraps to 0 when it reaches f->num_taps.
static void qb_fir_vec_in (qb_fir_vec_t *f, qb_vec_f_t *input) {
  f->histories[f->last_index] = *input;
  if (++f->last_index == f->num_taps) { // implement ring buffer
    f->last_index = 0;
  }
}

// FIXME: macro-ise?
// Computes one output vector from the history ring and writes it to *out.
//
// Starting at the slot just before f->last_index (the most recently
// written one) and walking backwards with wrap-around, each history
// vector is multiplied by the tap vector of matching age (taps[0] pairs
// with the newest entry) and summed. The accumulator has the same vector
// type as the output (qb_vec_f_t); no widening is performed here.
static void qb_fir_vec_out (qb_fir_vec_t *f, qb_vec_f_t *out) {

  qb_vec_f_t sum;
  u32_t slot;
  u32_t tap;

  // start from an all-zero accumulator
#ifdef QB_VECTORS_GCC_CLANG
  memset (&sum, 0, sizeof(sum));
#elif defined QB_VECTORS_MSVC_AVX2
  sum = _mm256_setzero_ps();
#elif defined QB_VECTORS_MSVC_AVX512
  sum = _mm512_setzero_ps(); // 2.0.4
#endif

  slot = f->last_index;

  for (tap = 0; tap < f->num_taps; tap++) {
    // step back one slot, wrapping from the start of the ring to its end
    if (0 == slot) {
      slot = f->num_taps;
    }
    slot--;
#ifdef QB_VECTORS_GCC_CLANG
    sum += f->histories[slot] * f->taps[tap];
#elif defined QB_VECTORS_MSVC_AVX2
    sum = _mm256_fmadd_ps(f->histories[slot], f->taps[tap], sum);
#elif defined QB_VECTORS_MSVC_AVX512 // 2.0.4
    sum = _mm512_fmadd_ps(f->histories[slot], f->taps[tap], sum);
#endif
  }

  *out = sum;

}

#include "util.h"

// Runs the FIR filter in place over a vector buffer.
//
//   inout  buffer whose inout->alloc vectors at inout->v.f are each
//          replaced by the corresponding filter output
//   fir    filter state (history ring and taps), updated as it runs
//   dp     passed through to the progress-meter helpers
//
// For every position the input vector is first stored in the filter's
// history and only then overwritten with the output, so filtering in
// place is safe. Afterwards "done.\n" is printed to stdout and
// inout->zero_ix is advanced by fir->delay.
void qb_fir_vec_run (qb_vec_buf_t *inout, qb_fir_vec_t *fir, u8_t dp) { // display_progress
  s64_t pos = 0;

  qb_show_meter(dp);

  while (pos < inout->alloc) {
    qb_fir_vec_in  (fir, inout->v.f + pos);
    qb_fir_vec_out (fir, inout->v.f + pos);
    qb_update_meter (dp, pos, inout->alloc, 1.0f, 0);
    pos++;
  }

  qb_hide_meter(dp, 0);
  printf("done.\n");

  inout->zero_ix += fir->delay;
}

#endif // QB_VECTORS_GCC_CLANG || QB_VECTORS_MSVC_AVX2 || QB_VECTORS_MSVC_AVX512
