/*
 *  Quadbike 2
 *  Copyright (C) 2026 'Diminished'

 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.

 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.

 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/

#include "goertzel.h"
#include "util.h"
#include "qbio.h"
#include "audio.h"

#include "qb_types.h"

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>


#define QB_POWER_NO_SIGNAL 0.001f






#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2 || defined QB_VECTORS_MSVC_AVX512
/*
 * Vectorised sliding-window Goertzel oversampler.
 *
 * For every vector index n in [zero_ix, zero_ix + piece_len) of vbuf, a
 * window of winlen vectors starting at (n - winlen/2) is fed to
 * qb_goertzel_pwr_vector_new() twice, once per coefficient vector
 * (two_cos_omega_0_vec: 1200 Hz, two_cos_omega_1_vec: 2400 Hz). The 2400 Hz
 * power is then multiplied by *scaler_for_2400_power_vec. The two power
 * vectors are stored at index (n - zero_ix) of oversampled_vec[0] and
 * oversampled_vec[1] respectively.
 *
 * Side effect on vbuf: lane 0 of the (up to) winlen/2 vectors immediately
 * before zero_ix is overwritten with 0.0f (see comments below).
 *
 * Outputs:
 *   max_power0_out / max_power1_out : largest per-lane power seen for each tone
 *   max_powerX_out                  : the larger of the two values above
 *   max_confidence_out              : largest |p0 - p1| seen in any lane
 *   oversampled_vec[0], [1]         : initialised here via qb_vec_buf_init()
 *
 * Returns:
 *   QB_E_OK on success;
 *   whatever qb_vec_buf_init() returned if an output buffer could not be
 *     initialised (oversampled_vec[0] is finished again if [1] fails);
 *   QB_E_BUG (QB_SANITY builds only) if vbuf->alloc is 0;
 *   QB_E_NO_SIGNAL if the peak power is below QB_POWER_NO_SIGNAL. Note that
 *     in this case both output buffers are still initialised on return.
 */
qb_err_t qb_goertzel_oversample_vector_new(qb_vec_buf_t* vbuf,
                                            s32_t winlen,
                                            qb_vec_f_t* two_cos_omega_0_vec,
                                            qb_vec_f_t* two_cos_omega_1_vec,
                                            qb_vec_f_t* scaler_for_2400_power_vec,
                                            float* max_power0_out,
                                            float* max_power1_out,
                                            float* max_powerX_out,
                                            float* max_confidence_out,
                                            qb_vec_buf_t oversampled_vec[2], // [0], [1] are allocated here
                                            u8_t dp) { // display_progress

  s64_t n;
  qb_err_t e;
  qb_vec_f_t zeros;
  float scalar_scaler; // ;-]

  *max_power0_out = 0.0;
  *max_power1_out = 0.0;
  *max_powerX_out = 0.0;
  *max_confidence_out = 0.0;

#ifdef QB_SANITY // 2.0.4
  // Checked before anything is allocated (and before the meter is shown),
  // so this early exit has nothing to clean up.
  if (0 == vbuf->alloc) {
    fprintf(QB_ERR, "B: vbuf->alloc is 0!\n");
    return QB_E_BUG;
  }
#endif

  e = qb_vec_buf_init(&(oversampled_vec[0]), QB_VECTYPE_FLOAT, NULL, vbuf->linear_len, 0, 0, 0, NULL); // don't need left or right pieces
  if (QB_E_OK != e) { return e; }
  e = qb_vec_buf_init(&(oversampled_vec[1]), QB_VECTYPE_FLOAT, NULL, vbuf->linear_len, 0, 0, 0, NULL); // don't need left or right pieces
  if (QB_E_OK != e) {
    qb_vec_buf_finish(&(oversampled_vec[0]));
    return e;
  }

  // every lane of the scaler vector is expected to hold the same value;
  // lane 0 is pulled out purely for the log message
#ifdef QB_VECTORS_GCC_CLANG
  scalar_scaler = (*scaler_for_2400_power_vec)[0];
#else // MSVC_AVX2, MSVC_AVX512
  scalar_scaler = ((float*)scaler_for_2400_power_vec)[0];
#endif

  printf("    Scaling 2400 Hz Goertzel power by fixed value of %.1f.\n",
        scalar_scaler);
  printf("    Computing Goertzel oversamples (window %d): ", winlen);
  fflush(stdout); // MacOS
  qb_show_meter(dp);

  // build an all-zero vector; used as the initial Goertzel state
#ifdef QB_VECTORS_GCC_CLANG
  for (n = 0; n < QB_VECSIZE; n++) {
    zeros[n] = 0.0f;
  }
#elif defined QB_VECTORS_MSVC_AVX2
  zeros = _mm256_setzero_ps();
#elif defined QB_VECTORS_MSVC_AVX512 // 2.0.4
  zeros = _mm512_setzero_ps();
#endif

  // OK, so just a note about the limits here;
  // we are going to read from slightly before and after zero_ix
  // because of the window (so, we read off the start and end of piece_len).
  // we need to make sure the data in these regions is valid.

  // if we didn't pre-filter, we have something analogous to this:

  //  0  0  0      0  1  2  3  4  5  6  7      8  9 10 11 12 13 14
  //  5  6  7      8  9 10 11 12 13 14 15     16 17 18 19 20 21 22
  // 13 14 15     16 17 18 19 20 21 22 23     24 25 26 27 28 29 30
  // 21 22 23     24 25 26 27 28 29 30 31     32 33 34 35 36 37 38
  // 29 30 31     32 33 34 35 36 37 38 39     40 41 42 43 44 45 46
  // 37 38 39     40 41 42 43 44 45 46 47     48 49 50 51 52 53 54
  // 45 46 47     48 49 50 51 52 53 54 55     56 57 58 59 60 61 62
  // 53 54 55     56 57 58 59 60 61 62 63      0  0  0  0  0  0  0
  //              ^ zero_ix
  //       |-----------|
  //      goertzel window

  // in this case we have several hundred samples' worth of valid
  // left hand side. the window at 44.1K is 38 samples wide, so
  // we need ~19 samples available on the left hand side, so we
  // should be able to read valid data off the left hand side just
  // fine. the right hand side is also fine; we have a thousand
  // samples available there.

  // if we DID do the prefilter, we have something like this
  // (XX is invalid data)

  // XX XX XX     XX XX  0  1  2  3  4  5      6  7  8  9 10 11 12
  // XX XX XX      6  7  8  9 10 11 12 13     14 15 16 17 18 19 20
  // XX XX XX     14 15 16 17 18 19 20 21     22 23 24 25 26 27 28
  // XX XX XX     22 23 24 25 26 27 28 29     30 31 32 33 34 35 36
  // XX XX XX     30 31 32 33 34 35 36 37     38 39 40 41 42 43 44
  // XX XX XX     38 39 40 41 42 43 44 45     46 47 48 49 50 51 52
  // XX XX XX     46 47 48 49 50 51 52 53     54 55 56 57 58 59 60
  // XX XX XX     54 55 56 57 58 59 60 61     62 63  0  0  0  0  0
  //                     ^ zero_ix
  //                |--------|
  //              goertzel window

  // the original left hand side is now invalid (XX). zero_ix will
  // have been advanced something like 73 samples at 44.1K to
  // compensate for the filter delay, but the data to the left of
  // zero_ix is still valid in this region, and again we only
  // need 19 samples for the Goertzel window, so this will be fine too,
  // without having to manually copy values onto the left hand side.

  // the only slight complication is the first row, where
  // we will be reading invalid data for the first 19 samples or so.
  // this won't actually matter in practice, but let's be good and
  // make sure that region of vbuf is properly zeroed out.

  // NOTE(review): nothing here verifies that zero_ix >= winlen / 2. If that
  // does not hold, the main loop below passes a pointer that lies before
  // vbuf->v.f to the Goertzel kernel. A QB_SANITY check for this used to
  // exist but was disabled; confirm the guarantee with the caller.

  // Zero lane 0 of the half-window that precedes zero_ix, i.e. the range
  // [zero_ix - winlen/2, zero_ix). The start is clamped to 0 so that we can
  // never write before the beginning of vbuf->v.f.
  n = vbuf->zero_ix - (winlen / 2);
  if (n < 0) {
    n = 0;
  }
  for ( ; n < vbuf->zero_ix; n++) {
#ifdef QB_VECTORS_GCC_CLANG
    vbuf->v.f[n][0] = 0.0f; // only write zeros to the FIRST element of these vectors
#else // MSVC_AVX2, MSVC_AVX512
    ((float*)(vbuf->v.f + n))[0] = 0.0f;
#endif
  }

  // centre the window around n; one output vector per input vector of the piece
  for (n = vbuf->zero_ix;
       n < (vbuf->zero_ix + vbuf->piece_len);
       n++) {

    s64_t sn;
    u8_t i;
    qb_vec_f_t p0v, p1v;

    sn = n - (winlen / 2); // window start position

    // no (sn >= 0) test here: the scalar version needs one, but vbuf
    // carries pre_overlap and post_overlap sections for the window to read

    qb_goertzel_pwr_vector_new (vbuf->v.f + sn, winlen, &zeros, two_cos_omega_0_vec, &p0v); // 1200 Hz
    qb_goertzel_pwr_vector_new (vbuf->v.f + sn, winlen, &zeros, two_cos_omega_1_vec, &p1v); // 2400 Hz

#ifdef QB_VECTORS_GCC_CLANG
    p1v *= *scaler_for_2400_power_vec;
#elif defined QB_VECTORS_MSVC_AVX2
    p1v = _mm256_mul_ps(p1v, *scaler_for_2400_power_vec);
#elif defined QB_VECTORS_MSVC_AVX512 // 2.0.4
    p1v = _mm512_mul_ps(p1v, *scaler_for_2400_power_vec);
#endif

    oversampled_vec[0].v.f[n - vbuf->zero_ix] = p0v;
    oversampled_vec[1].v.f[n - vbuf->zero_ix] = p1v;

    // track the running maxima lane by lane; a plain scalar loop is used
    // because there is no portable horizontal-max across the three
    // supported vector back ends
    for (i=0; i < QB_VECSIZE; i++) {
      float p0, p1, confi;
#ifdef QB_VECTORS_GCC_CLANG
      p0 = p0v[i];
      p1 = p1v[i];
#else // MSVC_AVX2, MSVC_AVX512
      p0 = ((float *)(&p0v))[i];
      p1 = ((float *)(&p1v))[i];
#endif // vectorstuff
      confi = fabsf(p0 - p1);
      if (p0    > *max_power0_out)     { *max_power0_out     = p0;    }
      if (p1    > *max_power1_out)     { *max_power1_out     = p1;    }
      if (confi > *max_confidence_out) { *max_confidence_out = confi; }
    }

    qb_update_meter (dp, n - vbuf->zero_ix, vbuf->piece_len, 1.0f, 0);

  } // next sample

  qb_hide_meter(dp, 0);
  printf("done.\n");

  // overall peak = larger of the two per-tone peaks
  *max_powerX_out = *max_power0_out;
  if (*max_power1_out > *max_powerX_out) {
    *max_powerX_out = *max_power1_out;
  }

  if ( (*max_powerX_out) < QB_POWER_NO_SIGNAL ) {
    fprintf(QB_ERR, "E: No signal detected anywhere. Unable to continue.\n");
    return QB_E_NO_SIGNAL;
  }

  printf("      Maximum 0-value power:   %f\n", *max_power0_out);
  printf("      Maximum 1-value power:   %f\n", *max_power1_out);
  printf("      Maximum confidence:      %f\n", *max_confidence_out);

  return QB_E_OK;

}
#else // no vectorisation
/*
 * Scalar sliding-window Goertzel oversampler.
 *
 * For each sample index n, a window of winlen samples starting at
 * (n - winlen/2) is passed to qb_goertzel_pwr() once with omega0 (1200 Hz)
 * and once with omega1 (2400 Hz); the 2400 Hz power is multiplied by
 * scaler_for_2400_power. Results go to oversampled[0][n] and
 * oversampled[1][n]. Positions whose window would start before the buffer
 * or run past its end are left at 0.
 *
 * oversampled[0] and oversampled[1] must each already point at srclen
 * floats; both are cleared here before being filled.
 *
 * Outputs:
 *   max_power0_out / max_power1_out : largest power seen for each tone
 *   max_powerX_out                  : the larger of the two values above
 *   max_confidence_out              : largest |p0 - p1| seen
 *
 * Returns:
 *   QB_E_OK on success;
 *   QB_E_BUG if srclen is negative or >= QB_MAX_IPFILE_SAMPLES;
 *   QB_E_NO_SIGNAL if the peak power is below QB_POWER_NO_SIGNAL.
 */
qb_err_t qb_goertzel_oversample  (float *src_f,
                                  s64_t srclen,
                                  s32_t winlen,
                                  float omega0, // 1200 Hz in rads/sample
                                  float omega1, // 2400 Hz in rads/sample
                                  float scaler_for_2400_power,
                                  float *max_power0_out,
                                  float *max_power1_out,
                                  float *max_powerX_out,
                                  float *max_confidence_out,
                                  float *oversampled[2],
                                  u8_t dp) { //display_progress) {

  s64_t n;
  float p0, p1, confi;

  *max_power0_out     = 0.0;
  *max_power1_out     = 0.0;
  *max_powerX_out     = 0.0;
  *max_confidence_out = 0.0;

  // srclen feeds the memset() sizes below: a negative value would wrap to an
  // enormous size_t, and an oversized one could overflow the byte count
  if ((srclen < 0) || (srclen >= QB_MAX_IPFILE_SAMPLES)) {
    fprintf(QB_ERR, "B: %s called with illegal srclen (%lld).\n", QB_FUNC_M, srclen);
    return QB_E_BUG;
  }

  printf("    Scaling 2400 Hz Goertzel power by fixed value of %.1f.\n", scaler_for_2400_power);
  printf("    Computing Goertzel oversamples (window %d): ", winlen);

  fflush(stdout); // MacOS

  qb_show_meter(dp);

  // the tail of each output (where the window would overrun the source) is
  // never written by the loop, so clear everything up front
  memset(oversampled[0], 0, sizeof(float) * srclen);
  memset(oversampled[1], 0, sizeof(float) * srclen);

  for (n=0;
       // centre window around n
       // stop when window end moves beyond srcbuf
       ((n + 1 + (winlen/2)) < srclen);
       n++) {

    s64_t sn;

    p0 = 0.0f;
    p1 = 0.0f;

    sn = n - (winlen / 2); // window start position

    if (sn >= 0) { // sample number will start out < 0 because of window, so ignore

      qb_goertzel_pwr (src_f + sn, winlen, omega0, &p0); // 1200 Hz
      qb_goertzel_pwr (src_f + sn, winlen, omega1, &p1); // 2400 Hz

      p1 *= scaler_for_2400_power;

    }

    // always true for a positive winlen (the loop condition is stricter);
    // retained as a guard on the output writes
    if (n < srclen) {
      oversampled[0][n] = p0;
      oversampled[1][n] = p1;
      confi = fabsf(p0-p1);
      if (p0    > *max_power0_out)     { *max_power0_out     = p0;    }
      if (p1    > *max_power1_out)     { *max_power1_out     = p1;    }
      if (confi > *max_confidence_out) { *max_confidence_out = confi; }
    }

    qb_update_meter (dp, n, srclen, 1.0f, 0);

  } // next sample

  qb_hide_meter (dp, 0);

  printf("done.\n");

  // overall peak = larger of the two per-tone peaks
  *max_powerX_out = *max_power0_out;
  if (*max_power1_out > *max_powerX_out) {
    *max_powerX_out = *max_power1_out;
  }

  if ( (*max_powerX_out) < QB_POWER_NO_SIGNAL ) {
    fprintf(QB_ERR, "E: No signal detected anywhere. Unable to continue.\n");
    return QB_E_NO_SIGNAL;
  }

  printf("      Maximum 0-value power:   %f\n", *max_power0_out);
  printf("      Maximum 1-value power:   %f\n", *max_power1_out);
  printf("      Maximum confidence:      %f\n", *max_confidence_out);

  return QB_E_OK;

}
#endif // no vectorisation


// exported now for qb_compute_speed_by_goertzel
// in process.c
/*
 * Goertzel power of a single frequency over one block of samples.
 *
 *   in                    : sequence_len input samples (read only)
 *   sequence_len          : number of samples to process; <= 0 yields power 0
 *   omega_rads_per_sample : target frequency in radians per sample
 *                           (not range-checked here)
 *   power_out             : receives the resulting power
 *
 * No window function is applied to the input. Always returns QB_E_OK.
 */
qb_err_t qb_goertzel_pwr (float *in,
                          s64_t sequence_len,
                          float omega_rads_per_sample,
                          float *power_out) {

  // the single recurrence coefficient, 2*cos(w)
  const float coeff = 2.0f * cosf(omega_rads_per_sample);
  const float *sample = in;
  s64_t remaining;
  float prev1 = 0.0f; // state one step back
  float prev2 = 0.0f; // state two steps back

  for (remaining = sequence_len; remaining > 0; remaining--, sample++) {

    // second-order recurrence: s[k] = x[k] + 2cos(w)*s[k-1] - s[k-2]
    const float cur = *sample + (coeff * prev1) - prev2;

    prev2 = prev1;
    prev1 = cur;

  }

  // power from the final two state values
  *power_out = (prev2 * prev2) + (prev1 * prev1) - (coeff * prev1 * prev2);

  return QB_E_OK;

}



#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2 || defined QB_VECTORS_MSVC_AVX512
/*
 * Vector form of the Goertzel power computation: every lane of the vector
 * type runs its own independent recurrence.
 *
 *   vin               : sequence_len consecutive input vectors
 *   sequence_len      : number of vectors to process
 *   zeros             : caller-supplied all-zero vector, used as initial state
 *   two_cos_omega_vec : recurrence coefficient (2*cos(w)) for each lane
 *   power_out_vec     : receives the per-lane power
 *
 * Always returns QB_E_OK.
 */
qb_err_t qb_goertzel_pwr_vector_new (qb_vec_f_t *vin,
                                    s64_t sequence_len,
                                    qb_vec_f_t *zeros, // so stupid
                                    qb_vec_f_t *two_cos_omega_vec,
                                    qb_vec_f_t *power_out_vec) {

  const qb_vec_f_t coeff = *two_cos_omega_vec;
  qb_vec_f_t prev1 = *zeros; // state one step back
  qb_vec_f_t prev2 = *zeros; // state two steps back
  s64_t k;
#if defined QB_VECTORS_MSVC_AVX2 || defined QB_VECTORS_MSVC_AVX512
  qb_vec_f_t cross, partial;
#endif

  for (k = 0; k < sequence_len; k++) {

    qb_vec_f_t cur = vin[k];

    // s[k] = x[k] + coeff*s[k-1] - s[k-2], all lanes at once
#ifdef QB_VECTORS_GCC_CLANG
    cur = cur + (coeff * prev1) - prev2;
#elif defined QB_VECTORS_MSVC_AVX2
    cur = _mm256_sub_ps(_mm256_fmadd_ps(coeff, prev1, cur), prev2);
#elif defined QB_VECTORS_MSVC_AVX512 // 2.0.4
    cur = _mm512_sub_ps(_mm512_fmadd_ps(coeff, prev1, cur), prev2);
#endif

    prev2 = prev1;
    prev1 = cur;

  }

  // power = s2^2 + s1^2 - coeff*s1*s2
#ifdef QB_VECTORS_GCC_CLANG
  *power_out_vec = (prev2 * prev2) + (prev1 * prev1) - (coeff * prev1 * prev2);
#elif defined QB_VECTORS_MSVC_AVX2
  cross   = _mm256_mul_ps(_mm256_mul_ps(coeff, prev1), prev2); // coeff*s1*s2
  partial = _mm256_fmsub_ps(prev1, prev1, cross);              // s1^2 - cross
  *power_out_vec = _mm256_fmadd_ps(prev2, prev2, partial);     // s2^2 + partial
#elif defined QB_VECTORS_MSVC_AVX512 // 2.0.4
  cross   = _mm512_mul_ps(_mm512_mul_ps(coeff, prev1), prev2); // coeff*s1*s2
  partial = _mm512_fmsub_ps(prev1, prev1, cross);              // s1^2 - cross
  *power_out_vec = _mm512_fmadd_ps(prev2, prev2, partial);     // s2^2 + partial
#endif

  return QB_E_OK;

}
#endif // vectorised



/*
 * Convert the two tone frequencies (QB_FREQ_1, QB_FREQ_2), scaled by
 * tape_speed, into angular frequencies in radians per sample for the given
 * sample rate: omega = (freq * pi * tape_speed) / (sample_rate / 2).
 *
 *   omega_1200_out : receives the value derived from QB_FREQ_1
 *   omega_2400_out : receives the value derived from QB_FREQ_2
 *
 * sample_rate is not checked for zero.
 */
void qb_compute_omegas (float tape_speed,
                        u32_t sample_rate,
                        float *omega_1200_out,
                        float *omega_2400_out) {
  // half the sample rate is the shared divisor for both tones
  const float half_rate = sample_rate * 0.5f;

  *omega_1200_out = (QB_FREQ_1 * QB_PI * tape_speed) / half_rate;
  *omega_2400_out = (QB_FREQ_2 * QB_PI * tape_speed) / half_rate;
}



/*
 * Compute the Goertzel window length in samples: the duration of one cycle
 * at (tape_speed * QB_FREQ_1) multiplied by sample_rate, rounded to the
 * nearest whole sample and written to *winlen_out.
 */
void qb_compute_window_lengths (float tape_speed,
                                s32_t sample_rate,
                                u32_t *winlen_out) {
  // samples per cycle, before rounding
  const float samples_per_cycle = ((1.0f / (tape_speed * QB_FREQ_1)) * sample_rate);
  const float nearest = roundf (samples_per_cycle);

  *winlen_out = (u32_t) nearest;
}

