/*
 *  Quadbike 2
 *  Copyright (C) 2026 'Diminished'

 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.

 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.

 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/

#include "tapespeed.h"
#include "util.h"
#include "goertzel.h"

#include <stdio.h>
#include <math.h>
#include <stdlib.h>



//#define QB_GOERTZEL_TAPE_SPEED_SCAN_P0_THRESH 0.2




/*
 * Report whether a tape speed lies within the permitted range.
 *
 * tape_speed: the speed to validate.
 *
 * Returns 1 if QB_TAPE_SPEED_MIN <= tape_speed <= QB_TAPE_SPEED_MAX,
 * otherwise 0. NaN is rejected.
 */
u8_t qb_is_tape_speed_legal (float tape_speed) {
  // Deliberately a positive range test: every ordered comparison
  // involving NaN is false, so a pair of "< min" / "> max" rejection
  // tests would let NaN through as legal.
  if ((tape_speed >= QB_TAPE_SPEED_MIN) && (tape_speed <= QB_TAPE_SPEED_MAX)) {
    return 1;
  }
  return 0;
}

/*
 * Measure the tape speed of a span of samples.
 *
 * Thin dispatcher: forwards every argument unchanged to the vector
 * implementation when QB_VECTORS_GCC_CLANG or QB_VECTORS_MSVC_AVX2 is
 * defined, and to the scalar implementation otherwise. The callee's
 * return code is passed straight back to the caller.
 */
qb_err_t qb_span_measure_speed (s32_t rate,
                                float *src_data,
                                s64_t len,
                                float freq_min,
                                float freq_max,
                                float freq_nom, // forwarded; the implementations divide the best frequency by it
                                float *speed_out) {
  qb_err_t e;
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
  e = qb_span_measure_speed_vector (rate, src_data, len, freq_min, freq_max, freq_nom, speed_out);
#else
  e = qb_span_measure_speed_scalar (rate, src_data, len, freq_min, freq_max, freq_nom, speed_out);
#endif
  return e;
}

// Scanning a span at every candidate frequency is prohibitively
// slow, so spans longer than this many samples are curtailed:
// only a central portion of this length is scanned.
#define QB_SPAN_TAPESPEED_SCAN_MAX_LEN 10000


#include <string.h>


// FIXME? is this the best way to do this? there's a lot of
// processing needed to set up src_vec and two_cos_omegas
// every time.

#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
/*
 * Estimate the tape speed of a span, testing QB_VECSIZE candidate
 * frequencies per pass (vector build).
 *
 * Candidates are whole numbers of Hz, starting at roundf(freq_min) and
 * never exceeding freq_max. The candidate with the greatest power wins
 * and *speed_out receives (winning frequency / freq_nominal). If no
 * candidate produces a power above zero, the winning frequency stays 0.
 *
 * Spans longer than QB_SPAN_TAPESPEED_SCAN_MAX_LEN samples are
 * curtailed: only the central QB_SPAN_TAPESPEED_SCAN_MAX_LEN samples
 * are scanned.
 *
 * rate:         sample rate; each candidate frequency is divided by it
 *               to form the angular frequency
 * src_f:        span samples (scalar floats)
 * len:          number of samples at src_f
 * freq_min:     lowest candidate frequency (rounded to nearest integer)
 * freq_max:     highest candidate frequency (inclusive)
 * freq_nominal: divisor applied to the winning frequency
 * speed_out:    receives the result; only written on success
 *
 * Returns QB_E_OK on success, QB_E_MALLOC if the working buffer cannot
 * be allocated, QB_E_BUG (QB_SANITY builds only) if len is negative, or
 * whatever error qb_goertzel_pwr_vector_new reports.
 */
qb_err_t qb_span_measure_speed_vector (s32_t rate,
                                       float *src_f, // scalar
                                       s64_t len,
                                       float freq_min,
                                       float freq_max,
                                       float freq_nominal,
                                       float *speed_out) {

  s32_t f;
  float best_power;
  s32_t best_freq;
  qb_err_t e;
  s64_t scan_start, scan_len;
  qb_vec_f_t *src_vec;
  qb_vec_f_t zeros;
#ifdef QB_VECTORS_MSVC_AVX2
  qb_vec_f_t twos;
#endif
  s64_t n;
  u8_t v;

  best_power = 0.0f;
  best_freq = 0;
  e = QB_E_OK; // the scan loop may legitimately run zero times

#ifdef QB_SANITY
  if (len < 0) {
    fprintf(QB_ERR, "\nB: %s: len is negative (%lld)\n", QB_FUNC_M, len);
    return QB_E_BUG;
  }
#endif

  if (len > QB_SPAN_TAPESPEED_SCAN_MAX_LEN) {
    // curtail scan: scan the central portion only
    scan_start = (len - QB_SPAN_TAPESPEED_SCAN_MAX_LEN) / 2;
    scan_len = QB_SPAN_TAPESPEED_SCAN_MAX_LEN;
  } else {
    // for shorter spans, check the entire span
    scan_start = 0;
    scan_len = len;
  }

  // duplicate each span sample across all QB_VECSIZE lanes, so that
  // every lane sees the same input signal:
  /*
  |d[0]| |d[1]| |d[2]|
  |d[0]| |d[1]| |d[2]|
  |d[0]| |d[1]| |d[2]| ...
  |d[0]| |d[1]| |d[2]|
  */

  src_vec = qb_malloc (sizeof(qb_vec_f_t) * scan_len);
  if (NULL == src_vec) {
    // the size expression is unsigned, so cast it to match %lld
    fprintf(QB_ERR, "E: Out of memory allocating vector buffer for span (%lld bytes).\n", (long long) (sizeof(qb_vec_f_t) * scan_len));
    return QB_E_MALLOC;
  }
  for (n=0; n < scan_len; n++) {
#ifdef QB_VECTORS_GCC_CLANG
    for (v=0; v < QB_VECSIZE; v++) {
      src_vec[n][v] = src_f[scan_start + n];
    }
#else // MSVC_AVX2
    src_vec[n] = _mm256_set1_ps(src_f[scan_start + n]);
#endif
  }

  // redundant with the explicit zeroing below; presumably kept to
  // silence uninitialised-variable warnings ("compilers are stupid")
  memset(&zeros, 0, sizeof(qb_vec_f_t));

#ifdef QB_VECTORS_GCC_CLANG
  for (v=0; v < QB_VECSIZE; v++) {
    zeros[v] = 0.0f;
  }
#else // MSVC_AVX2
  zeros = _mm256_setzero_ps();
  twos = _mm256_set1_ps(2.0f);
#endif

  // step by the lane count, so that no candidate frequency is skipped
  // or tested twice whatever QB_VECSIZE is
  for (f = (s32_t) roundf(freq_min); f <= freq_max; f += QB_VECSIZE) {

    // try QB_VECSIZE frequencies at once

    qb_vec_f_t powers;
    qb_vec_f_t two_cos_omegas;

    two_cos_omegas = zeros;

    // each vector element is 1 Hz faster than the previous one
    for (v=0; v < QB_VECSIZE; v++) {
#ifdef QB_VECTORS_GCC_CLANG
      two_cos_omegas[v] = cosf ((2.0f * QB_PI * (f + v)) / (float) rate);
#else // MSVC_AVX2
      ((float *) (&two_cos_omegas))[v] = cosf((2.0f * QB_PI * (f + v)) / (float) rate);
#endif
    }

#ifdef QB_VECTORS_GCC_CLANG
    two_cos_omegas *= 2.0f;
#else // MSVC_AVX2
    two_cos_omegas = _mm256_mul_ps (two_cos_omegas, twos);
#endif

    e = qb_goertzel_pwr_vector_new (src_vec, scan_len, &zeros, &two_cos_omegas, &powers);
    if (QB_E_OK != e) { break; } // reported to the caller after cleanup

    for (v=0; v < QB_VECSIZE; v++) {
      float p;
      // lanes of the final batch may lie beyond freq_max; they are
      // padding and must not win, matching the scalar implementation
      if ((f + v) > freq_max) { break; }
#ifdef QB_VECTORS_GCC_CLANG
      p = powers[v];
#else // MSVC_AVX2
      p = ((float *)&powers)[v];
#endif
      if (p > best_power) {
        best_power = p;
        best_freq = f + v;
      }
    }

  }

  qb_free(src_vec);

  // propagate a failed power computation instead of reporting
  // success with a partial result
  if (QB_E_OK != e) {
    return e;
  }

  *speed_out = best_freq / freq_nominal;

  return QB_E_OK;

}
#else // no vectors
/*
 * Estimate the tape speed of a span, testing one candidate frequency
 * at a time (non-vector build).
 *
 * Candidates are whole numbers of Hz from roundf(freq_min) up to and
 * including freq_max. The candidate for which qb_goertzel_pwr reports
 * the greatest power wins, and *speed_out receives
 * (winning frequency / freq_nominal). If no candidate produces a power
 * above zero, the winning frequency stays 0.
 *
 * Spans longer than QB_SPAN_TAPESPEED_SCAN_MAX_LEN samples are
 * curtailed: only the central QB_SPAN_TAPESPEED_SCAN_MAX_LEN samples
 * are scanned.
 *
 * Returns QB_E_OK on success, or the first error reported by
 * qb_goertzel_pwr (in which case *speed_out is not written).
 */
qb_err_t qb_span_measure_speed_scalar (s32_t rate,
                                       float *src_data,
                                       s64_t len,
                                       float freq_min,
                                       float freq_max,
                                       float freq_nominal,
                                       float *speed_out) {

  // long spans are curtailed to a window centred on the span's middle
  int curtailed = (len > QB_SPAN_TAPESPEED_SCAN_MAX_LEN);
  s64_t window_start = curtailed ? ((len - QB_SPAN_TAPESPEED_SCAN_MAX_LEN) / 2) : 0;
  s64_t window_len = curtailed ? QB_SPAN_TAPESPEED_SCAN_MAX_LEN : len;
  float peak_power = 0.0f;
  s32_t peak_freq = 0;
  s32_t hz;

  hz = (s32_t)roundf(freq_min);
  while (hz <= freq_max) {
    qb_err_t err;
    float omega;
    float pwr;

    // angular frequency of this candidate
    omega = (2.0f * QB_PI * hz) / (float)rate;

    err = qb_goertzel_pwr (src_data + window_start, window_len, omega, &pwr);
    if (QB_E_OK != err) {
      return err;
    }

    // strictly-greater: on a tie the lower frequency is kept
    if (pwr > peak_power) {
      peak_power = pwr;
      peak_freq = hz;
    }

    hz += 1;
  }

  *speed_out = peak_freq / freq_nominal;

  return QB_E_OK;

}
#endif // no vectors


/*
qb_err_t qb_serialise_tape_speeds (qb_span_t *spans,
                                   s32_t num_spans,
                                   u8_t **buf_out,
                                   size_t *buflen_out) {
                                   
  s32_t sn;
  float *speeds;
  //size_t len;
  u8_t *buf;
  qb_err_t e;
  
  // need one 32-bit integer to encode num spans, then
  //len = sizeof(s32_t) + (sizeof(float) * num_spans);
  
  speeds = qb_malloc (sizeof(float) * num_spans);
  if (NULL == speeds) {
    fprintf(QB_ERR, "E: Could not allocate memory for serialised tape speeds buffer.\n");
    return QB_E_MALLOC;
  }
  
  for (sn=0; sn < num_spans; sn++) {
    qb_span_t *span;
    span = spans + sn;
    speeds[sn] = span->speed;
  }
  
  buf = (u8_t *) speeds;
  
  e = qb_zlib_compress (buf, sizeof(float) * num_spans, 0, buf_out, buflen_out); // 0 = don't use gzip encoding
  
  qb_free(speeds);
  buf = NULL;
  speeds = NULL;
  
  return e;
  
}
*/
