/*
 *  Quadbike 2
 *  Copyright (C) 2023 'Diminished'

 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.

 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.

 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/

#include "build.h"

#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2

#include "vector.h"
#include "qb_err.h"
#include "util.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include "quadbike.h" // for QB_MAX_IPFILE_SAMPLES

// VECSTUFFS:
// wikipedia on AVX:
/*
AVX uses sixteen YMM registers to perform a single instruction on multiple pieces of data (see SIMD).
Each YMM register can hold and do simultaneous operations (math) on:

eight 32-bit single-precision floating point numbers or
four 64-bit double-precision floating point numbers.

FOR MSVC:

e.g. /arch:SSE2
/arch:AVX2 ??

// note little endian, so values stored backwards
// _mm256_set1_ps(float a)                        -- broadcast
// _mm256_setzero_ps (void)                       -- broadcast all zeros
// _mm256_mul_ps (__m256 a, __m256 b)             -- multiply floatwise
// _mm256_fmadd_ps (__m256 a, __m256 b, __m256 c) -- (a * b) + c
// _mm256_add_ps(a,b)                             -- add
// _mm256_sub_ps(a,b)                             -- subtract
// _mm256_load_ps (float const * mem_addr)        -- load vector from scalar memory
// _mm256_store_ps (float * mem_addr, __m256 a)   -- store vector to scalar memory */

#if 0
// Disabled (#if 0) manual smoke test for the swizzler.
// Fills a 64-float ramp (0..63), swizzles it with qb_vec_buf_init() using
// pre_overlap = 3 and post_overlap = 7, then prints the resulting buffer with
// one vector lane per output line.
// Returns the error from qb_vec_buf_init() if that fails, otherwise QB_E_OK.
// NOTE(review): v is never passed to qb_vec_buf_finish(), so the vector
// allocation would leak if this test were re-enabled -- confirm before use.
// NOTE(review): the v.v.f[x][y] indexing below is the form that
// qb_vec_buf_unswizzle() only uses under QB_VECTORS_GCC_CLANG.
qb_err_t qb_vec_test (void) {

  /*
  
  pre_overlap                                  post_overlap
   aka LHS                                       aka RHS
   |<-3->|      |<--- piece_len ---->|      |<------ 7 ------>|
   
   0  0  0      0  1  2  3  4  5  6  7      8  9 10 11 12 13 14
   5  6  7      8  9 10 11 12 13 14 15     16 17 18 19 20 21 22
  13 14 15     16 17 18 19 20 21 22 23     24 25 26 27 28 29 30
  21 22 23     24 25 26 27 28 29 30 31     32 33 34 35 36 37 38
  29 30 31     32 33 34 35 36 37 38 39     40 41 42 43 44 45 46
  37 38 39     40 41 42 43 44 45 46 47     48 49 50 51 52 53 54
  45 46 47     48 49 50 51 52 53 54 55     56 57 58 59 60 61 62
  53 54 55     56 57 58 59 60 61 62 63      0  0  0  0  0  0  0
                ^ zero_ix
  
  AFTER FILTERING THINGS WILL LOOK LIKE THIS
  (let's say filter delay is 2 samples)
  
                       next stage can use this
                       part as its input
  invalid            |--------------------------|
  XX XX XX     XX XX  0  1  2  3  4  5      6  7  8  9 10 11 12
  XX XX XX      6  7  8  9 10 11 12 13     14 15 16 17 18 19 20
  XX XX XX     14 15 16 17 18 19 20 21     22 23 24 25 26 27 28
  XX XX XX     22 23 24 25 26 27 28 29     30 31 32 33 34 35 36
  XX XX XX     30 31 32 33 34 35 36 37     38 39 40 41 42 43 44
  XX XX XX     38 39 40 41 42 43 44 45     46 47 48 49 50 51 52
  XX XX XX     46 47 48 49 50 51 52 53     54 55 56 57 58 59 60
  XX XX XX     54 55 56 57 58 59 60 61     62 63  0  0  0  0  0
               ^      ^
               |      |
          valid_ix   zero_ix
               |      |
               |------|
            we can consume
            this piece as
            part of the
            history section
            on the next filt0r
            
  
  
  
  */

  qb_err_t e;
  s64_t x,y;
  qb_vec_buf_t v;
  float d[64];
  
  // linear input ramp: d[x] == x, so every printed value identifies its source index
  for (x=0; x < 64; x++) { d[x] = (float) x; }
  
  // 64 samples, 3 vectors of pre-overlap, 7 of post-overlap, progress display on, no indent
  e = qb_vec_buf_init (&v, QB_VECTYPE_FLOAT, d, 64, 3, 7, 1, "");
  if (QB_E_OK != e) { return e; }
  
  // e is always QB_E_OK by this point; v.alloc is the total number of vectors allocated
  printf ("e = %d, v.alloc = %lld\n", e, v.alloc);
  
  // outer loop walks lanes, inner loop walks vectors: each output line is one lane
  for (y=0; y < QB_VECSIZE; y++) {
    for (x=0; x < v.alloc; x++) {
      printf ("%2lld ", (s64_t) v.v.f[x][y]);
    }
    printf("\n");
  }
  
  return QB_E_OK;
  
}
#endif // 0

// Flatten the main piece of a float vector buffer back into linear sample
// order, skipping the pre-/post-overlap vectors.
//
//   v          - source vector buffer; must be of type QB_VECTYPE_FLOAT
//   f_out      - receives a buffer allocated here with qb_malloc(), holding
//                (piece_len * QB_VECSIZE) floats; the caller takes ownership
//   dp         - display-progress flag, forwarded to the meter functions
//   indent     - text printed before "Unswizzling"
//   extra_text - text printed directly after "Unswizzling"
//
// Returns QB_E_OK on success, QB_E_BUG for a non-float buffer or a NULL
// f_out, and QB_E_MALLOC if the output buffer cannot be allocated.
// Only the float path exists; double-precision buffers are not supported.
qb_err_t qb_vec_buf_unswizzle (qb_vec_buf_t *v,
                               float **f_out,
                               u8_t dp, // display_progress
                               char *indent,
                               char *extra_text) {

  s64_t lane;

  // reject anything that is not a float buffer before touching f_out
  if (QB_VECTYPE_FLOAT != v->type) {
    fprintf(QB_ERR, "B: %s: bad vector buf type %u\n", QB_FUNC_M, v->type);
    return QB_E_BUG;
  }

  if (NULL == f_out) {
    fprintf(QB_ERR, "B: %s: f_out is NULL\n", QB_FUNC_M);
    return QB_E_BUG;
  }

  *f_out = qb_malloc (sizeof(float) * v->piece_len * QB_VECSIZE);
  if (NULL == *f_out) {
    fprintf(QB_ERR, "E: Out of memory allocating vector unswizzle buffer.\n");
    return QB_E_MALLOC;
  }

  printf("%sUnswizzling%s: ", indent, extra_text);
  qb_show_meter(dp);

  // lane L of the main piece holds linear samples [L * piece_len, (L+1) * piece_len)
  for (lane = 0; lane < QB_VECSIZE; lane++) {
    s64_t k; // offset within the main piece; the source vector is zero_ix + k
    for (k = 0; k < v->piece_len; k++) {
      s64_t dst = k + (lane * v->piece_len);
#ifdef QB_VECTORS_GCC_CLANG
      (*f_out)[dst] = v->v.f[v->zero_ix + k][lane];
#else
      // no vector subscripting here, so view the vector as an array of floats
      (*f_out)[dst] = ((float *) &(v->v.f[v->zero_ix + k]))[lane];
#endif
      qb_update_meter (dp,
                       dst,
                       v->piece_len * QB_VECSIZE,
                       1.0f,
                       0);
    }
  }

  qb_hide_meter(dp, 0);
  printf("done.\n");

  return QB_E_OK;

}

#include "util.h"

// Flatten the main piece of a float vector buffer into a newly allocated
// linear array of s16 samples (used for inspect file writing).
//
//   v      - source vector buffer; must be of type QB_VECTYPE_FLOAT
//   out    - receives a buffer allocated here with qb_malloc(), holding
//            (piece_len * QB_VECSIZE) samples; the caller takes ownership.
//            Left as NULL whenever an error is returned.
//   scaler - each float is multiplied by this before being passed to
//            qb_float_to_s16()
//
// Returns QB_E_OK on success, QB_E_BUG for a NULL out or a non-float buffer,
// and QB_E_MALLOC if the output buffer cannot be allocated.
qb_err_t qb_vec_buf_unswizzle_to_s16 (qb_vec_buf_t *v, s16_t **out, float scaler) {

  s64_t j;
  
  if (NULL == out) {
    fprintf(QB_ERR, "B: %s: out is NULL\n", QB_FUNC_M);
    return QB_E_BUG;
  }
  
  // never leave the caller holding a stale pointer on a failure path
  *out = NULL;
  
  // same policy as qb_vec_buf_unswizzle(): an unsupported buffer type is a
  // bug, not a success that hands back an unfilled buffer
  if (QB_VECTYPE_FLOAT != v->type) {
    fprintf(QB_ERR, "B: %s: bad vector buf type %u\n", QB_FUNC_M, v->type);
    return QB_E_BUG;
  }
  
  *out = qb_malloc (sizeof(s16_t) * v->piece_len * QB_VECSIZE);
  if (NULL == *out) {
    fprintf(QB_ERR, "E: Out of memory allocating vector unswizzle buffer.\n");
    return QB_E_MALLOC; // previously fell through and reported QB_E_OK
  }
  
  // lane j of the main piece holds linear samples [j * piece_len, (j+1) * piece_len)
  for (j=0; j < QB_VECSIZE; j++) {
    s64_t i;
    for (i = v->zero_ix; i < (v->zero_ix + v->piece_len); i++) {
      // view the vector as an array of floats so this works on every compiler
      (*out)[i + (j * v->piece_len) - v->zero_ix] = qb_float_to_s16 (((float *) &(v->v.f[i]))[j] * scaler);
    }
  }
  
  return QB_E_OK;
  
}


// Initialise a vector buffer and, if input data is supplied, swizzle it in.
//
// The linear input is split into QB_VECSIZE equal pieces, one per vector
// lane. Each lane is padded on the left with pre_overlap vectors and on the
// right with post_overlap vectors, which are filled with copies of the
// samples belonging to the neighbouring lanes.
//
//   v            - buffer to initialise; zeroed first, so any allocation it
//                  already owns is NOT freed here
//   vectype      - QB_VECTYPE_FLOAT is the only type that gets an allocation
//   data         - linear input (float *), or NULL to just allocate a zeroed
//                  buffer
//   linear_len   - number of samples in data
//   pre_overlap  - number of extra vectors before the main piece
//   post_overlap - number of extra vectors after the main piece
//   dp           - display-progress flag, forwarded to the meter functions
//   indent_s     - text printed before "Swizzling" (only used if data != NULL)
//
// Returns QB_E_OK on success; QB_E_BUG for an unsupported vectype when data
// is non-NULL (v is left untouched); QB_E_SNDFILE_EXCESSIVE_FRAMES if the
// computed allocation is negative or >= QB_MAX_IPFILE_SAMPLES; QB_E_MALLOC
// if allocation fails. For the last two, v is zeroed before returning.
//
// NOTE(review): with data == NULL the vectype is not validated, so a
// non-float vectype returns QB_E_OK with no buffer allocated -- confirm no
// caller relies on this.
qb_err_t qb_vec_buf_init (qb_vec_buf_t *v,
                          u8_t vectype,
                          void *data, // may be NULL
                          s64_t linear_len,
                          s64_t pre_overlap,
                          s64_t post_overlap,
                          u8_t dp, // display progress
                          char *indent_s) {

  float *fd;
  s64_t n;
  qb_err_t err;
  s64_t rounded_up_linear_len;
  
  err = QB_E_OK;
  fd = NULL;
  
  if (NULL != data) {
    if (QB_VECTYPE_FLOAT == vectype) {
      fd = (float *) data;
    } else {
      fprintf(QB_ERR, "B: %s: bad vectype %u\n", QB_FUNC_M, vectype);
      return QB_E_BUG;
    }
  }
  memset(v, 0, sizeof(qb_vec_buf_t));
  v->type = vectype;
  v->linear_len = linear_len;
  
  // assume V is vector size, = 8
  // assume H is history len (pre_overlap), R is post_overlap
  
  // A = ((ip_len - 1) | (V-1)) + 1    input length rounded up to a multiple of V
  // B = A / V                         length of each lane's main piece
  // C = B + H + R                     add space for the left-side (H) and right-side (R) values
  
  rounded_up_linear_len = ((linear_len - 1) | (QB_VECSIZE - 1)) + 1;    // A = ((ip_len - 1) | (V-1)) + 1
  v->piece_len          = rounded_up_linear_len / QB_VECSIZE;         // B = A / V
  v->alloc              = v->piece_len + pre_overlap + post_overlap;  // C = B + H + R
  
  do {
  
    if (v->alloc >= QB_MAX_IPFILE_SAMPLES || (v->alloc < 0)) {
      fprintf(QB_ERR, "E: %s: vector data alloc too large\n", QB_FUNC_M);
      err = QB_E_SNDFILE_EXCESSIVE_FRAMES;
      break;
    }
    
    if (QB_VECTYPE_FLOAT == vectype) {
      v->v.f = qb_malloc (sizeof(qb_vec_f_t) * v->alloc);
      if (NULL == v->v.f) {
        err = QB_E_MALLOC;
      } else {
        // only zero the buffer once we know the allocation succeeded;
        // previously this memset ran on a NULL pointer when qb_malloc failed
        memset(v->v.f, 0, (sizeof(qb_vec_f_t) * v->alloc));
      }
    }
    if (QB_E_OK != err) {
      fprintf(QB_ERR, "E: Out of memory allocating vector data.\n");
      err = QB_E_MALLOC;
      break;
    }
    
    v->zero_ix  = pre_overlap;
    
    // how do we decide where to place each value of the linear input into
    // the vectors? let's call the linear input index I, and the
    // corresponding input value X
    
    // F =  I % B      ( offset within the main piece )
    // D =  F + H      ( which vector? )
    // E =  I / B      ( which lane?   )
    
    // so for the main piece,
    // vectors[D][E] = X
    
    // handle duplications:
    //   if F >= (B - H), also copy to the left overlap:  vectors[D - B][E + 1]
    //   if F <  R,       also copy to the right overlap: vectors[D + B][E - 1]
    
    if (data != NULL) {
    
      printf("%sSwizzling: ", indent_s);
      fflush(stdout);
      qb_show_meter(dp);
      
      if (QB_VECTYPE_FLOAT == vectype) {
        // zero the elements that won't be filled in by the main loop:
        // lane 0 has no predecessor, and the last lane has no successor.
        // (redundant with the memset above, which has already zeroed the
        // whole allocation)
        for (n=0; n < pre_overlap; n++) {
          ((float *) &(v->v.f[n]))[0] = 0.0f;
        }
        for (n=(pre_overlap + v->piece_len); n < (pre_overlap + post_overlap + v->piece_len); n++) {
          ((float *) &(v->v.f[n]))[QB_VECSIZE - 1] = 0.0f;
        }
        // load float data
        // each input value needs to be written to 1, 2 or 3 places
        for (n=0; n < linear_len; n++) {
          s64_t d, e, f;
          f = (n % v->piece_len); // indexed to start of main piece
          d = pre_overlap + f;    // indexed to start of buffer
          e = n / v->piece_len;   // lane (for main piece)
          // main piece
          ((float *) &(v->v.f[d]))[e] = fd[n];
          // duplicate to left side of the next lane
          if ( (f >= (v->piece_len - pre_overlap)) && (e < (QB_VECSIZE - 1)) ) { // don't write to lane beyond end of vector
            ((float *) &(v->v.f[d - v->piece_len]))[e+1] = fd[n];
          }
          // duplicate to right side of the previous lane
          if ( (f < post_overlap) && (e > 0) ) {
            ((float *) &(v->v.f[d + v->piece_len]))[e-1] = fd[n];
          }
          qb_update_meter (dp, n, linear_len, 1.0f, 0);
        }
      }
      
      qb_hide_meter(dp, 0);
      printf("done.\n");
      
    }
    
  } while (0);
  
  if (QB_E_OK != err) {
    // be sure we don't leave vecbuf in a half-and-half state
    memset(v, 0, sizeof(qb_vec_buf_t));
  }
  
  return err;
  
}


// Release the storage owned by a vector buffer and reset the whole struct to
// zero. A NULL v is ignored. Only QB_VECTYPE_FLOAT buffers own an allocation;
// for any other type (e.g. invalid) the buffer pointer is assumed to be NULL
// and nothing is freed.
void qb_vec_buf_finish (qb_vec_buf_t *v) {
  if (NULL != v) {
    if ((QB_VECTYPE_FLOAT == v->type) && (NULL != v->v.f)) {
      qb_free(v->v.f);
    }
    // wipe type, pointer and lengths so the struct can't be used or freed again
    memset(v, 0, sizeof(*v));
  }
}


/*
void qb_vec_buf_debug_print (qb_vec_buf_t *v) {
  s64_t j, k;
  u8_t isf;
  isf = QB_VECTYPE_FLOAT==v->type;
  printf("VECBUF:\n");
  printf("  type = %s, alloc = %lld, zero_ix = %lld\n" // valid_ix = %lld,
         "  piece_len = %lld, linear_len = %lld\n",
         isf ? "float" : "double",
         v->alloc,
         v->zero_ix,
        //v->valid_ix,
         v->piece_len,
         v->linear_len);
  for (k=0; k < 3; k++) {
    s64_t off;
    if (0==k) {
      off=0;
      printf("  start (0):\n");
    //} else if (1==k) {
    //  off = v->valid_ix;
    //  printf("  valid (%lld):\n", v->valid_ix);
    } else if (1==k) {
      off = v->zero_ix;
      printf("  zero (%lld):\n", v->zero_ix);
    } else {
      off = v->zero_ix + 1000000;
      printf("  zero + 1000000 (%lld):\n", v->zero_ix + 1000000);
    }
    for (j=0; j < QB_VECSIZE; j++) {
      s64_t i;
      printf("    ");
      for (i=0; i < 8; i++) {
        printf("|%+.3lf| ", (double) (isf ? v->v.f[i+off][j] : v->v.d[i+off][j]));
      }
      printf("\n");
    }
  }
}
*/
  
#endif // QB_VECTORS_GCC_CLANG || QB_VECTORS_MSVC_AVX2
