/*
 *  Quadbike 2
 *  Copyright (C) 2025 'Diminished'

 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.

 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.

 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/

#include "build.h"

#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
#include "fir_vec2.h"
#endif

#include "process.h"
#include "pll.h"
#include "goertzel.h"
#include "util.h"
#include "csw.h"
#include "qbio.h"
#include "sync_pll.h"
#include "sync_goertzel.h"
#include "sync_walk.h"
#include "sync.h"
#include "tapespeed.h"
#include "errcorr.h"
#include "atom.h"
#include "span.h"
#include "inspect.h"

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#ifndef QB_WINDOWS
#include <unistd.h>
#endif

static void print_odd_run_line (s64_t data0,
                                s64_t data1,
                                s64_t squawk0,
                                s64_t squawk1,
                                u8_t have_fix,
                                u8_t after_fix) ;

static qb_err_t eliminate_empty_data_spans (qb_source_t *src);

static qb_err_t determine_span_phases (s64_t srclen_smps,
                                       qb_pll_carrier_t carriers[4],
                                       qb_wave_feature_t *feats,
                                       s64_t num_feats,
                                       s32_t num_spans,
                                       s32_t rate,
                                       u8_t do_pll_phase,
                                       u8_t do_walk_phase,
                                       u8_t phase_scheme,
                                       u8_t sync_method,
                                       u8_t phase_detect_method,
                                       //qb_inspect_t *inspect,
                                       u8_t display_progress,
                                       qb_span_t *spans_inout);

static s8_t get_carrier_shift_for_phase_ix_and_rate (u8_t phase_ix, s32_t rate);
static void free_pll_carriers (qb_pll_carrier_t carriers[4]);
static qb_err_t sanity_check_spans (qb_span_t *spans, s32_t num_spans, u8_t caller_id);


static qb_err_t process_buffer_get_atoms  (float *src_f,
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
                                            qb_vec_buf_t *src_v, // }
#endif
                                            s64_t srclen_smps, // in vector mode, this is set to 0
                                            s32_t rate,
                                            u8_t sync_method,
                                            u8_t error_correction_method,
                                            u8_t no_generic_odd_run_fix, // both PLL cycle insertion, and freq mode even-run constraint
                                            u8_t have_forced_delay,
                                            s8_t forced_delay,
                                            u8_t phase_scheme, // INPUT phase scheme
                                            float scaler_for_2400_power,
                                            qb_inspect_t *inspect,
                                            u8_t display_progress,
                                            u8_t phase_detect_method,
                                            u8_t verbose,
                                            double silence_on_frac,
                                            double signal_on_frac,
                                            double summed_silence_on_frac,
                                            qb_span_t **spans_out,
                                            s32_t *num_spans_out,
                                            float *max_powerX_out);
                                                 
static qb_err_t fetch_atoms(u8_t use_right_channel,
                            u8_t sync_method,
                            u8_t error_correction_method,
                            u8_t no_generic_odd_run_fix,
                            u8_t have_forced_shift,
                            s8_t forced_shift,
                            u8_t phase_scheme,
                            float scaler_for_2400_power,
                            double silence_on_frac,
                            double signal_on_frac,
                            double summed_silence_on_frac,
                            qb_inspect_t* inspect,
                            u8_t display_progress,
                            u8_t phase_method,
                            u8_t verbose,
                            qb_source_t* src);

                                  
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
static qb_err_t qb_run_prefilter_vector  (s32_t sample_rate,
                                          qb_vec_buf_t *vbuf,   // data modified (filtered)
                                          qb_inspect_t *inspect,
                                          u8_t display_progress);
static qb_err_t generate_carriers_vector (qb_vec_buf_t *src_v,
//                                          s64_t srclen_smps,
                                          qb_vec_buf_t *src_freq_doubled_vec,
                                          u8_t generate_carriers[4],
                                          s32_t rate,
                                          qb_pll_carrier_t carriers_out[4],
                                          u8_t display_progress,
                                          qb_inspect_t *inspect);
static void normalise_vector (qb_vec_buf_t *vbuf, u8_t display_progress);
static qb_err_t make_squared_pre_carrier_vector (qb_vec_buf_t *src_v,
                                                 qb_inspect_t *inspect,
                                                 u8_t dp, // display_progress
                                                 qb_vec_buf_t *squared_out);
#else
static qb_err_t qb_run_prefilter_scalar  (s32_t sample_rate,
                                          float *src_f,         // data modified (filtered)
                                          s64_t *srclen_inout,  // modified (shortened by filter delay)
                                          qb_inspect_t *inspect,
                                          u8_t display_progress);
static qb_err_t generate_carriers_scalar  (float *src_f,
                                           s64_t srclen_smps,
                                           float *src_freq_doubled,
                                           u8_t generate_carriers[4],
                                           s32_t rate,
                                           qb_pll_carrier_t carriers_out[4],
                                           u8_t display_progress,
                                           qb_inspect_t *inspect);
static void normalise_scalar (float *buf_inout, s64_t len, u8_t display_progress);
static qb_err_t make_squared_pre_carrier_scalar (float *src_f,
                                                 s64_t srclen_smps,
                                                 qb_inspect_t *inspect,
                                                 u8_t dp,
                                                 float **squared_out);
#endif

static void gauge_need_for_feats_and_carriers (u8_t sync_method,
                                               u8_t phase_scheme,
                                               u8_t phase_detect_method,
                                               u8_t *do_walk_phase_out,
                                               u8_t *do_pll_phase_out,
                                               u8_t *generate_features_out,
                                               u8_t generate_carriers_out[4],
                                               u8_t *need_freq_doubled_out) ;


static qb_err_t fetch_atoms  (u8_t use_right_channel,
                               u8_t sync_method,
                               u8_t error_correction_method,
                               u8_t no_generic_odd_run_fix,
                               u8_t have_forced_shift,
                               s8_t forced_shift,
                               u8_t phase_scheme,
                               float scaler_for_2400_power,
                               double silence_on_frac,
                               double signal_on_frac,
                               double summed_silence_on_frac,
                               qb_inspect_t *inspect,
                               u8_t display_progress,
                               u8_t phase_method,
                               u8_t verbose,
                               qb_source_t *src) {
                                    
  qb_err_t e;
  float *buf;
  
  qb_pcm_t *pcm;
  
  e = QB_E_OK;
  pcm = &(src->pcm);
  
  if ( ! use_right_channel ) {
    //buf = pcm->buf_L;
    printf ("  Left channel:\n");
  } else {
    //buf = pcm->buf_R;
    printf ("  Right channel:\n");
  }
  
  buf = pcm->buf;
  e = process_buffer_get_atoms  (buf,
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
                                  &(pcm->vbuf),
#endif
                                  pcm->buflen,
                                  pcm->rate,
                                  sync_method,
                                  error_correction_method,
                                  no_generic_odd_run_fix,
                                  have_forced_shift,
                                  forced_shift,
                                  phase_scheme,
                                  scaler_for_2400_power,
                                  inspect,
                                  display_progress,
                                  phase_method,
                                  verbose,
                                  silence_on_frac,
                                  signal_on_frac,
                                  summed_silence_on_frac,
                                  &(src->spans),
                                  &(src->num_spans),
                                  &(src->max_powerX));
                                        
  src->right_channel = use_right_channel ? 0 : 1;
  src->len_smps = pcm->buflen;
  
  if (QB_E_OK != e) {
    // buffer failed
    src->len_smps = 0;
    return e;
  }
    
  return QB_E_OK;
  
}



#include "audio.h"
#include "polarity.h"
#include "tibet.h"

// need argc and argv so that we can put them into the TIBET file
qb_err_t qb_process_main (qb_t *qb, int argc, char **argv2) {

  qb_err_t e;
  qb_source_t src;
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
  float *filtered_scalar;
#endif
  
  memset(&src, 0, sizeof(qb_source_t));

  e = qb_decode_audio (qb->ipfile_buf,
                       qb->ipfile_buf_len,
                       qb->ipfile_path,
                       qb->use_right_channel,
                       &(src.pcm),
                       ! qb->quieter);
  if (QB_E_OK != e) { return e; }
  
  // 2.0:
  qb_free(qb->ipfile_buf);
  qb->ipfile_buf = NULL;
  qb->ipfile_buf_len = 0;
  
  // sanity
  if ( QB_E_OK != (e = qb_test_rate_legal (src.pcm.rate) ) ) {
    fprintf(QB_ERR, "B: %s: src.pcm.rate is illegal (%d)\n", QB_FUNC_M, src.pcm.rate);
    qb_finish_source(&src);
    return e;
  }
  
  qb->sample_rate = src.pcm.rate;
  printf("  Sample rate is %d.\n", qb->sample_rate);
  
  // have to do this after load, so we know what the sample rate is
  
  if (qb->inspect.enabled) {
    e = qb_inspect_init (&(qb->inspect), qb->sample_rate);
    if (QB_E_OK != e) {
      qb_finish_source(&src);
      return e;
    }
  }
  
  if (qb->do_filtering) {
    do {
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
      e = qb_run_prefilter_vector (src.pcm.rate, &(src.pcm.vbuf), &(qb->inspect), ! qb->quieter);
      if (QB_E_OK != e) { break; }
      e = qb_vec_buf_unswizzle (&(src.pcm.vbuf), &filtered_scalar, /*NULL,*/ ! qb->quieter, "  ", "");
      if (QB_E_OK != e) { break; }
      // replace scalar copy too
      qb_free(src.pcm.buf);
      src.pcm.buf = filtered_scalar;
#else
      // shortens pcm->buflen by filter delay
      // (data are moved backwards in the buffer to compensate for filter delay)
      e = qb_run_prefilter_scalar (src.pcm.rate,
                                   src.pcm.buf,
                                   &(src.pcm.buflen),
                                   &(qb->inspect),
                                   ! qb->quieter);
#endif
    } while (0);
    if (QB_E_OK != e) {
      qb_finish_source(&src);
      return e;
    }
  }
    
  if (qb->normalise) {
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
    normalise_vector (&(src.pcm.vbuf), ! qb->quieter);
#else
    normalise_scalar (src.pcm.buf, src.pcm.buflen, ! qb->quieter);
#endif

  }
  
  do {
  
    s32_t sn;
    s16_t previous_atom_value;
    s64_t prev_smpnum, prev_smpnum_2;
    qb_atom_t *atom0;

    // this function actually does the overwhelming majority of the work:
    e = fetch_atoms (qb->use_right_channel,
                     qb->sync_method,
                     qb->err_correction_strategy,
                     qb->no_generic_odd_run_fix,
                     qb->have_forced_shift,
                     qb->forced_shift,
                     qb->phase_scheme,
                     qb->scaler_for_2400_power,
                     qb->silence_on_frac,
                     qb->signal_on_frac,
                     qb->summed_silence_on_frac,
                     &(qb->inspect),
                     ! qb->quieter,
                     qb->phase_detect_method,
                     qb->verbose,
                     &src);   // populated

    // finished with input audio data
    qb_finish_pcm (&(src.pcm));
    
    if (QB_E_OK != e) { break; }
    
//printf("w"); fflush(stdout);
    
    e = eliminate_empty_data_spans (&src);
    if (QB_E_OK != e) { break; }
    
//printf("x"); fflush(stdout);

    // populate atom->value field:
    e = qb_atoms_assign_values (&src, ! qb->quieter);
    if (QB_E_OK != e) { break; }
    
//printf("y"); fflush(stdout);
    
    if (qb->inspect.enabled) {
      previous_atom_value = QB_DBG_HALFBIT_VALUE_SILENCE;
      prev_smpnum   = 0;
      prev_smpnum_2 = 0;
      for (sn=0; sn < src.num_spans; sn++) {
        qb_span_t *span0, *span1;
        span1 = NULL;
        span0 = src.spans + sn;
        if (sn < (src.num_spans - 1)) {
          span1 = span0 + 1;
        }
        e = qb_inspect_append_bitsamples (span0,
                                          span1, // or NULL
                                          &prev_smpnum, // updated
                                          sn,
                                          qb->inspect.files + QB_INSPECT_FILE_IX_CYCLE_TIMES);
        e = qb_inspect_append_halfbit_values (span0,
                                              span1, // or NULL
                                              sn,
                                              qb->inspect.files + QB_INSPECT_FILE_IX_CYCLE_VALUES,
                                              &previous_atom_value,
                                              &prev_smpnum_2); // updated
        if (QB_E_OK != e) { break; }
      }
      if (QB_E_OK != e) { break; }
    } // endif (inspect)
    
    if (0 == src.num_spans) {
      fprintf(QB_ERR, "E: no spans\n");
      e = QB_E_NO_SIGNAL;
      break;
    }
    
//printf("z"); fflush(stdout);
    
    // hax0r the first cycle so that it starts at zero
    atom0 = src.spans[0].atoms + 0;
    if (atom0->sample_num > 0) {
      fprintf(QB_ERR, "  W: nudging first atom: [%lld] -> [0]\n",
              atom0->sample_num);
      atom0->sample_num = 0;
    }
    
//printf("a"); fflush(stdout);

#ifdef QB_SANITY
    // make sure -s freq output sets invalid phase on all spans
    if (QB_SYNC_METHOD_GOERTZEL == qb->sync_method) {
      for (sn=0; sn < src.num_spans ; sn++) {
        qb_span_t *span;
        span = src.spans + sn;
        if (span->detected_input_phase_ix != -1) {
          fprintf(QB_ERR, "B: %s: freq sync method, but span %u has a valid phase ix (%d, should be -1)\n",
                  QB_FUNC_M, sn, span->detected_input_phase_ix);
          e = QB_E_BUG;
          break;
        }
      }
      if (QB_E_OK != e) { break; }
    }
#endif
    
    if (NULL != qb->tibet_opfile) {
      e = qb_write_tibet (src.spans,
                          src.num_spans,
                          qb->sample_rate,
                          qb->allow_overwrite,
                          qb->tibet_opfile,
                          0, // gzip
                          argc,
                          argv2);
    }
    if (QB_E_OK != e) { break; }
    
    if (NULL != qb->tibet_z_opfile) {
      e = qb_write_tibet (src.spans,
                          src.num_spans,
                          qb->sample_rate,
                          qb->allow_overwrite,
                          qb->tibet_z_opfile,
                          1, // gzip
                          argc,
                          argv2);
    }
    if (QB_E_OK != e) { break; }
    
//printf("b"); fflush(stdout);
    
    if (NULL != qb->csw_opfile) {
    
      // this needs to happen AFTER the call to qb_atoms_assign_values(),
      // because reversing the polarity entails adding a fake 0-valued atom, which
      // is just a single pulse. If atom values continued to be determined by measuring
      // instantaneous Goertzel power as was the case before, then a remedial atom added to
      // the start of a leader span would be measured as a '1'. The polarity
      // reversal would fail, as a '1' is not a single pulse, but a pair of pulses.
      
      if ( ! qb->no_csw_polarity_correction ) {
        e = qb_spans_scan_polarities (src.spans,
                                      src.num_spans,
                                      NULL,
                                      1, // print output, and mark spans for alteration
                                      qb->verbose);
        if (QB_E_OK != e) { break; }
        
        // NOTE: from this point on, silent spans can invert polarity (and contain 2-3 pulses),
        // and leader spans can also invert polarity (and contain '0'-cycles, to swap polarity;
        // remember, 1-cycles are always written as pulse pairs and 0-cycles are written as
        // single pulses)

        e = qb_correct_polarities (src.spans,
                                   src.num_spans,
                                   qb->sample_rate,
                                   qb->ipfile_buf_len,
                                   qb->verbose);
        if (QB_E_OK != e) { break; }
      }
      
//printf("e"); fflush(stdout);

      e = qb_write_csw (qb->csw_opfile,
                        src.spans,
                        src.num_spans,
                        qb->sample_rate,
                        src.len_smps,
                        qb->allow_overwrite);
                        // ! qb->quieter);
      if (QB_E_OK != e) { break; }
                     
    } // endif (write CSW)

  } while (0);
  
  qb_finish_source(&src);

  return e;

}



static qb_err_t eliminate_empty_data_spans (qb_source_t *src) {
  s32_t sn;
  for (sn=0; sn < src->num_spans; sn++) {
    qb_span_t *span;
    span = src->spans + sn;
    if ((QB_SPAN_TYPE_DATA == span->type) && (0 == span->num_atoms)) {
      fprintf(QB_ERR, "  W: [%lld]: Removing empty data span\n",
              qb_get_span_start_accurate_or_rough(span));
      // xxxxxxx, len 7
      //   ^=2
      // xxXXXX_
      memmove (src->spans + sn,
               src->spans + sn + 1,
               sizeof(qb_span_t) * (src->num_spans - (sn + 1)));
      (src->num_spans)--;
    }
  }
  return QB_E_OK;
}




static qb_err_t sanity_check_spans (qb_span_t *spans, s32_t num_spans, u8_t caller_id) {

  s32_t i;

  if (spans[0].start != 0) {
    fprintf(QB_ERR, "B: %s: span 0 doesn't start at 0 smps! (%lld smps)\n", QB_FUNC_M, spans[0].start);
    return QB_E_BUG;
  }

  for (i=0; i < num_spans; i++) {
  
    qb_span_t *sp;
    
    sp = spans + i;
    
    if (sp->num_atoms < 0) {
      fprintf(QB_ERR, "B: %s <%d>: num_cycs < 0 (%d)\n", QB_FUNC_M, caller_id, sp->num_atoms);
      return QB_E_BUG;
    }
    
if ((i>0) && (sp->type == (sp-1)->type)) {
  fprintf(QB_ERR, "W: %s (caller id %u); spans %d and %d have same type\n",
          QB_FUNC_M, caller_id, i, i-1);
}
    
    //if (sp->atoms == NULL) {
    //  fprintf(QB_ERR, "B: sanity_check_spans (%d): cycs == NULL\n", sanity_check_spans_count);
    //  return QB_E_BUG;
    //}
    if (    (i<(num_spans-1))
         && ((sp+1)->num_atoms>0)
         && (sp->num_atoms>0)
         && ((sp+1)->atoms[0].sample_num < (sp->atoms[sp->num_atoms-1].sample_num))) {
      fprintf(QB_ERR, "B: %s: <%d>: span %d first cycle (%lld) occurs prior to span %d final cycle (%lld)\n",
              QB_FUNC_M,
              caller_id,
              i+1,
              (sp+1)->atoms[0].sample_num,
              i,
              sp->atoms[sp->num_atoms-1].sample_num);
              //(sp+1)->atoms[0].sample_num, sp->atoms[sp->num_atoms-1].sample_num);
      return QB_E_BUG;
    }
    
  }
  
  return QB_E_OK;
  
}


u16_t qb_get_phase_for_phase_ix (s8_t ix) {
  if ((ix<0) || (ix>3)) {
    return 0xffff;
  }
  return ((u16_t) ix) * 90;
}

char qb_get_char_for_phase_ix (s8_t p) {
  if      (p==0) { return 'a';  }
  else if (p==1) { return 'b';  }
  else if (p==2) { return 'c'; }
  else if (p==3) { return 'd';  }
  else           { return '?';  }
}

static s8_t get_carrier_shift_for_phase_ix_and_rate (u8_t phase_ix, s32_t rate) {
  
  // values determined by experiment
  // range is -9 to 9
  
  if (22050 == rate) {
    if (0 == phase_ix) {
      return -7;
    } else if (1 == phase_ix) {
      return 8;
    } else if (2 == phase_ix) {
      return 6;
    } else {
      return -5;
    }
  } else if (48000 == rate) {
    if (0 == phase_ix) {
      return 5;
    } else if (1 == phase_ix) {
      return -1;
    } else if (2 == phase_ix) {
      return -5;
    } else {
      return 9;
    }
  } else if (44100 == rate) {
    if (0 == phase_ix) { // 0
      return 4;
    } else if (1 == phase_ix) { // 90
      return 0; // (was -1)
    } else if (2 == phase_ix) { // 180
      return -5;
    } else { // 270
      return -9; // (was 9)
    }
  }
  return 0;
}


static qb_err_t trim_data_spans_start (qb_span_t *spans, s32_t *num_spans_inout) {
  // eliminate any 1-cycles at start of data spans, possibly delete those spans if now empty
  s32_t sn;
  for (sn=1; sn < *num_spans_inout; sn++ ) {
    qb_span_t *span;
    s32_t an;
    span = spans + sn;
    // need leader, data:
    if (QB_SPAN_TYPE_DATA != span->type) {
      continue;
    }
    if (QB_SPAN_TYPE_LEADER != (span-1)->type) {
      continue;
    }
    for (an=0; (an < span->num_atoms) && (qb_atom_get_value_bool(span->atoms + an) != 0) ; an++)
      ;
    if (0 == an) {
      continue; // no change
    }
    // erase leading 1-cycles
    memmove (span->atoms,
             span->atoms + an,
             sizeof(qb_atom_t) * (span->num_atoms - an));
    span->num_atoms -= an;
    //span->start = span->atoms[0].sample_num;
    //if (sn > 0) {
    //  (span-1)->len = span->atoms[0].sample_num - (span-1)->atoms[0].sample_num;
    //}
    
    if (0 == span->num_atoms) {
      // in fact I removed all the putty
      printf("    [%lld] Removing now-empty data span #%d.\n",
             qb_get_span_start_accurate_or_rough(span), sn);
      (span-1)->len += span->len;
      qb_free(span->atoms);
      memmove(spans + sn,
              spans + sn + 1,
              sizeof(qb_span_t) * ((*num_spans_inout) - (1 + sn)));
      (*num_spans_inout)--;
    }
    
  }
  return QB_E_OK;
}


/*
static qb_err_t scan_and_convert_empty_data_spans (qb_span_t *spans, s32_t num_spans) {
  s32_t sn;
  for (sn=0; sn < num_spans; sn++) {
    qb_span_t *span;
    s32_t an;
    span = spans + sn;
    if (QB_SPAN_TYPE_DATA != span->type) {
      continue;
    }
    if (0 == span->num_atoms) {
      fprintf(QB_ERR, "B: [%lld]: scan_and_convert_empty_data_spans: no cycs on span #%d\n",
              qb_get_span_start_accurate_or_rough(span), sn);
      return QB_E_BUG;
    }
    for (an=0; (an < span->num_atoms) && ( ! qb_atom_get_value_bool(span->atoms + an) ); an++)
      ;
    if (an >= span->num_atoms) {
      // empty data span -- no zero-cycles!
      printf("M: [%lld] Data span #%d only contained 1-cycles; converting it to leader\n",
             qb_get_span_start_accurate_or_rough(span), sn);
      qb_span_set_type(span, QB_SPAN_TYPE_LEADER);
      // delete all the cycles
      qb_free(span->atoms);
      span->atoms = NULL;
      span->num_atoms = 0;
    }
  }
  return QB_E_OK;
}
*/

/*

              QUALIFIER  |           OPERATION             |         INPUT
-------------------------+---------------------------------+-------------------------
               [scalar]  | qb_malloc Goerztel                 |
                         |                                 |
                         | oversample Goertzel             | scalar PCM / vector PCM
                         |                                 |
               [vector]  | qb_malloc & unswizzle Goertzel,    | vector Goertzel
                         | free swizzled Goertzel          |
                         |                                 |
              [inspect]  | inspect Goertzel                | scalar Goertzel
                         |                                 |
                         | mark spans                      | scalar PCM
                         |                                 |
                         | tape speeds                     | scalar PCM
                         |                                 |
                         | carrier & features logic        |
                         |                                 |
     [walk, walk-phase]  | generate features               | scalar PCM
                         |                                 |
       [PLL, PLL-phase]  | square PCM                      | vector PCM
                         |                                 |
       [PLL, PLL-phase]  | generate carriers               | vector PCM
                         |                                 |             (DONE WITH VECTOR PCM HERE unless PLL vectorised)
                         |                                 |
[PLL-phase, walk-phase]  | measure phases                  | [walk] features list
                         |                                 | [PLL]  scalar PCM, ... or vector PCM, if PLL can be vectorised
                         |                                 |
                         | get sync cycle times            | [walk] features list
                         |                                 | [PLL]  scalar PCM, ... or vector PCM, if PLL can be vectorised
                         |                                 | [freq] scalar Goertzel
                         |                                 |             (DONE WITH FEATURES LIST HERE)
                         |                                 |
                 [walk]  | interpolate missing cycles      | cycles list
                         |                                 |
                         | measure power                   | cycles list, scalar Goertzel
                         |                                 |
                         | amalgamate spans                | cycles list
                         |                                 |
                         | -e error correction             | cycles list, scalar Goerzel (?) }
                         |                                 |                                 } ??? FIXME? do we measure power
                         | [PLL] cycle insertion           |                                 } with these error correction modes???? CHECK
                         |                                 |
                         | populate leader spans           |
                         |                                 |
              [inspect]  | span types                      |
                         |                                 |
                         | populate silent spans (2 cycs)  |
                         |                                 |
                         | clean up                        |
                         |                                 |
*/


// FIXME: this is a monster, refactor the hell out of it
static qb_err_t process_buffer_get_atoms  (float *src_f,
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
                                            qb_vec_buf_t *src_v, // }
#endif
                                            s64_t srclen_smps, // in vector mode, this is set to 0
                                            s32_t rate,
                                            u8_t sync_method,
                                            u8_t error_correction_method,
                                            u8_t no_generic_odd_run_fix, // both PLL cycle insertion, and freq mode even-run constraint
                                            u8_t have_forced_delay,
                                            s8_t forced_delay,
                                            u8_t phase_scheme, // INPUT phase scheme
                                            float scaler_for_2400_power,
                                            qb_inspect_t *inspect,
                                            u8_t display_progress,
                                            u8_t phase_detect_method,
                                            u8_t verbose,
                                            double silence_on_frac,
                                            double signal_on_frac,
                                            double summed_silence_on_frac,
                                            qb_span_t **spans_out,
                                            s32_t *num_spans_out,
                                            float *max_powerX_out) {

  qb_err_t e;
  u32_t winlen;
  float omega0_1200, omega1_2400;
  float max_power0, max_power1, max_powerX, max_confidence;
  float max_span_tape_speed, min_span_tape_speed;
  u8_t generate_carriers[4] = { 0, 0, 0, 0 };
  qb_pll_carrier_t carriers[4];
  qb_wave_feature_t *feats;
  s64_t num_feats;
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
  qb_vec_buf_t freq_doubled_vec;
  qb_vec_buf_t goertz_vec[2];
#else
  float *freq_doubled;
#endif

#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
  memset(&freq_doubled_vec, 0, sizeof(qb_vec_buf_t));
  memset(goertz_vec, 0, 2 * sizeof(qb_vec_buf_t));
#endif

  float *goertz[2] = {NULL, NULL};
  
  e = QB_E_OK;

  memset(carriers, 0, 4 * sizeof(qb_pll_carrier_t));
  
  winlen = 0;
  
  // assume tape speed of 1.0
  // --
  qb_compute_window_lengths (1.0f, rate, &winlen);
  qb_compute_omegas (1.0f, rate, &omega0_1200, &omega1_2400);
  
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
  // no linear goertzel buffers for vector mode yet --
  // we'll unswizzle them later
  memset(&(goertz_vec[0]), 0, sizeof(qb_vec_buf_t));
  memset(&(goertz_vec[1]), 0, sizeof(qb_vec_buf_t));
#else
  goertz[0] = qb_malloc (sizeof(float) * srclen_smps);
  if (NULL == goertz[0]) {
    fprintf(QB_ERR, "E: Out of memory allocating for Goertzel oversamples.\n");
    return QB_E_MALLOC;
  }
  goertz[1] = qb_malloc (sizeof(float) * srclen_smps);
  if (NULL == goertz[1]) {
    fprintf(QB_ERR, "E: Out of memory allocating for Goertzel oversamples.\n");
    qb_free(goertz[0]);
    return QB_E_MALLOC;
  }
  freq_doubled = NULL;
#endif
  
  feats = NULL;
  num_feats = 0;
  
  do {
  
    s32_t sn;
    u8_t do_pll_phase;
    u8_t do_walk_phase;
    u8_t generate_features;
    //u8_t pre_inserts_inspect_file_written;
    u8_t w;
    u8_t need_freq_doubled;
    s64_t prev_final_smpnum;
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
    qb_vec_f_t two_cos_omega_vec_0;
    qb_vec_f_t two_cos_omega_vec_1;
    qb_vec_f_t scaler_for_2400_power_vec;
    u8_t v;
#ifdef QB_VECTORS_MSVC_AVX2
    float *two_cos_omega_scalar_0;
    float *two_cos_omega_scalar_1;
    float *scaler_for_2400_power_scalar;
#endif // MSVC_AVX2
#endif // vectors

    //pre_inserts_inspect_file_written = 0;
    do_pll_phase   = 0;
    do_walk_phase  = 0;
    generate_features = 0;
    need_freq_doubled = 0;
//    freq_doubled = NULL;

#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
#ifdef QB_VECTORS_MSVC_AVX2
    two_cos_omega_scalar_0       = (float *) &two_cos_omega_vec_0;
    two_cos_omega_scalar_1       = (float *) &two_cos_omega_vec_1;
    scaler_for_2400_power_scalar = (float *) &scaler_for_2400_power_vec;
    for (v=0; v < QB_VECSIZE; v++) {
      two_cos_omega_scalar_0[v] = 2.0f * cosf(omega0_1200);
      two_cos_omega_scalar_1[v] = 2.0f * cosf(omega1_2400);
      scaler_for_2400_power_scalar[v] = scaler_for_2400_power;
    }
#else
    for (v = 0; v < QB_VECSIZE; v++) {
      two_cos_omega_vec_0[v] = 2.0f * cosf(omega0_1200);
      two_cos_omega_vec_1[v] = 2.0f * cosf(omega1_2400);
      scaler_for_2400_power_vec[v] = scaler_for_2400_power;
    }
#endif
    
    e = qb_goertzel_oversample_vector_new (src_v, // slightly modified (some zeros are written to row 0, before zero_ix)
                                          winlen,
                                          &two_cos_omega_vec_0,
                                          &two_cos_omega_vec_1,
                                          &scaler_for_2400_power_vec,
                                          &max_power0,      // populated
                                          &max_power1,      // populated
                                          &max_powerX,      // populated
                                          &max_confidence,  // populated
                                          goertz_vec,       // populated
                                          display_progress);
    if (QB_E_OK != e) { break; }
#else // no vectors
    e = qb_goertzel_oversample  (src_f,
                                 srclen_smps,
                                 winlen,
                                 omega0_1200,     // note that this is done with assumed tape speed of 1.0
                                 omega1_2400,
                                 scaler_for_2400_power,
                                 &max_power0,     // populated
                                 &max_power1,     // populated
                                 &max_powerX,     // populated
                                 &max_confidence, // populated
                                 goertz,
                                 display_progress);
    if (QB_E_OK != e) { break; }
#endif // no vectors

// vector sanity check
/*
float *sanity0, *sanity1;
qb_vec_buf_unswizzle (&(goertz_vec[0]), &sanity0, NULL);
qb_vec_buf_unswizzle (&(goertz_vec[1]), &sanity1, NULL);
for (x=0; x < src_v->linear_len; x++) {
  if (sanity0[x] != goertz[0][x]) {
    printf("fail0 at x=%lld\n", x);
  }
  if (sanity1[x] != goertz[1][x]) {
    printf("fail1 at x=%lld\n", x);
  }
}
qb_free(sanity0);
qb_free(sanity1);
*/

    // unswizzle vectorised Goertzel now
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
    // _unswizzle calls will qb_malloc goertz[0], [1]
    e = qb_vec_buf_unswizzle (&(goertz_vec[0]), &(goertz[0]), /*NULL,*/ display_progress, "    ", " (1200 Hz)");
    e = qb_vec_buf_unswizzle (&(goertz_vec[1]), &(goertz[1]), /*NULL,*/ display_progress, "    ", " (2400 Hz)");
    // done with the vectorised copy now, so
    qb_vec_buf_finish(&(goertz_vec[0]));
    qb_vec_buf_finish(&(goertz_vec[1]));
    memset(goertz_vec, 0, 2 * sizeof(qb_vec_buf_t));
#endif

    if (inspect->enabled) {
      e = qb_inspect_append_span_goertzel_power (goertz,
                                                 srclen_smps,
                                                 max_powerX,
                                                 max_confidence,
                                                 inspect);
      if (QB_E_OK != e) {
        fprintf(QB_ERR, "W: Could not write oversampled Goertzel files, for some reason.\n");
      }
    }
    if (QB_E_OK != e) { break; }
    
    // alpha 2
    // GET SPANS
    e = qb_mark_spans (goertz,
                       max_powerX,
                       srclen_smps,
                       rate,
                       silence_on_frac,
                       signal_on_frac,
                       summed_silence_on_frac,
                       spans_out,
                       num_spans_out);
    if (QB_E_OK != e) { break; }
    
//if (QB_E_OK != sanity_check_spans(*spans_out, *num_spans_out)) { return QB_E_BUG; }
    
    // WORK OUT TAPE SPEEDS FOR EACH SPAN
    e = qb_spans_populate_tape_speeds (*spans_out,
                                       *num_spans_out,
                                       rate,
                                       src_f, // vector version still uses scalar src_f
                                       &max_span_tape_speed,
                                       &min_span_tape_speed,
                                       display_progress,
                                       verbose,
                                       inspect);
    if (QB_E_OK != e) { break; }
    
    /*
#ifdef QB_HAVE_ZLIB
    u8_t *ser_speed_buf;
    size_t ser_speed_buf_len;
    ser_speed_buf = NULL;
    ser_speed_buf_len = 0;
    e = qb_serialise_tape_speeds (*spans_out, *num_spans_out, &ser_speed_buf, &ser_speed_buf_len);
    if (QB_E_OK != e) { break; }
    printf("Serialised tape speeds: ");
    for (q=0; q < ser_speed_buf_len; q++) {
      printf("%02x", ser_speed_buf[q]);
    }
    printf("\n");
#endif
    */

    gauge_need_for_feats_and_carriers  (sync_method,
                                        phase_scheme,
                                        phase_detect_method,
                                        &do_walk_phase,      // out
                                        &do_pll_phase,       // out
                                        &generate_features,  // out
                                        generate_carriers,   // out
                                        &need_freq_doubled); // out
    
    if (generate_features) {
      e = qb_walk_identify_features (src_f,
                                     srclen_smps,
                                     *spans_out,
                                     *num_spans_out,
                                     &feats,
                                     &num_feats);
      if (QB_E_OK != e) { break; }
      if (inspect->enabled) {
        e = qb_inspect_write_features (inspect, feats, num_feats, srclen_smps);
#ifdef QB_EXPERIMENTAL_INSPECT_WRITE_FEATURES_TIMING
        if (QB_E_OK == e) {
          e = qb_inspect_write_features_timing (inspect, feats, num_feats, srclen_smps, rate);
        }
#endif
      }
    } // endif (generate features)
    
    if (QB_E_OK != e) { break; }
    
    if (need_freq_doubled) {
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
      e = make_squared_pre_carrier_vector (src_v,
                                           inspect,
                                           display_progress,
                                           &freq_doubled_vec);
#else
      e = make_squared_pre_carrier_scalar (src_f,
                                           srclen_smps,
                                           inspect,
                                           display_progress,
                                           &freq_doubled);
#endif
    }
//exit(0);
    if (QB_E_OK != e) { break; }
    
//printf("generate_carriers: { %u, %u, %u, %u }\n",
//       generate_carriers[0],
//       generate_carriers[1],
//       generate_carriers[2],
//       generate_carriers[3]);
    
    if (    generate_carriers[0]
         || generate_carriers[1]
         || generate_carriers[2]
         || generate_carriers[3]) {
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
      e = generate_carriers_vector (src_v,
                                    &freq_doubled_vec,
                                    generate_carriers,
                                    rate,
                                    carriers,
                                    display_progress,
                                    inspect);
#else
      e = generate_carriers_scalar (src_f,
                                   srclen_smps,
                                   freq_doubled,
                                   generate_carriers,
                                   rate,
                                   carriers,
                                   display_progress,
                                   inspect);
#endif
    }
    if (QB_E_OK != e) { break; }

    if (QB_SYNC_METHOD_GOERTZEL != sync_method) {
      e = determine_span_phases (srclen_smps,
                                 carriers,
                                 feats,
                                 num_feats,
                                 *num_spans_out,
                                 rate,
                                 do_pll_phase,
                                 do_walk_phase,
                                 phase_scheme,
                                 sync_method,
                                 phase_detect_method,
                                 display_progress,
                                 *spans_out);
      if (QB_E_OK != e) { break; }
    }
    
#ifdef QB_SANITY
    // make sure -s freq sets invalid phase ix on all spans
    if (QB_SYNC_METHOD_GOERTZEL == sync_method) {
      for (sn=0; sn < *num_spans_out; sn++) {
        qb_span_t *span;
        span = *spans_out + sn;
        if (span->detected_input_phase_ix != -1) {
          fprintf(QB_ERR, "B: %s: freq sync method, but span %u has a valid phase ix (%d, should be -1)\n",
                  QB_FUNC_M, sn, span->detected_input_phase_ix);
          e = QB_E_BUG;
          break;
        }
      }
      if (QB_E_OK != e) { break; }
    }
#endif
    
    e = qb_get_sync_for_data_spans (sync_method,
                                    srclen_smps,
                                    rate,
                                    *spans_out,
                                    *num_spans_out,
                                    carriers,
                                    inspect,
                                    display_progress,
                                    goertz,
                                    max_powerX,
                                    feats,
                                    num_feats,
                                    verbose,
                                    no_generic_odd_run_fix); // for frequency mode
    if (QB_E_OK != e) { break; }
    
    // *** FIXME ****
    // **************
    // CAN WE FREE PCM HERE? DON'T THINK IT'S USED AGAIN
    // **************
    
    if (NULL != feats) { // finished with these
      qb_free(feats);
      feats = NULL;
      num_feats = 0;
    }
    
    // interpolation for walk mode
    if (QB_SYNC_METHOD_WALK == sync_method) { // && do_interpolation) {
      e = qb_spans_walk_interp_absent_cycs (inspect, *spans_out, *num_spans_out, rate);
    }
    if (QB_E_OK != e) { break; }
    
    // data spans only
    e = qb_take_power_samples (
#ifdef QB_SANITY
                               srclen_smps,
#endif
                               *spans_out,
                               *num_spans_out,     // unmodified, just an input here
                               //do_delay_search,
                               have_forced_delay,
                               forced_delay,
                               goertz);
    if (QB_E_OK != e) { break; }
    
    if (QB_E_OK != sanity_check_spans(*spans_out, *num_spans_out, 1)) { e = QB_E_BUG; break; }

    e = qb_span_amalgamate_spans (*spans_out, num_spans_out);
    if (QB_E_OK != e) { break; }
    
    if (QB_E_OK != sanity_check_spans(*spans_out, *num_spans_out, 2)) { e = QB_E_BUG; break; }

    e = trim_data_spans_start (*spans_out, num_spans_out); // num_spans_out may change if spans are deleted
    if (QB_E_OK != e) { break; }
    
    if (QB_E_OK != sanity_check_spans(*spans_out, *num_spans_out, 3)) { e = QB_E_BUG; break; }
  
    // ERROR CORRECTION: two ways
    
    // - first we look for adjacent odd *pairs* of runs, where e.g. an incorrect 1-bit
    // at the end of the first run and an incorrect 0-bit at the start of the second
    // run should both be flipped.
    
    // - once that's (optionally) taken care of, we then (optionally) look for any
    // remaining *solo* odd runs, and hopefully fix those by squashing existing cycles and
    // thereby making room to insert a new one.
  
    if (QB_ERRCORR_NONE != error_correction_method) {
      e = qb_do_atomflip_correction_all_spans (*spans_out,
                                               *num_spans_out,
                                               error_correction_method,
                                               verbose);
      if (QB_E_OK != e) { break; }
    }
    
    if (QB_E_OK != sanity_check_spans (*spans_out, *num_spans_out, 4)) { e = QB_E_BUG; break; }
    
    // if (PLL) and (do odd-run correction) and (inspect),
    // write pre-insert cycles inspect file before doing correction
    if (    (QB_SYNC_METHOD_PLL == sync_method)
         && ( ! no_generic_odd_run_fix )
         && inspect->enabled ) {
         
      prev_final_smpnum = 0;
      
      for (sn = 0; sn < *num_spans_out; sn++) {
        e = qb_inspect_append_bitsamples (*spans_out + sn,
                                          (sn < (*num_spans_out - 1)) ? (*spans_out + sn + 1) : NULL,
                                          &prev_final_smpnum,
                                          sn,
                                          inspect->files + QB_INSPECT_FILE_IX_PLL_CYCLE_TIMES_PRE_INSERT);
        if (QB_E_OK != e) { break; }
      }
      
    }
    
u8_t do_odd_run_fix;
u8_t num_passes;

    do_odd_run_fix = (QB_SYNC_METHOD_PLL == sync_method) && ! no_generic_odd_run_fix;
    num_passes = (do_odd_run_fix ? 2 : 1);

    // odd run reporting, and (for PLL) possible correction
    for (w=0; w < num_passes; w++) { // first pass reports, optional second pass fixes
      s64_t num_data_zeroval_odd_runs;
      s64_t num_data_oneval_odd_runs;
      s64_t num_squawk_zeroval_odd_runs;
      s64_t num_squawk_oneval_odd_runs;
      num_data_zeroval_odd_runs   = 0;
      num_data_oneval_odd_runs   = 0;
      num_squawk_zeroval_odd_runs = 0;
      num_squawk_oneval_odd_runs = 0;
      e = qb_spans_count_and_maybe_fix_odd_runs_by_atom_insert (*spans_out,
                                                                *num_spans_out,
                                                                srclen_smps,
                                                                rate,
                                                                (w==1), // fix them? only on second pass
                                                                &num_data_zeroval_odd_runs,
                                                                &num_data_oneval_odd_runs,
                                                                &num_squawk_zeroval_odd_runs,
                                                                &num_squawk_oneval_odd_runs,
                                                                verbose);
      if (QB_E_OK != e) { break; }
      if (0==w) {
        /*
        printf("    Odd runs");
        if ( do_odd_run_fix ) {
          printf(" (before fix)");
        }
        printf(": data[0-val %lld, 1-val %lld] squawk[0-val %lld, 1-val %lld]\n",
               num_data_zeroval_odd_runs, num_data_oneval_odd_runs,
               num_squawk_zeroval_odd_runs, num_squawk_oneval_odd_runs); */
        print_odd_run_line (num_data_zeroval_odd_runs,
                            num_data_oneval_odd_runs,
                            num_squawk_zeroval_odd_runs,
                            num_squawk_oneval_odd_runs,
                            do_odd_run_fix,
                            0);
      }
      
      // now run this again (in PLL mode, when correcting them only)
      // so we get a printout of how well we did
      // (remember squawk spans will NOT have been fixed)
      if (1 == w) {
        num_data_zeroval_odd_runs   = 0;
        num_data_oneval_odd_runs    = 0;
        num_squawk_zeroval_odd_runs = 0;
        num_squawk_oneval_odd_runs  = 0;
        e = qb_spans_count_and_maybe_fix_odd_runs_by_atom_insert (*spans_out,
                                                                  *num_spans_out,
                                                                  srclen_smps,
                                                                  rate,
                                                                  0, // don't fix, just report
                                                                  &num_data_zeroval_odd_runs,
                                                                  &num_data_oneval_odd_runs,
                                                                  &num_squawk_zeroval_odd_runs,
                                                                  &num_squawk_oneval_odd_runs,
                                                                  0); // verbose = 0 (it's only relevant when fixing anyway)
        if (QB_E_OK == e) {
          /*
          printf("    Odd runs");
          if ( do_odd_run_fix ) {
            printf(" (after fix) ");
          }
          printf(": data: 0-val %lld, 1-val %lld; squawk: 0-val %lld, 1-val %lld.\n",
                 num_data_zeroval_odd_runs, num_data_oneval_odd_runs,
                 num_squawk_zeroval_odd_runs, num_squawk_oneval_odd_runs); */
          print_odd_run_line (num_data_zeroval_odd_runs,
                              num_data_oneval_odd_runs,
                              num_squawk_zeroval_odd_runs,
                              num_squawk_oneval_odd_runs,
                              do_odd_run_fix,
                              1);
        }
      }
      
    } // next pass (maybe)
    
    if (QB_E_OK != e) { break; }

    if (QB_E_OK != sanity_check_spans(*spans_out, *num_spans_out, 5)) { e = QB_E_BUG; break; }
    
    e = qb_fill_missing_atoms_on_leader_spans_and_adjust_limits (*spans_out,
                                                                *num_spans_out,
                                                                rate,
                                                                srclen_smps);
    if (QB_E_OK != e) { break; }
    
    if (QB_E_OK != sanity_check_spans(*spans_out, *num_spans_out, 6)) { e = QB_E_BUG; break; }

    if (inspect->enabled) {
      e = qb_inspect_append_span_types (*spans_out,
                                        srclen_smps,
                                        *num_spans_out,
                                        inspect);
      if (QB_E_OK != e) { break; } // inspect errors are now fatal
    }
    
    e = qb_create_two_atoms_on_silent_spans (*spans_out, *num_spans_out, verbose);
    if (QB_E_OK != e) { break; }
    
    e = qb_span_amalgamate_spans (*spans_out, num_spans_out);
    if (QB_E_OK != e) { break; }
    
    if (QB_E_OK != sanity_check_spans(*spans_out, *num_spans_out, 7)) { e = QB_E_BUG; break; }

  } while (0);
  
#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
  qb_vec_buf_finish (&(goertz_vec[0]));
  qb_vec_buf_finish (&(goertz_vec[1]));
  qb_vec_buf_finish (&freq_doubled_vec);
#else
  if (NULL != freq_doubled) {
    qb_free(freq_doubled);
  }
#endif

  free_pll_carriers(carriers);
  
  if (NULL != feats) { // if needed
    qb_free(feats);
    feats = NULL;
    num_feats = 0;
  }

  
  if (NULL != goertz[0]) {
    qb_free(goertz[0]);
  }
  if (NULL != goertz[1]) {
    qb_free(goertz[1]);
  }
  
  *max_powerX_out = max_powerX;
  
  return e;
  
}



static void print_odd_run_line (s64_t data0,
                                s64_t data1,
                                s64_t squawk0,
                                s64_t squawk1,
                                u8_t have_fix,
                                u8_t after_fix) {
  printf("    Odd runs");
  if ( have_fix ) {
    if ( after_fix ) {
      printf(" (after fix) ");
    } else {
      printf(" (before fix)");
    }
  }
  //printf(": data(0/1: %lld/%lld); squawk(0/1: %lld/%lld).\n",
  //       data0, data1, squawk0, squawk1);
  printf(": (data 0/1): %lld/%lld; (squawk 0/1): %lld/%lld.\n",
         data0, data1, squawk0, squawk1);
}

//#define QB_FREQ_DBL_INSPECT_SCALER 0.3f// 0.2f
#define QB_FREQ_DBL_INSPECT_SCALER 0.8f

#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
static qb_err_t make_squared_pre_carrier_vector (qb_vec_buf_t *src_v,
                                                 qb_inspect_t *inspect,
                                                 u8_t dp, // display_progress
                                                 qb_vec_buf_t *squared_out) { // out
                                               
  // need any carriers? do freq doubling (once)
  // prep the squared version
  
  qb_err_t e;
  s16_t *dbg_doubled;
#ifdef QB_VECTORS_GCC_CLANG
  u8_t v;
#endif
  s64_t x;
  qb_vec_f_t twos; // ones;
  qb_vec_f_t halves;
  
  dbg_doubled = NULL;
  
  e = QB_E_OK;
  
  // duplicate src_v metadata onto squared, then just assign a fresh buffer
  *squared_out = *src_v;
  squared_out->v.f = qb_malloc (sizeof(qb_vec_f_t) * squared_out->alloc);
  
  //memset(&ones, 0, sizeof(qb_vec_f_t)); // compilers are
  memset(&halves, 0, sizeof(qb_vec_f_t)); // compilers are
  memset(&twos, 0, sizeof(qb_vec_f_t)); // genetically defective
  
#ifdef QB_VECTORS_GCC_CLANG
  for (v=0; v < QB_VECSIZE; v++) {
    //ones[v] = 1.0f;
    halves[v] = 0.5f;
    twos[v] = 2.0f;
  }
#else // MSVC_AVX2
  //ones = _mm256_set1_ps(1.0f);
  halves = _mm256_set1_ps(0.5f);
  twos = _mm256_set1_ps(2.0f);
#endif
  
  //do {

  printf("    Frequency doubling: ");
  qb_show_meter(dp);
  
  for (x = 0; x < src_v->alloc; x++) {
    qb_vec_f_t ps, dbl;
    ps = src_v->v.f[x];
#ifdef QB_VECTORS_GCC_CLANG
    //dbl = (((ps * ps) * twos) - halves);
    dbl = ((ps * ps) * twos);
    squared_out->v.f[x] = dbl;
#else // MSVC_AVX2
    //dbl = _mm256_fmsub_ps(_mm256_mul_ps(ps, ps), twos, ones);
    dbl = _mm256_mul_ps(_mm256_mul_ps(ps, ps), twos); //, ones);
    //dbl = _mm256_fmsub_ps(_mm256_mul_ps(ps, ps), twos), halves);
    squared_out->v.f[x] = dbl;
#endif
    qb_update_meter (dp, x, src_v->alloc, 1.0f, 0);
  }
  qb_hide_meter(dp, 0);
  printf("done.\n");
  //} while (0);
    
  if (inspect->enabled) {
    e = qb_vec_buf_unswizzle_to_s16 (squared_out, &dbg_doubled, QB_FREQ_DBL_INSPECT_SCALER);
    if (QB_E_OK == e) {
      e = qb_inspect_append_s16_buf (inspect->files + QB_INSPECT_FILE_IX_PLL_DOUBLED,
                                     dbg_doubled,
                                     squared_out->linear_len);
      if (QB_E_OK != e) {
        fprintf(QB_ERR, "W: Error writing PLL freq-doubled inspection file.\n");
      }
    }
    qb_free(dbg_doubled);
  }
  
  if (QB_E_OK != e) {
    qb_vec_buf_finish(squared_out);
  }
  
  return e;
  
}

#else // ndef VECTORS_NEW

static qb_err_t make_squared_pre_carrier_scalar (float *src_f,
                                                 s64_t srclen_smps,
                                                 qb_inspect_t *inspect,
                                                 u8_t dp,
                                                 float **squared_out) { // out
                                               

  // need any carriers? do freq doubling (once)
  // prep the squared version
  
  qb_err_t e;
  s16_t *dbg_doubled;
  
  dbg_doubled = NULL;
  
  e = QB_E_OK;
  
  (*squared_out) = qb_malloc (sizeof(float) * srclen_smps);
  if (NULL == (*squared_out)) {
    fprintf(QB_ERR, "E: Out of memory allocating squared input buffer.\n");
    return QB_E_MALLOC;
  }
  
  do {
    s64_t x;
    if (inspect->enabled) {
      dbg_doubled = qb_malloc(sizeof(s16_t) * srclen_smps);
      if (NULL == dbg_doubled) {
        fprintf(QB_ERR, "E: qb_malloc(%lld) for dbg_doubled failed\n", srclen_smps);
        e = QB_E_MALLOC;
        break;
      }
    }
    printf("    Frequency doubling: ");
    qb_show_meter(dp);
    for (x=0; x < srclen_smps; x++) {
      float ps, dbl;
      ps = src_f[x];
      //dbl = (ps * ps) * 2.0f; //) - 1.0f);
      dbl = (((ps * ps) * 2.0f)); // - 0.5f);
      if (inspect->enabled) { // doubled is same for all phases, only do it once
        dbg_doubled[x] = qb_float_to_s16(dbl * QB_FREQ_DBL_INSPECT_SCALER);
      }
      (*squared_out)[x] = dbl;
      qb_update_meter (dp, x, srclen_smps, 1.0f, 0);
    }
    qb_hide_meter (dp, 0);
    printf("done.\n");
    if (inspect->enabled) {
      e = qb_inspect_append_s16_buf (inspect->files + QB_INSPECT_FILE_IX_PLL_DOUBLED,
                                     dbg_doubled,
                                     srclen_smps);
      qb_free(dbg_doubled);
      dbg_doubled = NULL;
      if (QB_E_OK != e) {
        fprintf(QB_ERR, "E: Error writing PLL freq-doubled inspection file.\n");
        break;
      }
    }
  } while (0);
  
  if (QB_E_OK != e) {
    // catch
    qb_free(*squared_out);
    *squared_out = NULL;
  }
  
  return e;
  
}
#endif // ndef VECTORS_NEW



static void gauge_need_for_feats_and_carriers (u8_t sync_method,
                                               u8_t phase_scheme,
                                               u8_t phase_detect_method,
                                               u8_t *do_walk_phase_out,
                                               u8_t *do_pll_phase_out,
                                               u8_t *generate_features_out,
                                               u8_t generate_carriers_out[4],
                                               u8_t *need_freq_doubled_out) { // if at least one carrier is needed
                                               
  // logic gets a bit fiddly here ...
  // - if we're using either of the auto phase schemes (block or fixed), and
  //   phase is to be determined by PLL, we'll
  //   need to compute all four carriers.
  // - if we're not using PLL sync, we can go ahead
  //   and free all carriers after phase is determined.
  // - if we ARE using PLL sync, but with a fixed phase,
  //   then we'll need to either keep or generate the carrier
  //   for the phase needed.
  
  *do_walk_phase_out = 0;
  *do_pll_phase_out = 0;
  *generate_features_out = 0;
  *need_freq_doubled_out = 0;
  
  generate_carriers_out[0] = 0;
  generate_carriers_out[1] = 0;
  generate_carriers_out[2] = 0;
  generate_carriers_out[3] = 0;
  
  if ( (    (QB_PHASE_SCHEME_BLOCK_AUTO == phase_scheme)
         || (QB_PHASE_SCHEME_FIXED_AUTO == phase_scheme) ) ) {
            
    if (QB_PHASE_DETECT_METHOD_PLL == phase_detect_method) {
      *do_pll_phase_out = 1;
      generate_carriers_out[0] = 1;
      generate_carriers_out[1] = 1;
      generate_carriers_out[2] = 1;
      generate_carriers_out[3] = 1;
    } else if (QB_PHASE_DETECT_METHOD_WALK == phase_detect_method) {
      *do_walk_phase_out     = 1;
      *generate_features_out = 1;
    }
    
  }
  
  if (QB_SYNC_METHOD_PLL == sync_method) {
    if (QB_PHASE_SCHEME_FIXED_0 == phase_scheme) {
      generate_carriers_out[0] = 1;
    } else if (QB_PHASE_SCHEME_FIXED_90 == phase_scheme) {
      generate_carriers_out[1] = 1;
    } else if (QB_PHASE_SCHEME_FIXED_180 == phase_scheme) {
      generate_carriers_out[2] = 1;
    } else if (QB_PHASE_SCHEME_FIXED_270 == phase_scheme) {
      generate_carriers_out[3] = 1;
    } else {
      generate_carriers_out[0] = 1;
      generate_carriers_out[1] = 1;
      generate_carriers_out[2] = 1;
      generate_carriers_out[3] = 1;
    }
  }
  
  if (    generate_carriers_out[0]
       || generate_carriers_out[1]
       || generate_carriers_out[2]
       || generate_carriers_out[3]) {
    *need_freq_doubled_out = 1;
  }
  
  if (QB_SYNC_METHOD_WALK == sync_method) {
    *generate_features_out = 1;
  }
}



#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2
static qb_err_t generate_carriers_vector (qb_vec_buf_t *src_v,
                                          qb_vec_buf_t *src_freq_doubled_vec,
                                          u8_t generate_carriers[4],
                                          s32_t rate,
                                          qb_pll_carrier_t carriers_out[4],
                                          u8_t display_progress,
                                          qb_inspect_t *inspect) {

  u8_t phase_ix;
  u8_t rate_ix;
  qb_err_t e;
  
  e = QB_E_OK;
  
  rate_ix = qb_sample_rate_ix_from_rate (rate);
  
  for (phase_ix = 0; phase_ix < 4; phase_ix++) {

    qb_fir_vec_t fir_bp_2k4_vec;
  
    if ( generate_carriers [phase_ix] ) {

      e = qb_fir_vec_init (&fir_bp_2k4_vec, QB_FIR_TYPE_BP_2K4, rate_ix);
      if (QB_E_OK != e) { break; }
      e = qb_generate_pll_carrier_vector (src_v,
                                          src_freq_doubled_vec,
                                          //fir_bp_2k4_vec.delay,   // + delay_hack,
                                          get_carrier_shift_for_phase_ix_and_rate (phase_ix, rate),  // applied to the doubled signal before it is mixed with the raw
                                          phase_ix,  // used by inspect stuff
                                          &(carriers_out[phase_ix].signal),
                                          &fir_bp_2k4_vec,
                                          display_progress,
                                          inspect);
      if (QB_E_OK != e) { break; }
      
    } // endif (generate carrier for this phase)
  } // next phase
  
  if (QB_E_OK != e) {
    free_pll_carriers (carriers_out);
  }
  
//  printf ("carriers: %p, %p, %p, %p, e=%d\n",
//          carriers_out[0].signal,
//          carriers_out[1].signal,
//          carriers_out[2].signal,
//          carriers_out[3].signal,
//          e);
  
  return e;
  
}

#else // scalar

static qb_err_t generate_carriers_scalar (float *src_f,
                                          s64_t srclen_smps,
                                          float *src_freq_doubled,
                                          u8_t generate_carriers[4],
                                          s32_t rate,
                                          qb_pll_carrier_t carriers_out[4],
                                          u8_t display_progress,
                                          qb_inspect_t *inspect) {

  u8_t phase_ix;
  qb_fir_t fir_bp_2k4;
  u8_t rate_ix;
  qb_err_t e;
  
  e = QB_E_OK;
  
  rate_ix = qb_sample_rate_ix_from_rate (rate);
  
  for (phase_ix = 0; phase_ix < 4; phase_ix++) {
  
    if ( generate_carriers [phase_ix] ) {
      
      e = qb_fir_init (&fir_bp_2k4, QB_FIR_TYPE_BP_2K4, rate_ix);
      if (QB_E_OK != e) { break; }
      
      carriers_out[phase_ix].signal = qb_malloc(sizeof(float) * srclen_smps); // ( + delay ??)
      if (NULL == carriers_out[phase_ix].signal) {
        fprintf(QB_ERR, "E: Out of memory allocating PLL carrier buffer (phase %u).\n",
                qb_get_phase_for_phase_ix(phase_ix));
        e = QB_E_MALLOC;
        break;
      }
      
      e = qb_generate_pll_carrier_scalar (src_f,
                                          src_freq_doubled,
                                          srclen_smps,
                                          fir_bp_2k4.delay,   // + delay_hack,
                                          get_carrier_shift_for_phase_ix_and_rate (phase_ix, rate),  // applied to the doubled signal before it is mixed with the raw
                                          phase_ix, // used by inspect stuff
                                          //0.0001f,      // 0.0 means ultra clip, 1.0 means no clip
                                          carriers_out[phase_ix].signal,
                                          &fir_bp_2k4,
                                          display_progress,
                                          inspect);
      if (QB_E_OK != e) { break; }
      
    } // endif (generate carrier for this phase)
  } // next phase
  
  if (QB_E_OK != e) {
    free_pll_carriers (carriers_out);
  }
  
  return e;
  
}
#endif // not vectorised


// 1.9.7
static qb_err_t determine_span_phases (s64_t srclen_smps,
                                       qb_pll_carrier_t carriers[4],
                                       qb_wave_feature_t *feats,
                                       s64_t num_feats,
                                       s32_t num_spans,
                                       s32_t rate,
                                       u8_t do_pll_phase,
                                       u8_t do_walk_phase,
                                       u8_t phase_scheme,
                                       u8_t sync_method,
                                       u8_t phase_detect_method,
                                       u8_t display_progress,
                                       qb_span_t *spans_inout) { // modified
                                       
  qb_err_t e;
  
  e = QB_E_OK;
  
//printf("%s\n", QB_FUNC_M);

//  printf ("carriers: %p, %p, %p, %p\n",
//          carriers[0].signal,
//          carriers[1].signal,
//          carriers[2].signal,
//          carriers[3].signal);

  do {
  
    s8_t current_phase_ix;
    s32_t sn;
    u8_t w;
    u8_t fixed_phase;
    s32_t phase_scores[4];
    s32_t best_phase_score;
    
#ifdef QB_SANITY
    for (sn=0; sn < num_spans; sn++) {
      if (spans_inout[sn].detected_input_phase_ix != -1) {
        fprintf(QB_ERR, "B: %s: span %u starts out with valid phase ix (%u, should be -1)\n",
                QB_FUNC_M, sn, spans_inout[sn].detected_input_phase_ix);
        return QB_E_BUG;
      }
    }
#endif

    // write an invalid phase to all spans to start with
    //for (sn=0; sn < num_spans; sn++) {
    //  spans_inout[sn].detected_input_phase_ix = 0xff; // start with an invalid value
    //}
    
    // program in fixed phases ...
    if (    (QB_PHASE_SCHEME_FIXED_0   == phase_scheme)
         || (QB_PHASE_SCHEME_FIXED_90  == phase_scheme)
         || (QB_PHASE_SCHEME_FIXED_180 == phase_scheme)
         || (QB_PHASE_SCHEME_FIXED_270 == phase_scheme) ) {
      for (sn=0; sn < num_spans; sn++) {
        if (QB_PHASE_SCHEME_FIXED_0 == phase_scheme) {
          spans_inout[sn].detected_input_phase_ix = 0; // 0
        } else if (QB_PHASE_SCHEME_FIXED_90 == phase_scheme) {
          spans_inout[sn].detected_input_phase_ix = 1; // 90
        } else if (QB_PHASE_SCHEME_FIXED_180 == phase_scheme) {
          spans_inout[sn].detected_input_phase_ix = 2; // 180
        } else {
          spans_inout[sn].detected_input_phase_ix = 3; // 270
        }
      }
    }
    
//printf("do_pll_phase = %u, do_walk_phase = %u\n", do_pll_phase, do_walk_phase);

    // ... or fetch variable phases
    if (do_pll_phase) {
      e = qb_get_data_span_phases_by_pll (srclen_smps,
                                          spans_inout,
                                          num_spans,
                                          rate,
                                          display_progress,
                                          carriers);
    } else if (do_walk_phase) {
      qb_get_data_span_phases_by_feats (spans_inout,
                                        num_spans,
                                        feats,
                                        num_feats,
                                        (float) rate,
                                        display_progress);
    }
    
    if (QB_E_OK != e) { break; }
    
    if (QB_SYNC_METHOD_PLL != sync_method) {
      // we don't do any more work with the PLL,
      // only used to determine polarity,
      // so we can free the carriers early
      free_pll_carriers(carriers);
    }

    fixed_phase = 0;

    for (w=0; w < 4; w++) {
      phase_scores[w] = 0;
    }
    
    if (QB_PHASE_SCHEME_FIXED_AUTO == phase_scheme) {
      // go with the majority verdict
      for (sn=0; sn < num_spans; sn++) {
        if (qb_span_has_viable_phase (spans_inout+sn)) { // consider long data spans only
          (phase_scores[spans_inout[sn].detected_input_phase_ix])++;
        }
      }
      best_phase_score = 0;
      for (w=0; w<4; w++) {
        if (phase_scores[w] > best_phase_score) {
          best_phase_score = phase_scores[w];
          fixed_phase = w;
        }
      }
    }
    
    // for per-block phase mode, only data spans will have
    // had their phases set at this point.
    // since each leader has to have the same phase as
    // the following data span in order for the lead-in to
    // work correctly, we'll now work backwards through the spans
    // and copy phase from each data span to any unpopulated
    // spans that precede it.
    
    // we need a good initial value though, to use on the final
    // leader span (which comes after the final data span), so
    // we'll have to find that first
    
    current_phase_ix = 0;
    
    for (sn = (num_spans - 1); sn >= 0; sn--) {
      qb_span_t *span = spans_inout + sn;
      // find the last span that has an assigned phase
      if (-1 != span->detected_input_phase_ix) {
        // got it
        // this will be used as our initial value
        current_phase_ix = span->detected_input_phase_ix;
        break;
      }
    }
    
    // now walk the spans backwards and copy valid phase
    // values from data spans onto invalid preceding leader and silent spans
    // (and squawks)
    for (sn = (num_spans - 1); sn >= 0; sn--) {
      s8_t *p;
      p = &(spans_inout[sn].detected_input_phase_ix);
      if (-1 != *p) { // existing phase is valid, make a note of it
        current_phase_ix = *p;
      } else { // existing polarity is invalid, copy working value onto span
        *p = current_phase_ix;
      }
    }
    
#define QB_PHASES_PER_LINE 70

    if (    (QB_PHASE_SCHEME_BLOCK_AUTO == phase_scheme)
         || (QB_PHASE_SCHEME_FIXED_AUTO == phase_scheme) ) {
      printf("    Phases (%s):\n      ",
             (QB_PHASE_DETECT_METHOD_PLL == phase_detect_method) ? "PLL " : "walk");
      for (sn=0; sn < num_spans; sn++) {
        s8_t p;
        if ((sn>0) && (0==(sn % QB_PHASES_PER_LINE))) {
          printf("\n      ");
        }
        if (QB_SPAN_TYPE_SILENT == spans_inout[sn].type) {
          printf(" ");
        } else if (QB_SPAN_TYPE_LEADER == spans_inout[sn].type) {
          printf("_");
        } else {
          p = spans_inout[sn].detected_input_phase_ix;
#ifdef QB_SANITY
          if ((p>3) || (p<0)) {
            fprintf(QB_ERR, "B: %s: illegal phase_ix on span %d (%u)\n",
                    QB_FUNC_M, sn, p);
            e = QB_E_BUG;
            break;
          }
#endif
          printf("%c", qb_get_char_for_phase_ix(p));
        } // endif (not silent span)
      } // next span
      printf("\n");
    }
    
    if (QB_PHASE_SCHEME_FIXED_AUTO == phase_scheme) {
      for (sn=0; sn < num_spans; sn++) {
        spans_inout[sn].detected_input_phase_ix = fixed_phase;
      }
      printf("    Using majority span phase verdict: %u (%c)\n",
             qb_get_phase_for_phase_ix(fixed_phase),
             qb_get_char_for_phase_ix(fixed_phase));
    }
    
  } while (0);
  
//printf("%s return0r\n", QB_FUNC_M);
  
  return e;
  
}


static void free_pll_carriers (qb_pll_carrier_t carriers[4]) {
  u8_t i;
  for (i=0; i < 4; i++) {
    if (NULL != carriers[i].signal) { qb_free(carriers[i].signal); }
  }
  memset(carriers, 0, 4 * sizeof(qb_pll_carrier_t));
}


#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2

#include "fir_vec2.h"

static qb_err_t qb_run_prefilter_vector  (s32_t sample_rate,
                                          qb_vec_buf_t *vbuf,   // data modified (filtered)
                                          qb_inspect_t *inspect,
                                          u8_t display_progress) {

  qb_fir_vec_t fir_v;
  u8_t rate_type;
  qb_err_t e, e_dbg;
  
  s16_t *dbg_walk_filtered;
  
  dbg_walk_filtered = NULL;
  
  rate_type = qb_sample_rate_ix_from_rate (sample_rate);
  
  if (QB_FIR_RATE_UNK == rate_type) {
    fprintf(QB_ERR, "B: %s: illegal sample rate %u.\n",
            QB_FUNC_M,
            sample_rate);
    return QB_E_BAD_SAMPLE_RATE;
  }
  
  e = qb_fir_vec_init (&fir_v,
                       QB_FIR_TYPE_BP_0K4_3K2,
                       qb_sample_rate_ix_from_rate (sample_rate));
  if (QB_E_OK != e) { return e; }
  
  printf("  Filtering: ");
  fflush(stdout);
  
  qb_fir_vec_run (vbuf, &fir_v, display_progress);

  if (inspect->enabled) {
    // as for scalar, scale by 0.5 to avoid possible clamping if filter output overloads:
    e = qb_vec_buf_unswizzle_to_s16 (vbuf, &dbg_walk_filtered, 0.5f);
    if (QB_E_OK != e) { return e; }
    e_dbg = qb_inspect_append_s16_buf (inspect->files + QB_INSPECT_FILE_IX_FILTERED,
                                       dbg_walk_filtered,
                                       vbuf->linear_len);
    if (QB_E_OK != e_dbg) {
      fprintf(QB_ERR, "E: Error writing walk mode filtered inspection file.\n");
    }
    qb_free(dbg_walk_filtered);
    dbg_walk_filtered = NULL;
    e = e_dbg;
  }

  return e;
  
}


static void normalise_vector (qb_vec_buf_t *vbuf, u8_t dp) { //display_progress
  s64_t i;
  float max;
  max = 0.0f;
  printf("    Normalise: ");
  qb_show_meter(dp);
  for (i = vbuf->zero_ix; i < (vbuf->zero_ix + vbuf->piece_len); i++) {
    u8_t j;
    for (j=0; j < QB_VECSIZE; j++) {
      float f;
#ifdef QB_VECTORS_GCC_CLANG
      f = fabsf(vbuf->v.f[i][j]);
#else // MSVC_AVX2  
      f = fabsf(((float *) &(vbuf->v.f[i]))[j]);
#endif
      if (f > max) {
        max = f;
      }
    }
    qb_update_meter (dp, i - vbuf->zero_ix, i - vbuf->zero_ix + vbuf->piece_len, 0.5f, 0);
  }
  for (i = 0; i < vbuf->alloc; i++) {
    u8_t j;
    for (j=0; j < QB_VECSIZE; j++) {
#ifdef QB_VECTORS_GCC_CLANG
      vbuf->v.f[i][j] /= max;
#else // MSVC_AVX2
      ((float *) &(vbuf->v.f[i]))[j] /= max;
#endif
    }
    qb_update_meter (dp, i, vbuf->alloc, 0.5f, 50);
  }
  qb_hide_meter (dp, 0);
  printf("done. Maximum amplitude was %.3lf.\n", max);
  if (max > 0.999f) {
    fprintf(QB_ERR, "    W: Input audio may be clipped.\n");
  }
}

#else // not vectorised

static void normalise_scalar (float *buf_inout, s64_t len, u8_t dp) {
  s64_t i;
  float max;
  max = 0.0f;
  printf("    Normalise: ");
  fflush(stdout);
  qb_show_meter(dp);
  for (i=0; i < len; i++) {
    float g;
    if (dp) {
      qb_update_meter (dp, i, len, 0.5f, 0);
    }
    g = fabsf(buf_inout[i]);
    if (g > max) { max = g; }
  }
  for (i=0; i < len; i++) {
    buf_inout[i] /= max;
    qb_update_meter (dp, i, len, 0.5f, 50);
  }
  qb_hide_meter (dp, 0);
  printf("done. Maximum amplitude was %.3f.\n", max);
  if (max > 0.999f) {
    fprintf(QB_ERR, "    W: Input audio may be clipped.\n");
  }
}


static qb_err_t qb_run_prefilter_scalar (s32_t sample_rate,
                                         float *src_f,         // data modified (filtered)
                                         s64_t *srclen_inout,  // modified (shortened by filter delay)
                                         qb_inspect_t *inspect,
                                         u8_t dp) { // display_progress

  qb_fir_t fir;
  u8_t rate_type;
  qb_err_t e, e_dbg;
  s64_t n;
  
  s16_t *dbg_walk_filtered;
  
  dbg_walk_filtered = NULL;
  
  rate_type = qb_sample_rate_ix_from_rate (sample_rate);
  
  if (QB_FIR_RATE_UNK == rate_type) {
    fprintf(QB_ERR, "B: %s: illegal sample rate %u.\n",
            QB_FUNC_M,
            sample_rate);
    return QB_E_BAD_SAMPLE_RATE;
  }
  
  e = qb_fir_init (&fir,
                   QB_FIR_TYPE_BP_0K4_3K2,
                   qb_sample_rate_ix_from_rate (sample_rate));
  if (QB_E_OK != e) { return e; }

  if (inspect->enabled) {
    dbg_walk_filtered = qb_malloc(sizeof(s16_t) * *srclen_inout);
    if (NULL == dbg_walk_filtered) {
      fprintf(QB_ERR, "E: Failed to qb_malloc(%lld) for walk-filtered inspection file.\n",
              *srclen_inout);
      return QB_E_MALLOC;
    }
  }
  
  printf("  Filtering: ");
  fflush(stdout);
  qb_show_meter(dp);
  
  // front-load the filter
  for (n=0; n < fir.delay; n++) {
    qb_fir_in (&fir, src_f[n]);
  }

  for (n = fir.delay; n < *srclen_inout; n++) {
    // move it backwards in the buffer as we go
    // by filter_delay_smps
    qb_fir_in (&fir, src_f[n]);
    src_f[n - fir.delay] = qb_fir_out(&fir);
    if (inspect->enabled) {
      // halve here to avoid qb_double_to_s16() clamping; filter output
      // is sometimes excessive
      dbg_walk_filtered[n - fir.delay] = qb_float_to_s16 (0.5f * src_f[n - fir.delay]);
    }
    qb_update_meter (dp, n, *srclen_inout, 1.0f, 0);
  }
  qb_hide_meter(dp, 0);
  printf("done.\n");

  // compensate for filter delay
  (*srclen_inout) -= fir.delay;
      
  if (inspect->enabled) {
    e_dbg = qb_inspect_append_s16_buf (inspect->files + QB_INSPECT_FILE_IX_FILTERED,
                                       dbg_walk_filtered,
                                       *srclen_inout);
    if (QB_E_OK != e_dbg) {
      fprintf(QB_ERR, "E: Error writing walk mode filtered inspection file.\n");
    }
    qb_free(dbg_walk_filtered);
    dbg_walk_filtered = NULL;
    e = e_dbg;
  }

  return e;
  
}

#endif // not vectorised

