/*
 *  Quadbike 2
 *  Copyright (C) 2025 'Diminished'

 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.

 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.

 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/

#include "sync_pll.h"
#include "pll.h"
#include "util.h"
#include "qbio.h"
#include "span.h"
#include "inspect.h"


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

// Sets span->transient_pll_lock_dropout if any of the span's atoms has a
// pll_lock value below QB_LOCK_THRESH_TRANSIENT.
static void span_mark_transient_pll_lock_dropouts (qb_span_t *span); //, s32_t span_ix);

// Returns the PLL phase value at which a cycle is triggered for the given
// phase index (0..3 => 0, 90, 180, 270 degrees). For the 180 degree case
// *wrap_out is set to 1 and NAN is returned: the trigger is then the PLL
// phase wrapping round, rather than crossing a threshold.
static float get_pll_phase_trigger_value_for_phase_ix (u8_t phase_ix, u8_t *wrap_out);

// Runs a PLL over one span of the supplied carrier (optionally in reverse).
// Judging by its call sites in this file, it collects the span's cycles (atoms)
// when cycles_out / num_cycles_out are non-NULL, and reports a lock figure
// through lock_average_per_sample_out.
// NOTE(review): the definition is further down the file; confirm details there.
static qb_err_t pll_span_get_cycs (qb_pll_carrier_t *carrier,
                                   s64_t input_len,
                                   s64_t span_len,    // span length
                                   s64_t span_start_smps,
                                   u8_t reversed,
                                   s32_t sample_rate,
                                   qb_pll_t *pll,
                                   u8_t phase_ix,
                                   qb_atom_t **cycles_out, // or NULL
                                   s32_t *num_cycles_out,   // or NULL
                                   qb_inspect_t *inspect,
                                   float *lock_average_per_sample_out,
                                   u8_t display_progress);

/*
 * Pick the best of the four candidate carrier phases (0, 90, 180, 270 degrees)
 * for every data span.
 *
 * For each span, each of the four carriers is fed through its own PLL, and the
 * PLL's pll_lock value is summed over the span's samples. For data spans, the
 * phase index with the largest sum is written to span->detected_input_phase_ix.
 * The four PLLs are (re)initialised on the first span and at the start of every
 * leader span; on all other spans they carry their state over from the
 * previous span.
 *
 *   srclen    - bound on carrier sample indices; samples outside [0, srclen)
 *               are fed to the PLL as zero
 *   spans     - span list
 *   num_spans - number of entries in spans
 *   rate      - sample rate, passed to qb_pll_init()
 *   dp        - display progress
 *   carriers  - the four candidate carriers, indexed by phase index
 *
 * Returns QB_E_OK on success; QB_E_BUG if spans is NULL (with num_spans > 0)
 * or a carrier has a NULL signal; or the error reported by qb_fir_init().
 */
qb_err_t qb_get_data_span_phases_by_pll (s64_t srclen,
                                         qb_span_t *spans,
                                         s32_t num_spans,
                                         s32_t rate,
                                         u8_t dp, // display progress
                                         qb_pll_carrier_t carriers[4]) {
  
  s32_t i;
  qb_err_t e;
  u8_t phase_ix;
  u8_t rate_type;
  float cur_speed;
  qb_fir_t firs[4]; // 0, 90, 180, 270
  // one PLL per phase (0, 90, 180, 270). these must live at function scope:
  // their state is deliberately carried over from one span to the next, and is
  // only reset when lock_pll is set. (declaring them inside the span loop would
  // give each iteration a fresh object with indeterminate contents.)
  qb_pll_t plls[4];
  
  if ((NULL == spans) && (num_spans > 0)) {
    fprintf(QB_ERR, "B: %s: NULL spans\n", QB_FUNC_M);
    return QB_E_BUG;
  }
  
  rate_type = qb_sample_rate_ix_from_rate (rate);
  
  // NOTE(review): these filters are initialised but never used by this
  // function; the calls are retained so that the existing error path is
  // unchanged. confirm whether they can be removed.
  for (phase_ix=0; phase_ix < 4; phase_ix++) {
    e = qb_fir_init (firs + phase_ix, QB_FIR_TYPE_BP_2K4, rate_type);
    if (QB_E_OK != e) { return e; }
  }

  // with no spans the loop below never runs, so the fallback value is never used
  cur_speed = (num_spans > 0) ? spans[0].speed : 1.0f;
  
  printf("    Measuring span phases (PLL method): ");
  fflush(stdout); // MacOS
  qb_show_meter(dp);

  for (i=0; i < num_spans; i++) {
  
    qb_span_t *span;
    s64_t j;
    u8_t lock_pll;
    double lock_sums[4]; // 0, 90, 180, 270
    u8_t w;
    double best_lock_sum;
    u8_t best_phase_ix;
    
    span = spans + i;
    lock_pll = 0;
    
    if (    (i==0)
         || (QB_SPAN_TYPE_LEADER == span->type) ) {
      // first span, or start of leader span: lock PLL
      lock_pll = 1;
      if ( (i < (num_spans-1)) && (QB_SPAN_TYPE_DATA == (span+1)->type) ) {
        // but use the tape speed from the following (data) span:
        cur_speed = (span+1)->speed;
      }
    }
    
    for (phase_ix=0; phase_ix < 4; phase_ix++) {
    
      qb_pll_carrier_t *carrier;
      
      if (lock_pll) {
        qb_pll_init  (plls + phase_ix,
                      rate,
                      TEST_LOOP_FILTER_CUTOFF_HZ,
                      TEST_LOCK_FILTER_CUTOFF_HZ,
                      TEST_LOOP_GAIN,
                      QB_FREQ_2 * cur_speed,
                      QB_LOCK_THRESH);
      }

      lock_sums[phase_ix] = 0.0;
      
      carrier = carriers + phase_ix;
      
      // sanity
      if (NULL == carrier->signal) {
        fprintf(QB_ERR, "\nB: %s: carrier->signal is NULL for phase_ix %u\n", QB_FUNC_M, phase_ix);
        return QB_E_BUG;
      }
      
      // no extra delay is applied here, since no data is being recovered yet;
      // we only want to know how well each phase locks
      for (j = span->start; j < (span->start + span->len); j++) {
        float v;
        v = 0.0f;
        if ((j >= 0) && (j < srclen)) { // stay inside the input buffer
          v = carrier->signal[j];
        }
        qb_pll_process (plls + phase_ix, v);
        lock_sums[phase_ix] += plls[phase_ix].pll_lock;
      } // next sample
      
    } // next phase
    
    // phase is only recorded for data spans
    if (QB_SPAN_TYPE_DATA == span->type) {
      // find best phase; phase 0 wins if no sum is greater than zero
      best_lock_sum = 0.0;
      best_phase_ix = 0;
      for (w=0; w < 4; w++) {
        if (lock_sums[w] > best_lock_sum) {
          best_lock_sum = lock_sums[w];
          best_phase_ix = w;
        }
      }
      span->detected_input_phase_ix = best_phase_ix;
    }
    
    qb_update_meter (dp, i, num_spans, 1.0f, 0);
    
  } // next span
  
  qb_hide_meter(dp, 0);
  printf("done.\n");
  
  return QB_E_OK;
  
}


// Look up the PLL phase value at which a cycle is triggered, for a given
// input phase index (0 => 0 deg, 1 => 90 deg, 2 => 180 deg, anything else => 270 deg).
// *wrap_out is set to 1 only for the 180 degree case, where the trigger is the
// PLL phase wrapping from 1 back to 0; NAN is returned as a sentinel then,
// since there is no threshold to compare against. Otherwise *wrap_out is 0.
static float get_pll_phase_trigger_value_for_phase_ix (u8_t phase_ix, u8_t *wrap_out) {

  float trigger;
  u8_t wraps;
  
  wraps = 0;
  
  switch (phase_ix) {
    case 0: // 0
      trigger = 0.5f; // 0.47?
      break;
    case 1: // 90
      trigger = 0.75f; // 0.73?
      break;
    case 2: // 180
      wraps = 1;
      trigger = NAN; // sentinel: trigger on phase wrap instead
      break;
    default: // 270
      trigger = 0.25f; // 0.23?
      break;
  }
  
  *wrap_out = wraps;
  
  return trigger;
  
}

/*
 * Run the PLL across every span in order, collecting cycles (atoms) for the
 * data spans.
 *
 * Normally a single PLL runs forwards through all spans, so that its state is
 * continuous from span to span; it is re-initialised at each leader span that
 * is followed by a data span, using that data span's tape speed.
 *
 * One special case is handled: a data span that directly follows silence and
 * is itself followed by leader. There, a separate PLL is locked onto the
 * following leader span played backwards, and the data span is then processed
 * backwards with that PLL.
 *
 * For every span, span->pll_lock_quality is set from the lock figure reported
 * by pll_span_get_cycs(), and span->transient_pll_lock_dropout is updated.
 * With verbose set, a warning is printed for each data span that has a
 * transient lock dropout and is not a squawk.
 *
 *   full_len    - bound on carrier sample indices
 *   sample_rate - sample rate, passed to qb_pll_init()
 *   spans       - span list; each span's detected_input_phase_ix selects the carrier
 *   num_spans   - number of entries in spans
 *   carriers    - the four candidate carriers, indexed by phase index
 *   inspect     - passed through to pll_span_get_cycs()
 *   dp          - display progress
 *   verbose     - enables the dropout warnings
 *
 * Returns QB_E_OK on success, QB_E_BUG on bad arguments, or the error
 * returned by pll_span_get_cycs().
 */
qb_err_t qb_derive_sync_pll (s64_t full_len,
                             s32_t sample_rate,
                             qb_span_t *spans,
                             s32_t num_spans,
                             qb_pll_carrier_t carriers[4],
                             qb_inspect_t *inspect,
                             u8_t dp, // display_progress
                             u8_t verbose) {

  qb_err_t e;
  s32_t sn;
  qb_pll_t pll;
  u8_t progress_display_has_begun;
  
  e = QB_E_OK;
  
  if ( (NULL == spans) || (num_spans <= 0) ) {
    fprintf(QB_ERR, "B: %s: no spans\n", QB_FUNC_M);
    return QB_E_BUG;
  }
  
  progress_display_has_begun = 0;
  
  // initial PLL setup: use speed from span 0
  qb_pll_init (&pll,
               sample_rate,
               TEST_LOOP_FILTER_CUTOFF_HZ,
               TEST_LOCK_FILTER_CUTOFF_HZ,
               TEST_LOOP_GAIN,
               QB_FREQ_2 * spans[0].speed,
               QB_LOCK_THRESH);

  for (sn=0; sn < num_spans; sn++) {
  
    s64_t z;
    qb_pll_t reverse_pll;
    float lock_accumulator;
    qb_span_t *span;
    u8_t prev_span_type, next_span_type;
    u8_t reversal;
    
    lock_accumulator = 0.0f;
  
    span = spans + sn;
    
    prev_span_type = (sn>0)                 ? (span-1)->type : QB_SPAN_TYPE_INVALID;
    next_span_type = (sn < (num_spans - 1)) ? (span+1)->type : QB_SPAN_TYPE_INVALID;
    
#ifdef QB_SANITY
    if ((span->detected_input_phase_ix < 0) || (span->detected_input_phase_ix > 3)) {
      fprintf(QB_ERR, "B: %s: span %d has illegal phase ix (%d)\n",
              QB_FUNC_M, sn, span->detected_input_phase_ix);
      return QB_E_BUG;
    }
#endif
    
    // this is intended for dealing with CFS bugfix squawks,
    // but could in theory apply to any situation where silence
    // immediately precedes a data span, without any leader
    // in between. the check is simply based on span types:
    // [N-1] silence
    // [N]   data    <- you are here
    // [N+1] leader
    // - this scenario entails using a reversed PLL to lock onto leader at span N+1,
    //   then transcribing span N backwards.
    // (next_span_type can only be LEADER if span N+1 exists, so (span+1)
    // is always valid in the reversed branch below.)
    reversal =    (QB_SPAN_TYPE_SILENT == prev_span_type)
               && (QB_SPAN_TYPE_DATA   == span->type)
               && (QB_SPAN_TYPE_LEADER == next_span_type);
    
    if ( ! reversal ) {
    
      // NOT REVERSED
      // normal behaviour
      
      // leader, then data?
      if ( (QB_SPAN_TYPE_LEADER == span->type) && (QB_SPAN_TYPE_DATA == next_span_type) ) {
      
        // reinitialise the PLL with the tapespeed of the subsequent data span
        qb_pll_init  (&pll,
                      sample_rate,
                      TEST_LOOP_FILTER_CUTOFF_HZ,
                      TEST_LOCK_FILTER_CUTOFF_HZ,
                      TEST_LOOP_GAIN,
                      QB_FREQ_2 * (span+1)->speed,
                      QB_LOCK_THRESH);
        
      } // endif (leader then data)
      
      // we only need cycles for data spans, but we must run this for leader spans too,
      // so that the PLL remains continuous across the partition
      e = pll_span_get_cycs (carriers + span->detected_input_phase_ix,
                             full_len,
                             span->len,
                             span->start,
                             0, // don't reverse input
                             sample_rate,
                             &pll,
                             span->detected_input_phase_ix,
                             // don't bother collecting cycles for non-data spans:
                             (span->type == QB_SPAN_TYPE_DATA) ? (&(span->atoms))     : NULL,
                             (span->type == QB_SPAN_TYPE_DATA) ? (&(span->num_atoms)) : NULL,
                             inspect,
                             &lock_accumulator,
                             dp);

    } else {
    
      // REVERSED
    
      // FIXME: move this stupidity into a function
      
      qb_pll_carrier_t *carrier;
      qb_span_t *leader;
      
      carrier = carriers + span->detected_input_phase_ix; // pick appropriate carrier for phase
      leader  = span + 1;
      
      // sanity: the signal is read directly in the lock-on loop below
      if (NULL == carrier->signal) {
        fprintf(QB_ERR, "\nB: %s: carrier->signal is NULL for phase_ix %u\n",
                QB_FUNC_M, (unsigned int) span->detected_input_phase_ix);
        e = QB_E_BUG;
        break;
      }
      
      // make a special reverse PLL
      qb_pll_init  (&reverse_pll,
                    sample_rate,
                    TEST_LOOP_FILTER_CUTOFF_HZ,
                    TEST_LOCK_FILTER_CUTOFF_HZ,
                    TEST_LOOP_GAIN,
                    QB_FREQ_2 * span->speed, // use data span's speed to set PLL
                    QB_LOCK_THRESH);

      // lock PLL onto the (leader) *next* span, played backwards.
      // this won't be written into the inspect files now;
      // instead these *leader* PLL values will be written into inspect files
      // on the *next* span (when they are played
      // traditionally, i.e. forwards) -- hence we use
      // qb_pll_process() directly, rather than calling
      // pll_span_get_cycs(), which would write
      // the inspection data
      for (z=0; z < leader->len; z++) {
        s64_t h;
        float v;
        // absolute sample number, walking from the leader's last sample to its
        // first. the bounds check has to be made on this absolute index (not on
        // the span-relative one), since it is what indexes the carrier.
        h = leader->start + ((leader->len - 1) - z);
        if ((h < 0) || (h >= full_len)) {
          v = 0.0f;
        } else {
          v = carrier->signal[h];
        }
        // use same phase as current (data) span
        qb_pll_process (&reverse_pll, v);
      }
      
      // lock (hopefully) established; now process the squawk span (backwards)
      // -- inspection data for this data span will be written now
      e = pll_span_get_cycs (carrier, // FIXME: carrier is deprecated, just pass signal instead
                             full_len,
                             span->len,
                             span->start,
                             1, // reverse input
                             sample_rate,
                             &reverse_pll,
                             span->detected_input_phase_ix,
                             &(span->atoms),
                             &(span->num_atoms),
                             inspect,
                             &lock_accumulator,
                             dp);
      
    } // endif (reversed)
    
    if (QB_E_OK != e) { break; }
    
    span->pll_lock_quality = lock_accumulator;
    
    // do this here to prevent the "inspect -> /x/y" lines from pll_span_get_cycs()
    // from screwing up the progress indicator
    if ( ! progress_display_has_begun ) {
      printf("    Extracting sync (PLL): ");
      fflush(stdout); // MacOS
      qb_show_meter(dp);
    }
    progress_display_has_begun = 1;
    
    qb_update_meter (dp, sn, num_spans, 1.0f, 0);
    fflush(stdout); // MacOS

    span_mark_transient_pll_lock_dropouts (span);
    
  } // next span
  
  if (QB_E_OK != e) { return e; }
  
  if (progress_display_has_begun) {
    qb_hide_meter (dp, 0);
    printf ("done.\n");
    fflush(stdout); // MacOS
  }
  
  for (sn=0; sn < num_spans; sn++) {
    if (    (QB_SPAN_TYPE_DATA == spans[sn].type)
         && verbose
         && spans[sn].transient_pll_lock_dropout
         && ! qb_is_squawk (spans+sn, (sn>0) ? (spans + (sn-1)) : NULL)) {
      fprintf(QB_ERR, "    W: [%lld] data span #%d suffers transient PLL lock dropout\n",
              qb_get_span_start_accurate_or_rough(spans + sn), sn);
    }
  }

  return e;
  
}



// based closely on the function below, pll_span_get_cycs()
/*
 * Search for the carrier delay (in samples) that gives the most confident bit
 * decisions on a data span.
 *
 * For each trial delay in [QB_PLL_DELAY_SEARCH_LOWER_BOUND,
 * QB_PLL_DELAY_SEARCH_UPPER_BOUND], a fresh copy of the supplied PLL is run
 * over the leader span and then the data span (or, if reversed is set, over
 * both backwards, which again meets the leader first). The delay is applied
 * only to the carrier fetch. Within the data span, each time the PLL phase
 * passes the trigger point for phase_ix, the "confidence"
 * |goertz[0][n] - goertz[1][n]| is sampled at the undelayed sample number n.
 *
 *   method == QB_PLL_DELAY_SEARCH_NONE           : *best_delay_out = 0
 *   method == QB_PLL_DELAY_SEARCH_BEST_WORST     : delay whose lowest confidence is highest
 *   method == QB_PLL_DELAY_SEARCH_CONFIDENCE_SUM : delay whose summed confidence is highest
 *   anything else                                : *best_delay_out = 0
 *
 *   carrier_signal - carrier fed to the PLL; indices outside [0, input_len) read as zero
 *   goertz         - two buffers indexed by absolute sample number
 *                    NOTE(review): no bounds check is made on this index; presumably
 *                    both buffers cover every sample of both spans -- confirm
 *   leader_span,
 *   data_span      - treated as adjacent: total length is the sum of both, starting
 *                    at leader_span->start (forwards) or data_span->start (reversed)
 *   pll            - passed by value, so the caller's PLL is not disturbed
 *
 * Returns QB_E_OK, or QB_E_BUG if carrier_signal is NULL (or, under QB_SANITY,
 * if sample_rate is nonsensical).
 */
qb_err_t qb_pll_span_shift_search (float *carrier_signal,
                                   float *goertz[2],
                                    s64_t input_len,
                                    qb_span_t *leader_span,
                                    qb_span_t *data_span,
                                    u8_t reversed,
                                    s32_t sample_rate,
                                    qb_pll_t pll, // use a copy, so we don't disturb the PLL
                                    u8_t phase_ix,
                                    s8_t *best_delay_out,
                                    u8_t method) {
                                    
  s8_t delay;
  qb_pll_t pll_copy;
  float best_worst_confidence;
  double best_confidence_sum;
  s8_t best_delay;
  s64_t total_len_smps;
  s64_t start;
  float phase_trigger;
  u8_t wrap;

  // this length holds, regardless of which way round the processing is done:
  total_len_smps = leader_span->len + data_span->len;
  
  *best_delay_out = 0;
  best_worst_confidence = 0.0f;
  best_confidence_sum = 0.0;
  
  if (QB_PLL_DELAY_SEARCH_NONE == method) {
    // just return 0
    return QB_E_OK;
  }

#ifdef QB_SANITY
  if (sample_rate < 100) {
    fprintf(QB_ERR, "B: %s: nonsensical sample rate %d.\n", QB_FUNC_M, sample_rate);
    return QB_E_BUG;
  }
#endif
  if (NULL == carrier_signal) {
    fprintf(QB_ERR, "B: %s: NULL carrier passed in.\n", QB_FUNC_M);
    return QB_E_BUG;
  }
  
  best_delay = 0;
  
  // these depend only on the arguments, so work them out once:
  // normal:   leader, data
  // reversed: data, leader
  start = (reversed ? data_span->start : leader_span->start);
  wrap = 0;
  phase_trigger = get_pll_phase_trigger_value_for_phase_ix (phase_ix, &wrap);

  for (delay = QB_PLL_DELAY_SEARCH_LOWER_BOUND;  // -3
       delay <= QB_PLL_DELAY_SEARCH_UPPER_BOUND; // +3
       delay++) {
  
    s64_t sn;
    double prev_pll_phase;
    float worst_confidence;
    double confidence_sum;
    s64_t num_measurements;
  
    confidence_sum = 0.0;
    prev_pll_phase = 0.0;
    worst_confidence = INFINITY;
    num_measurements = 0;
    
    // get a fresh copy of the PLL state
    pll_copy = pll;
    
// FIXME: use something better than this, related to sample rate
// (currently unused here: the loop below runs over the full leader + data length)
#define PLL_DELAY_SEARCH_SPAN_MAX_SMPS 100000000

    for (sn=0; sn < total_len_smps; sn++) {
    
      float v;
      double pll_phase;
      float confidence;
      u8_t measure_power_this_sample;
      
      // "zero-indexed sample number" (maybe reversed)
      s64_t zisn;
      // "zero-indexed sample number, delayed" (also maybe reversed)
      s64_t zisn_d;
      // "absolute sample number"
      s64_t asn;
      // "absolute sample number, delayed"
      s64_t asn_d;
      
      if ( ! reversed ) {
        zisn   = sn;
        zisn_d = zisn + (s64_t) delay;
      } else {
        zisn = total_len_smps - (1 + sn);
        zisn_d = zisn - (s64_t) delay; // delay now acts backwards
      }
      
      asn   = zisn   + start;
      asn_d = zisn_d + start;
      
      // delay is applied to the carrier signal fetch ...
      if ((asn_d < 0) || (asn_d >= input_len)) {
        // delay moves us outside buffer, use zero
        v = 0.0f;
      } else {
        v = carrier_signal[asn_d];
      }
      
      pll_phase = qb_pll_process (&pll_copy, v);
      
      // for the first portion of this loop, we're not
      // interested in measuring anything, since it's
      // just leader (regardless of whether we're working
      // forwards or backwards). we must still record the phase,
      // though: otherwise the first data sample would be compared
      // against a stale prev_pll_phase of 0.0, which can produce
      // a bogus measurement at the leader/data boundary.
      if (sn < leader_span->len) {
        prev_pll_phase = pll_phase;
        continue;
      }

      // on the very first sample of the whole run there is no meaningful prior
      // phase in prev_pll_phase yet (this can only happen here if the leader
      // span is empty), so never measure on that sample. particularly in the
      // reverse squawk case, a measurement there would amount to a fake cycle
      // at the *end* of the squawk data span.
      if (0==sn) {
        measure_power_this_sample = 0;
      } else if (wrap) {
        measure_power_this_sample = (pll_phase < prev_pll_phase);
      } else {
        measure_power_this_sample =    (prev_pll_phase < phase_trigger)
                                    && (pll_phase >= phase_trigger);
      }
      
      if (measure_power_this_sample) {
        confidence = fabsf (goertz[0][asn] - goertz[1][asn]); // asn, not asn_d -- delay only applies to the carrier fetch
        confidence_sum += confidence;
        if (confidence < worst_confidence) {
          worst_confidence = confidence;
        }
        num_measurements++;
      }
      
      prev_pll_phase = pll_phase;
      
    } // next sample
    
    if (QB_PLL_DELAY_SEARCH_BEST_WORST == method) {
      // now find the best of the worst, so to speak.
      // a trial that made no measurements at all still has worst_confidence
      // at INFINITY; that is an absence of evidence, not a perfect score,
      // so such trials are not allowed to win.
      if ((num_measurements > 0) && (worst_confidence > best_worst_confidence)) {
        best_worst_confidence = worst_confidence;
        best_delay = delay;
      }
    } else if (QB_PLL_DELAY_SEARCH_CONFIDENCE_SUM == method) {
      if (confidence_sum > best_confidence_sum) {
        best_confidence_sum = confidence_sum;
        best_delay = delay;
      }
    } else {
      best_delay = 0;
    }
    
  } // next trial delay value

  *best_delay_out = best_delay;
  
  return QB_E_OK;
  
}


// Flag the span (span->transient_pll_lock_dropout = 1) if any one of its atoms
// has a pll_lock value below QB_LOCK_THRESH_TRANSIENT. The flag is never
// cleared here; a span with no atoms is left untouched.
static void span_mark_transient_pll_lock_dropouts (qb_span_t *span) {
  s32_t atom_ix;
  for (atom_ix = 0; atom_ix < span->num_atoms; atom_ix++) {
    if (span->atoms[atom_ix].pll_lock < QB_LOCK_THRESH_TRANSIENT) {
      // one dropout is enough; no need to look at the remaining atoms
      span->transient_pll_lock_dropout = 1;
      return;
    }
  }
}




// Scale factor used only when producing the s16 "PLL mix" inspection output:
// the scalar path multiplies each mix sample by this before qb_float_to_s16(),
// and the vector path passes it to qb_vec_buf_unswizzle_to_s16(). The mix
// signal that goes on to the bandpass filter is not scaled by it.
#define QB_CARRIER_FILTERED_SCALER 0.5f// 0.2fs

#if defined QB_VECTORS_GCC_CLANG || defined QB_VECTORS_MSVC_AVX2

#include "fir_vec2.h"

/*
 * Vectorised rework of qb_generate_pll_carrier_scalar() (the non-vector build's
 * equivalent, found in the #else branch below).
 *
 * Builds the hard-clipped carrier that the PLL locks onto, for one phase:
 *   1. mix:    0.48 * squared[q + phase_shift_smps] + 0.48 * input[q]
 *              (zero where the shifted index falls outside the buffer)
 *   2. filter: qb_fir_vec_run() with the supplied filter
 *   3. clip:   every value becomes -1.0 if negative, +1.0 otherwise
 * and finally unswizzles the result into *clipped_carrier_out.
 *
 *   buf_v               - input signal
 *   squared             - squared (frequency-doubled) version of the input
 *   phase_shift_smps    - offset added to the index used to read from squared
 *                         NOTE(review): here it offsets the vector index q, not a
 *                         linear sample number -- confirm this is what is intended
 *   phase_ix            - phase index; selects the inspection files and the banner text
 *   clipped_carrier_out - receives the buffer produced by qb_vec_buf_unswizzle()
 *   bp_2k4_vec_p        - filter handed to qb_fir_vec_run()
 *   dp                  - display progress
 *   inspect             - inspection context, or NULL
 *
 * Returns QB_E_OK, QB_E_MALLOC if the working buffer can't be allocated, or
 * the error from a failed unswizzle / inspection write.
 */
qb_err_t qb_generate_pll_carrier_vector  (qb_vec_buf_t *buf_v,             // input
                                          qb_vec_buf_t *squared,
                                          s8_t phase_shift_smps,
                                          u8_t phase_ix,
//                                          float clipping_level,        // 0.0 means ultra clip, 1.0 means no clip
                                          float **clipped_carrier_out, // FIXME: if PLL is vectorised, this will need to be a vecbuf instead
                                          qb_fir_vec_t *bp_2k4_vec_p,
                                          u8_t dp, // display_progress
                                          qb_inspect_t *inspect) {   // or NULL

  s64_t q;
  s16_t *dbg_carrier;
  u8_t do_inspect;
  qb_vec_buf_t mixed;
  qb_vec_f_t zeros; //, ones, twos, halves;
#ifdef QB_VECTORS_GCC_CLANG
  u8_t v;
#endif
  qb_err_t e;
  s16_t *mixed_dbg;
#ifdef QB_VECTORS_MSVC_AVX2
  qb_vec_f_t small_half;
  qb_vec_f_t two, one;

  small_half = _mm256_set1_ps(0.48f);
  two        = _mm256_set1_ps(2.0f);
  one        = _mm256_set1_ps(1.0f);
#endif
  
  dbg_carrier = NULL;

  mixed_dbg = NULL;
  
  // inspect may legitimately be NULL, so every later use must go through this flag
  do_inspect = (NULL != inspect) && inspect->enabled;
  
  memset (&zeros, 0, sizeof(qb_vec_f_t)); // compiler is thick
  
#ifdef QB_VECTORS_GCC_CLANG
  for (v=0; v < QB_VECSIZE; v++) {
    zeros[v] = 0.0f;
  }
#else // MSVC_AVX2
  zeros = _mm256_setzero_ps();
#endif
 
  printf("    Phase %u (%c):\n",
         qb_get_phase_for_phase_ix(phase_ix),
         qb_get_char_for_phase_ix(phase_ix));
  fflush(stdout);

  // duplicate the input vector buffer's metadata, then give it a fresh buffer.
  // (done before the progress meter goes up, so that an allocation failure
  // doesn't leave a half-drawn meter behind.)
  mixed = *buf_v;
  mixed.v.f = qb_malloc (sizeof(qb_vec_f_t) * buf_v->alloc);
  if (NULL == mixed.v.f) {
    fprintf(QB_ERR, "E: Out of memory allocating PLL mix buffer.\n");
    return QB_E_MALLOC;
  }
  
  printf("      Mixing pre-carrier (shift %d samples): ", phase_shift_smps);
  fflush(stdout);
  qb_show_meter(dp);
  
  // *****************
  // 1. MIX
  // *****************
  
  for (q=0; q < buf_v->alloc; q++) {
  
    s64_t i;
    qb_vec_f_t mix;
    
    i = q + phase_shift_smps;
    
    // NOTE(review): (i>0) also treats index 0 as out of range; this matches
    // the scalar implementation, so it is left as it is
    if ((i>0) && (i < squared->alloc)) {
      
#ifdef QB_VECTORS_GCC_CLANG
      mix = (0.48f * squared->v.f[i]) + (0.48f * buf_v->v.f[q]);
#else // MSVC_AVX2
      mix = _mm256_fmadd_ps(small_half, buf_v->v.f[q], _mm256_mul_ps(small_half, squared->v.f[i]));
#endif

    } else {
      mix = zeros; // shifted outside buffer, just use 0
    }

    mixed.v.f[q] = mix;
    
    qb_update_meter (dp, q, buf_v->alloc, 1.0f, 0);
    
  }
  
  qb_hide_meter (dp, 0);
  printf("done.\n");

  if (do_inspect) {

    e = qb_vec_buf_unswizzle_to_s16 (&mixed, &mixed_dbg, QB_CARRIER_FILTERED_SCALER);
    if (QB_E_OK != e) {
      qb_vec_buf_finish (&mixed);
      return e;
    }
  
    e = qb_inspect_append_s16_buf (inspect->files + QB_INSPECT_FILE_IX_PLL_MIX + phase_ix,
                                   mixed_dbg,
                                   mixed.linear_len);
    qb_free(mixed_dbg);
    mixed_dbg = NULL;
    
    if (QB_E_OK != e) {
      fprintf(QB_ERR, "W: Error writing PLL mix (doubled + raw) inspection file.\n");
      qb_vec_buf_finish (&mixed);
      return e;
    }
    
  } // endif (do_inspect)
  
  
  // *********
  // 2. FILTER
  // *********

  printf("      Extracting carrier: ");
  fflush(stdout);
  qb_fir_vec_run (&mixed, bp_2k4_vec_p, dp);
  
  if (do_inspect) {
    e = qb_vec_buf_unswizzle_to_s16 (&mixed, &dbg_carrier, 1.0f);
    if (QB_E_OK != e) {
      qb_vec_buf_finish(&mixed);
      return e;
    }
    e = qb_inspect_append_s16_buf (inspect->files + QB_INSPECT_FILE_IX_PLL_CARRIER + phase_ix,
                                   dbg_carrier,
                                   mixed.linear_len);
    qb_free(dbg_carrier);
    dbg_carrier = NULL;
    if (QB_E_OK != e) {
      qb_vec_buf_finish(&mixed);
      return e;
    }
  }

  // *******
  // 3. CLIP
  // *******
  
  printf("      Clipping: ");
  fflush(stdout);
  qb_show_meter(dp);
#ifdef QB_VECTORS_GCC_CLANG
  for (q=0; q < mixed.alloc; q++) {
    qb_vec_i_t negative;
    qb_vec_f_t negative_float;
    // vector comparison gives -1 in each lane that is < 0, and 0 elsewhere
    negative = (mixed.v.f[q] < 0.0f);
    negative_float = __builtin_convertvector (negative, qb_vec_f_t);
    // (-1 + 0.5) * 2 = -1;  (0 + 0.5) * 2 = +1
    mixed.v.f[q] = (negative_float + 0.5f) * 2.0f;
    qb_update_meter (dp, q, mixed.alloc, 1.0f, 0.0f);
  }
#else // MSVC_AVX2
  for (q = 0; q < mixed.alloc; q++) {
    qb_vec_i_t negative;
    qb_vec_f_t negative_float;
    // hmm. _mm256_cmp_ps() returns 0xffffffff if mixed.v.f[q] < 0.
    // this is apparently NaN in floating-point ...
    negative_float = _mm256_cmp_ps(mixed.v.f[q], zeros, _CMP_LT_OS);
    // stupidity: cast float results (all 1s if <0, =NaN) to integer (-1):
    negative = *((qb_vec_i_t *) (&negative_float));
    // now convert it "properly" back to floating point, yielding -1.0f if <0
    negative_float = _mm256_cvtepi32_ps(negative);
    // there's a multiply-and-add instruction, but no add-and-multiply instruction
    // so instead of doing (a + 0.5) * 2, we'll do (a * 2) + 1
    mixed.v.f[q] = _mm256_fmadd_ps(negative_float, two, one);
    qb_update_meter(dp, q, mixed.alloc, 1.0f, 0);
  }
#endif
  qb_hide_meter (dp, 0);
  printf("done.\n");
  
  e = qb_vec_buf_unswizzle (&mixed, clipped_carrier_out, /*NULL,*/ dp, "      ", "");
  if (QB_E_OK != e) {
    qb_vec_buf_finish(&mixed);
    return e;
  }
  
  qb_vec_buf_finish (&mixed);
  
  return QB_E_OK;
  
}
#else // no vectors
/*
 * Build the hard-clipped carrier that the PLL locks onto, for one phase
 * (scalar implementation, used when no vector support is compiled in).
 *
 *   buf_f                   - input signal, full_len samples
 *   buf_squared_f           - pre-squared input: ((v * v) * 2.0f) - 1.0f
 *   full_len                - length of the input and output buffers, in samples
 *   filter_delay            - delay of the bandpass filter, in samples; the filter is
 *                             fed from this far ahead so that its output lines up
 *                             with the input again
 *   phase_shift_smps        - offset added to the index used to read buf_squared_f
 *   phase_ix                - phase index; selects the inspection files and the banner text
 *   buf_carrier_clipped_out - output, full_len samples, caller-allocated. also used
 *                             as scratch space for the pre-filter mix
 *   bp_2k4_p                - bandpass filter
 *   dp                      - display progress
 *   inspect                 - inspection context, or NULL
 *
 * On return, the first (full_len - filter_delay) output samples are -1.0 or
 * +1.0; the remaining tail, for which no filter output exists, is zero.
 *
 * Returns QB_E_OK, QB_E_MALLOC if an inspection buffer can't be allocated, or
 * the error from a failed inspection write.
 */
qb_err_t qb_generate_pll_carrier_scalar  (float *buf_f,             // input
                                          float *buf_squared_f,     // pre-squared: ((v * v) * 2.0f) - 1.0f
                                          s64_t full_len,           // input length
                                          s64_t filter_delay,       // FIXME: redundant; just get it from bp_2k4_p->delay
                                          s8_t phase_shift_smps,
                                          u8_t phase_ix,
                                          //float clipping_level,      // 0.0 means ultra clip, 1.0 means no clip
                                          float *buf_carrier_clipped_out,
                                          qb_fir_t *bp_2k4_p,
                                          u8_t dp, //display_progress,
                                          qb_inspect_t *inspect) {   // or NULL

  s64_t n, q;
  qb_err_t e_dbg;
  s16_t *dbg_carrier;
  s16_t *dbg_mix;
  u8_t do_inspect;
  
  dbg_carrier = NULL;
  dbg_mix     = NULL;
  
  do_inspect = (NULL != inspect) && inspect->enabled;
  
/*
 * carrier extraction (attempt 2)
 * I seem to have ended up settling on just this:
 *
 *
 * f(n) -> f.doubler -> ph.shift -> [+] ---> bpass2400 -> clip ----> carrier_clipped
 *  |               'dbl'       'ps' ^  'mix'    |   'flt'
 *  |                                |           |
 *  +--------------->----------------+           +-------->--------> carrier (unused)
 *                'raw'
 *
 *  pll_mutant_doubled.wav is 'dbl'
 *  pll_mutant_carrier.wav is 'flt'
 *
 */
 
  printf("    Phase %u (%c):\n",
         qb_get_phase_for_phase_ix(phase_ix),
         qb_get_char_for_phase_ix(phase_ix));
  fflush(stdout);
  
  // zero-out mutants
  for (n=0; n < full_len; n++) {
    buf_carrier_clipped_out[n] = 0.0f;
  }
  
  if (do_inspect) {
    dbg_mix = qb_malloc(sizeof(s16_t) * full_len);
    if (NULL == dbg_mix) {
      fprintf(QB_ERR, "E: qb_malloc(%lld) for dbg_mix failed\n", full_len);
      return QB_E_MALLOC;
    }
  }
  
  printf("      Mixing pre-carrier (shift %d samples): ", phase_shift_smps);
  fflush(stdout);
  qb_show_meter(dp);
  
  // *****************
  // 1. SQUARE AND MIX
  // *****************
  
  // buf_carrier_clipped_out is temporarily populated with a piece of signal from the future.
  // once this has been through the filter, it will have been
  // delayed back to the present (and we also have the "time travel" thing)
  for (q=0; q < full_len; q++) {
  
    float raw, dbl, mix;
    s64_t i;
    
    // need to somehow convert 1200 Hz tones into 2400 Hz ones,
    // because we need an unbroken, consistent-phase 2400 Hz tone for
    // the PLL to lock onto.
    
    // various ways of doing this were tried: rectification (both
    // half-wave and full-wave), soft clipping, hard clipping. the
    // idea was to try to rough up the 1200 Hz tones and make them
    // angry enough to throw off a significant harmonic at 2400 Hz
    // for the PLL to lock onto.
    
    // then I had a better idea:
    
    // cos(2x) = 2.cos^2(x) - 1
    
    // so, square the signal, double it, subtract 1.0. this should
    // replace the 1200 Hz sections entirely with 2400 Hz ones.
    
    // of course, now the non-synthetic 2400 Hz cycles have also been
    // frequency-multiplied and therefore shifted up to 4800 Hz, so
    // in order to obtain a consistent (and hopefully well-formed
    // in phase) 2400 Hz signal throughout we will need to mix the
    // frequency-doubled signal with the original one.
    
    raw = buf_f[q]; // no phase shift
    
    // check the phase shift doesn't move us outside the buffer
    // note that we can now take input from slightly beyond the
    // end of the span, so the check is against full_len rather than span_len
    // NOTE(review): (i>0) also treats index 0 as out of range; this matches
    // the vector implementation, so it is left as it is
    i = q + phase_shift_smps;
    
    if ((i>0) && (i < full_len)) {
      
      dbl = buf_squared_f[i]; // get phase-shifted, squared (freq-doubled) value
      
      mix = (0.48f * dbl) + (0.48f * raw);

    } else {
      mix = 0.0f; // shifted outside buffer, just use 0
    }
    
    if (do_inspect) {
      dbg_mix[q] = qb_float_to_s16(mix * QB_CARRIER_FILTERED_SCALER);
    }
    
    // we borrow buf_carrier_clipped_out, since it's
    // already usefully allocated, but
    // this will be updated later
    buf_carrier_clipped_out[q] = mix;
    
    qb_update_meter (dp, q, full_len, 1.0f, 0);
    
  }
  
  qb_hide_meter (dp, 0);
  printf("done.\n");

  if (do_inspect) {
  
    e_dbg = qb_inspect_append_s16_buf (inspect->files + QB_INSPECT_FILE_IX_PLL_MIX + phase_ix,
                                       dbg_mix,
                                       full_len);
    // the buffer is finished with whether or not the write worked,
    // so release it before looking at the result
    qb_free(dbg_mix);
    dbg_mix = NULL;
    if (QB_E_OK != e_dbg) {
      fprintf(QB_ERR, "E: Error writing PLL mix (doubled + raw) inspection file.\n");
      return e_dbg;
    }
  
    dbg_carrier = qb_malloc (sizeof(s16_t) * full_len);
    if (NULL == dbg_carrier) {
      fprintf(QB_ERR, "E: Out of memory allocating debug PLL carrier buffer.\n");
      return QB_E_MALLOC;
    }
  } // endif (do_inspect)
  
  printf("      Extracting carrier: ");
  fflush(stdout);
  qb_show_meter (dp);
  
  // *********
  // 2. FILTER
  // *********
  
  // again, reading beyond the end of the span is possible
  // thanks to the delay.
  // n (read position) runs filter_delay samples ahead of q (write position),
  // so each write lands on a sample that has already been consumed
  for (q=0, n = filter_delay;
       (n < full_len);
       q++, n++) {
  
    float flt, clp;
    
    // apply filter to mutated data to get rid of rank VLF component
    // and isolate the lock frequency
    
    // (filters are still double precision)
    qb_fir_in (bp_2k4_p, buf_carrier_clipped_out[n]); // n in, q out
    flt = qb_fir_out (bp_2k4_p);

    if (do_inspect) {
      dbg_carrier[q] = qb_float_to_s16(flt);
    }
    
    // apply the most aggressive clipping it's possible to do.
    // we do this to get insane levels of signal compression.
    // this will of course create all sorts of horrid harmonics,
    // but they don't trouble the lock frequency range, so
    // this is worth doing
    
    if (flt < 0.0f) {
      clp = -1.0f;
    } else {
      clp = 1.0f;
    }
    
    buf_carrier_clipped_out[q] = clp; // now the proper value is written
    
    if (dp) {
      qb_update_meter (dp, q, full_len - filter_delay, 1.0f, 0);
    }
    
  }
  
  // the final filter_delay samples never receive a filter output, and would
  // otherwise still hold the temporary pre-filter mix values written in
  // step 1. they are not carrier, so silence them.
  for ( ; q < full_len; q++) {
    buf_carrier_clipped_out[q] = 0.0f;
  }
  
  qb_hide_meter (dp, 0);
  printf("done.\n");
  
  if (do_inspect) {
    e_dbg = qb_inspect_append_s16_buf (inspect->files + QB_INSPECT_FILE_IX_PLL_CARRIER + phase_ix,
                                       dbg_carrier,
                                       full_len - filter_delay);
    qb_free(dbg_carrier);
    dbg_carrier = NULL;
    if (QB_E_OK != e_dbg) {
      fprintf(QB_ERR, "E: Error writing PLL carrier inspection file.\n");
      return e_dbg;
    }
  }
  
  return QB_E_OK;
  
}
#endif // no vectors




/*
 * Run the PLL across one span of the extracted carrier and record a "cycle"
 * atom every time the PLL phase crosses the trigger value for phase_ix.
 *
 *   carrier          carrier->signal[] is the PLL input; it is indexed by
 *                    absolute sample number (span_start_smps + offset).
 *   input_len        length of carrier->signal[]; reads outside [0, input_len)
 *                    are replaced with 0.
 *   span_len         number of samples to process.
 *   span_start_smps  absolute sample number of the first sample of the span.
 *   reversed         if nonzero the span is fed to the PLL from its last
 *                    sample to its first; the emitted cycles are then put
 *                    back into ascending sample order before being returned.
 *   sample_rate      only used by the QB_SANITY check.
 *   pll              PLL state; advanced by one qb_pll_process() call per sample.
 *   phase_ix         selects the trigger phase (see
 *                    get_pll_phase_trigger_value_for_phase_ix()).
 *   cycles_out       may be NULL. On success receives a qb_malloc()ed array
 *                    of cycles (presumably released by the caller with
 *                    qb_free() -- confirm against callers). Never receives a
 *                    zero-byte allocation. Left NULL on failure, except that
 *                    a failure while writing inspection files still leaves
 *                    the outputs populated.
 *   num_cycles_out   may be NULL, independently of cycles_out. Receives the
 *                    number of cycles in *cycles_out; stays 0 if cycles_out
 *                    is NULL or on failure.
 *   inspect          must not be NULL; if inspect->enabled, four debug
 *                    streams (PLL output, integral, lock, quadrature ref)
 *                    are appended to the inspection files.
 *   lock_average_per_sample_out
 *                    must not be NULL; receives the mean of pll->pll_lock
 *                    over the span.
 *   dp               progress-meter handle/flag, only touched under QB_SANITY.
 *
 * Returns QB_E_OK, QB_E_BUG (bad arguments), QB_E_MALLOC, or whatever error
 * qb_append_atom() / qb_inspect_append_s16_buf() reported.
 */
static qb_err_t pll_span_get_cycs (qb_pll_carrier_t *carrier,
                                   s64_t input_len,
                                   s64_t span_len,    // span length
                                   s64_t span_start_smps,
                                   u8_t reversed,
                                   s32_t sample_rate,
                                   qb_pll_t *pll,
                                   u8_t phase_ix,
                                   qb_atom_t **cycles_out, // or NULL
                                   s32_t *num_cycles_out,   // or NULL
                                   qb_inspect_t *inspect,
                                   float *lock_average_per_sample_out,
                                   u8_t dp) {

  s32_t cycs_alloc;
  s32_t num_cycs;      // local fill counter, so num_cycles_out really may be NULL
  qb_atom_t *cycs_tmp; // cycles staging buffer, so we can reverse them
  qb_err_t e;
  s64_t n;
  
  // inspect buffers
  s16_t *dbg_pll_output, *dbg_integral, *dbg_lock;
  s16_t *dbg_quad_ref;
  
  // results of the four inspection-file writes
  qb_err_t ei[4] = {QB_E_OK, QB_E_OK, QB_E_OK, QB_E_OK};
  
  u8_t k;
  
  float prev_pll_phase;
  
  prev_pll_phase = 0.0f;
    
#ifdef QB_SANITY
  if (sample_rate < 100.0f) {
    fprintf(QB_ERR, "\nB: %s: nonsensical sample rate %d.\n", QB_FUNC_M, sample_rate);
    return QB_E_BUG;
  }
  
  // if processing a squawk backwards (from right to left),
  // confirm that it starts out locked properly (it should already
  // have a good lock on the subsequent leader span played backwards)
  if (reversed && (pll->pll_lock < 0.1)) {
    qb_hide_meter(dp, 1);
    fprintf(QB_ERR, "W: [%lld]: reversed PLL starts with rancid lock: %lf\n",
            span_start_smps + span_len, pll->pll_lock);
    // print this again:
    printf("    Extracting sync (PLL): ");
    fflush(stdout); // MacOS
    qb_show_meter(dp);
  }
#endif

  if (NULL == carrier->signal) {
    fprintf(QB_ERR, "\nB: %s: NULL carrier passed in.\n", QB_FUNC_M);
    return QB_E_BUG;
  }
  
  // outputs start out empty; they are only filled in on success
  cycs_alloc = 0;
  num_cycs   = 0;
  cycs_tmp   = NULL;
  if (NULL != num_cycles_out) {
    *num_cycles_out = 0;
  }
  if (NULL != cycles_out) {
    *cycles_out = NULL;
  }
  
  dbg_pll_output  = NULL;
  dbg_integral    = NULL;
  dbg_lock        = NULL;
  dbg_quad_ref    = NULL;
  
  e = QB_E_OK;
  
  do { // try {
  
    qb_inspect_file_t *fi;
    s32_t an;
    double lock_accumulator_span;
    double lock_accumulator_cycle;
  
    if (inspect->enabled) {
    
      dbg_pll_output  = qb_malloc (sizeof(s16_t) * span_len);
      dbg_integral    = qb_malloc (sizeof(s16_t) * span_len);
      dbg_lock        = qb_malloc (sizeof(s16_t) * span_len);
      dbg_quad_ref    = qb_malloc (sizeof(s16_t) * span_len);
      
      if (    (NULL == dbg_pll_output)
           || (NULL == dbg_integral)
           || (NULL == dbg_lock)
           || (NULL == dbg_quad_ref) ) {
        fprintf(QB_ERR, "\nE: Out of memory writing debug files for PLL.\n");
        e = QB_E_MALLOC;
        break;
      }
      
      memset (dbg_pll_output,  0, sizeof(s16_t) * span_len);
      memset (dbg_integral,    0, sizeof(s16_t) * span_len);
      memset (dbg_lock,        0, sizeof(s16_t) * span_len);
      memset (dbg_quad_ref,    0, sizeof(s16_t) * span_len);
      
    }
    
    lock_accumulator_span  = 0.0;
    lock_accumulator_cycle = 0.0;
  
    for (n=0; n < span_len; n++) {
    
      qb_atom_t c;
      u8_t measure_power_this_sample;
      float pll_phase;
      s64_t m,h;
      double v;
      double phase_trigger_value;
      u8_t wrap;
      
      // h is the offset into the span. this is where the actual
      // reversal is done: h runs low to high normally, and high
      // to low if 'reversed'
      if ( ! reversed ) {
        h = n;
      } else {
        h = span_len - (1 + n);
      }
      
      // absolute sample number in the carrier buffer
      m = h + span_start_smps;
      
      if ((m < 0) || (m >= input_len)) {
        // outside the carrier buffer, use zero
        v = 0.0;
      } else {
        v = carrier->signal[m];
      }
      
      pll_phase = qb_pll_process (pll, v);
      
      wrap = 0;
      phase_trigger_value = get_pll_phase_trigger_value_for_phase_ix (phase_ix, &wrap);
      
      if (n==0) {
        // on the very first sample, we don't have any meaningful prior phase
        // record in prev_pll_phase yet, so it would always generate
        // a cycle. particularly in the reverse squawk case, we really
        // do not want this, as it would generate a fake cycle at the *end* of the
        // squawk data span.
        measure_power_this_sample = 0;
      } else if (wrap) {
        // wraparound case (phase is 180)
        // want to sample when phase is 0, i.e. at the discontinuity
        measure_power_this_sample = (pll_phase < prev_pll_phase);
      } else {
        // ordinary case: phase has just risen through the trigger value
        measure_power_this_sample =    (prev_pll_phase < phase_trigger_value)
                                    && (pll_phase >= phase_trigger_value);
      }
      
      if ( measure_power_this_sample ) {

        qb_init_atom(&c);

        // this will be the time at which we do Goertzel measurements later
        c.sample_num         = span_start_smps + h;
        
        // write measured PLL lock average to *previous* cycle
        // (this will be used later, when we decide which spans
        // we need to run PLL cycle insertion on)
        // -- actually, is this still true? don't think we do this any more, deprecated?
        if (num_cycs > 0) {
          s64_t gap;
          // number of samples accumulated since the previous cycle.
          // sample numbers descend when reversed, so take the magnitude;
          // otherwise the average comes out with its sign flipped.
          gap = c.sample_num - cycs_tmp[num_cycs - 1].sample_num;
          if (gap < 0) { gap = -gap; }
          cycs_tmp[num_cycs - 1].pll_lock = (float) (lock_accumulator_cycle / (double) gap);
        }
        
        lock_accumulator_cycle = 0.0;
        
        if (NULL != cycles_out) {
          e = qb_append_atom (&c,
                              &cycs_tmp,
                              &cycs_alloc,
                              &num_cycs);
          if (QB_E_OK != e) { break; }
        }
        
      }
      
      prev_pll_phase = pll_phase;

      if (inspect->enabled) {
      
        // remember, if reversed, h goes from high to low;
        // we're just writing the information into the
        // inspect buffers in reverse order
        
        if ((h>=0) && (h<span_len)) { // always true for 0 <= n < span_len; kept as a guard
        
          double integral_dbg;
          double lock_dbg;
        
          // pre-clamp, to avoid possible "clamping" error spam
          // (PLL integral needs quite a bit of amplification)
          integral_dbg = pll->pll_integral * 8.0;
          if (integral_dbg < -1.0) {
            dbg_integral[h] = -32767;
          } else if (integral_dbg > 1.0) {
            dbg_integral[h] = 32767;
          } else {
            dbg_integral[h] = qb_double_to_s16 (integral_dbg);
          }
          
          dbg_pll_output[h] = qb_double_to_s16 (pll->ref_sig);

          // clamp to prevent abs(lock) > 1.0 reaching the s16 conversion.
          // this is done on a local copy: writing the clamped value back
          // into the PLL would make results depend on whether inspection
          // is switched on.
          lock_dbg = pll->pll_lock;
          if (lock_dbg > 1.0) {
            lock_dbg = 1.0;
          } else if (lock_dbg < -1.0) {
            lock_dbg = -1.0;
          }
          
          dbg_lock[h]     = qb_double_to_s16 (lock_dbg);
          dbg_quad_ref[h] = qb_double_to_s16 (pll->debug_quad_ref * 0.3);
          
        }
        
      }
      
      lock_accumulator_span  += pll->pll_lock;
      lock_accumulator_cycle += pll->pll_lock;

    } // next sample
    
    // append failed; staging buffer is released in the cleanup section
    if (QB_E_OK != e) { break; }
    
    lock_accumulator_span /= (float) span_len;
    
    *lock_average_per_sample_out = (float) lock_accumulator_span;
    
    // copy tmp cycles buffer to destination
    // (reverse if necessary)
    if (NULL != cycles_out) {
    
      qb_atom_t *cycs_final;
  
      // never ask for zero bytes: a zero-size allocation may legitimately
      // come back NULL, which would be misreported as out-of-memory
      // for a span that simply contains no cycles
      cycs_final = qb_malloc (sizeof(qb_atom_t) * ((num_cycs > 0) ? num_cycs : 1));
      if (NULL == cycs_final) {
        fprintf(QB_ERR, "\nE: Out of memory allocating cycles buffer.\n");
        e = QB_E_MALLOC;
        break;
      }
    
      for (an=0; an < num_cycs; an++) {
        s32_t an_real;
        if ( reversed ) {
          an_real = num_cycs - (1 + an);
        } else {
          an_real = an;
        }
        cycs_final[an_real] = cycs_tmp[an];
      }
      
      *cycles_out = cycs_final;
      
    }
    
    // only now publish the count, so that it can never be
    // nonzero while *cycles_out is NULL
    if (NULL != num_cycles_out) {
      *num_cycles_out = num_cycs;
    }

    if (inspect->enabled) {
      
      fi = inspect->files;
      
      // attempt all four writes; the first failure (if any) is reported below
      ei[0] = qb_inspect_append_s16_buf (fi + QB_INSPECT_FILE_IX_PLL_OUTPUT,   dbg_pll_output, span_len);
      ei[1] = qb_inspect_append_s16_buf (fi + QB_INSPECT_FILE_IX_PLL_INTEGRAL, dbg_integral,   span_len);
      ei[2] = qb_inspect_append_s16_buf (fi + QB_INSPECT_FILE_IX_PLL_LOCK,     dbg_lock,       span_len);
      ei[3] = qb_inspect_append_s16_buf (fi + QB_INSPECT_FILE_IX_PLL_QUAD_REF, dbg_quad_ref,   span_len);

    }
    
  } while (0); // finally {

  if (NULL != cycs_tmp)        { qb_free(cycs_tmp);       }
  if (NULL != dbg_pll_output)  { qb_free(dbg_pll_output); }
  if (NULL != dbg_integral)    { qb_free(dbg_integral);   }
  if (NULL != dbg_lock)        { qb_free(dbg_lock);       }
  if (NULL != dbg_quad_ref)    { qb_free(dbg_quad_ref);   }
  
  if (QB_E_OK != e) { return e; }
  
  for (k=0; k < 4; k++) {
    if (QB_E_OK != ei[k]) { return ei[k]; }
  }
  
  return e;
  
}
