use crate::proc::{Procer, UpDequer, DequerUtils, Update};
use crate::notes::{Note, NoteValue, ProcerNote};
use core::ops::Range;
//use std::collections::VecDeque;
use std::sync::Arc;

use realfft::{RealFftPlanner, FftNum, RealToComplex};
use rustfft::num_complex::Complex;
//use rustfft::num_traits::Zero;
use rustfft::num_traits::One;
use rustfft::num_traits::float::Float;

use derive_more::Display;
use std::fmt::Debug;

trait DebugableRTC<T>: RealToComplex<T> + Debug {}

// The Arc here really increases compile times, but that might just be because it has to link
// in realfft/rustfft where before it didn't get linked in.
#[derive(Display)]
#[display(fmt="RFftProcer(size={}, outsize={}, freq={}, deque=(size={}, cap={}), in_buf_size={}, scratch=(size={}, cap={}), out_data=(size={}, cap={}))",
    size, outsize, frequency,
    "in_deque.len()", "in_deque.capacity()",
    "in_buf.len()",
    "scratch.len()", "scratch.capacity()",
    "out_data.len()", "out_data.capacity()")]
pub struct RFftProcer<I: FftNum + Float + Default, const BS: usize, const US: usize> {
    pub size: usize,
    pub outsize: usize,
    pub frequency: usize,
    pub in_deque: UpDequer<I, US>,
    pub in_buf: [I; BS],
    pub rfft: Arc<dyn RealToComplex<I>>,
    // TODO: could make this a fixed array instead of a vec too maybe
    pub scratch: Vec<Complex<I>>,
    pub out_data: Vec<Complex<I>>,
    pub scale: I,
    //pub in_deque: VecDeque<Update<I, US>>,
}

impl<I: FftNum + Float + Default, const BS: usize, const US: usize> RFftProcer<I, BS, US> {
    pub fn new(frequency: usize) -> Self {
        let retsize = BS / 2 + 1;
        // TODO: could do some cleverness with using double the capacity, then you'd only need to
        // pop elements off after you're double-full, but I think just having a buffer of 1 or 2
        // extra elements is easier here, popping before each add in process_data if we're full
        let dq_capacity = (BS / US) + 2;
        assert!((dq_capacity * US) > BS);

        let mut planner = RealFftPlanner::<I>::new();
        let rfft = planner.plan_fft_forward(BS);
        let scratch = rfft.make_scratch_vec();
        let out_data = rfft.make_output_vec();

        return Self {
            size: BS,
            outsize: retsize,
            frequency: frequency,
            in_deque: UpDequer::with_capacity(dq_capacity),
            in_buf: [I::default(); BS],
            rfft: rfft,
            //out_data: Vec::with_capacity(retsize),
            scratch: scratch,
            out_data: out_data,
            // The docs say to scale by 1/sqrt(len), but it seems like 1/len (where len is the real
            // input size) is the correct scaling to get sqrt(norm_sqr) results between 0 and 1.
            // I'm still not entirely sure about this though; l1_norm also seems to max out at len
            // without scaling, so 1/len I think is right for it too, but it also gives lower
            // results when there's an imaginary part.
            //scale: /*I::from_usize(10).unwrap()*/I::one() / I::from_usize(retsize/*-1*/).unwrap().sqrt(),
            scale: I::one() / I::from_usize(BS).unwrap(),
        }
    }
}
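// The scaling comment in new() is easy to sanity-check in isolation: for a real-input
// FFT of length N, a unit-amplitude sinusoid sitting exactly on bin k comes out with
// |X[k]| == N/2, so scaling by 1/N puts that bin at 0.5 and keeps a full-scale sine
// below 1. The test below is a minimal sketch of that property using realfft directly;
// the length 1024, bin index 8 and tolerance are arbitrary values picked for the test,
// not anything the rest of this crate depends on.
#[cfg(test)]
mod scale_sanity {
    use realfft::RealFftPlanner;

    #[test]
    fn unit_sine_on_a_bin_scales_to_half() {
        const N: usize = 1024;
        const BIN: usize = 8;
        let mut planner = RealFftPlanner::<f64>::new();
        let r2c = planner.plan_fft_forward(N);
        let mut input = r2c.make_input_vec();
        let mut output = r2c.make_output_vec();
        for (n, x) in input.iter_mut().enumerate() {
            *x = (2.0 * std::f64::consts::PI * (BIN as f64) * (n as f64) / (N as f64)).sin();
        }
        r2c.process(&mut input, &mut output).unwrap();
        // with the 1/len scaling used by RFftProcer, the bin the sine lands on reads 0.5
        let scaled = output[BIN].norm() / (N as f64);
        assert!((scaled - 0.5).abs() < 1e-6);
    }
}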
impl<I: FftNum + Float + Default, const BS: usize, const US: usize> Procer<I, US> for RFftProcer<I, BS, US> {
    fn get_size(&self) -> usize {
        self.size
    }

    fn get_frequency(&self) -> usize {
        self.frequency
    }

    fn make_pnotes<'nl>(&self, notes: &'nl [Note]) -> Vec<ProcerNote<'nl, I>> {
        // TODO: +1 or nah? (might have to handle it differently for even or odd, hopefully not
        // though)
        //let retsize = self.size/2+1;
        let retsize = self.outsize;
        let noteslen = notes.len();
        let mut ret: Vec<ProcerNote<'nl, I>> = Vec::with_capacity(noteslen);

        let mut cur_note_idx = 0;
        let mut cutoff = notes[0].high_freq;
        let scale: f64 = (self.frequency as f64) / (self.size as f64);
        // TODO: since we do have .low_freq here as well, we could start the low side of the range
        // close to .low_freq (how close?), but for now we'll start it at 1
        let mut cur_range: Range<usize> = 1..1;
        for i in 1..retsize {
            let freq: f64 = (i as f64) * scale;
            if freq > cutoff {
                while freq > cutoff {
                    let pnote = ProcerNote::<'nl, I>(&notes[cur_note_idx], cur_range.clone(), I::default(), I::default());
                    ret.push(pnote);
                    cur_range.start = cur_range.end;
                    // worth noting this is different from the python code:
                    // here frequencies above the highest note aren't added in,
                    // whereas the python code adds them all to the highest
                    // note; you change that by changing the return below
                    // here to a `cutoff = i`, I think
                    cur_note_idx += 1;
                    if cur_note_idx >= noteslen {
                        return ret;
                    }
                    cutoff = notes[cur_note_idx].high_freq;
                }
            }
            //else {
            cur_range.end += 1;
            //}
        }
        // add the final pnote if we ran out of frequencies before running
        // out of notes
        ret.push(ProcerNote::<'nl, I>(&notes[cur_note_idx], cur_range, I::default(), I::default()));

        return ret;
    }
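    // For a rough feel of the bin -> frequency mapping make_pnotes uses above
    // (freq = bin_index * sample_rate / fft_size): with, say, a 44100 Hz sample rate
    // and BS = 4096, each bin covers 44100 / 4096 ≈ 10.77 Hz, so bin 41 sits near
    // 441 Hz and would fall into whichever Note's low_freq..high_freq range contains
    // A4. Those numbers are only an illustration; the real values depend on the
    // frequency and BS the caller picks.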
    fn process_data(&mut self, input: &Update<I, US>, notes: &mut [ProcerNote<'_, I>]) -> bool {
        let dq_len = self.in_deque.len();
        let dq_cap = self.in_deque.capacity();
        // we should probably make sure buf_size can be made with the length of dq as well though,
        // so capacity can grow if new() messed up with the capacity
        if dq_len == dq_cap {
            self.in_deque.pop_front();
        }
        let scale = self.scale;

        // XXX: mm, this copies twice per update (once to add it to the deque and once to the buffer),
        // which I kinda wanted to avoid (using an Arc would avoid it, but it'd be extra heap
        // allocations); at least it doesn't realloc though, so hopefully that's ok.
        // Depending on how this works with Alsa I might want to switch to passing in a slice and
        // storing references in the UpDeque, but that makes lifetime soup, or something else maybe.
        // OHH, though, since I'm probably getting i16 from alsa and want to convert to f32 anyway,
        // it makes sense to copy (question is: will that add a third copy or will the compiler be
        // smart enough to convert in place? I think ideally the conversion would happen here in
        // this push_back actually, maybe the input Update should be AI and this push_back should
        // be `as I` or Into or whatever).
        self.in_deque.push_back(input.clone());

        let updated = self.in_deque.update_buffer_array(&mut self.in_buf);
        //println!("updated = {}", updated);
        if updated {
            //assert_eq!(self.in_buf, [I::default(); BS]);
            let post_scale = I::from_u8(2).unwrap().sqrt();
            self.rfft.process_with_scratch(&mut self.in_buf, &mut self.out_data, &mut self.scratch).unwrap();
            for pnote in notes {
                // TODO: figure out the best calculation for this
                // TODO: map/sum or product might be quicker than fold here, since fold can't
                // really use simd; also look at what the perftest code did to use simd
                // XXX: python's abs uses the equivalent of sqrt(norm_sqr)
                // (https://docs.rs/num-complex/0.4.3/num_complex/struct.Complex.html#method.norm_sqr)
                // ie. sqrt(|real|**2 + |imag|**2); gonna just use sqrt(norm_sqr) here for now.
                // l1_norm might also be a good option (just |real| + |imag|), and gets rid of the
                // pow too, but I dunno what's best here.
                // TODO: the python test2.py code actually currently uses max instead of folding here too
                //let complexes = self.out_data[pnote.1.clone()].iter().fold((I::default(), Complex::<I>::default()), |(sum, mx), c| {let c_sqrt = c.scale(scale).norm_sqr().sqrt(); (sum + c_sqrt, std::cmp::max_by(mx, *c, |a: &Complex<I>, b: &Complex<I>| { a.norm_sqr().partial_cmp(&b.norm_sqr()).unwrap() }))});
                let complexes = self.out_data[pnote.1.clone()].iter().fold(
                    (I::default(), Complex::<I>::default(), I::default()),
                    |(sum, mx_c, mx_sqrt), c| {
                        let c_sqr = c.scale(scale).norm_sqr();
                        let c_sqrt = c_sqr.sqrt();
                        if c_sqrt > mx_sqrt {
                            (sum + c_sqr, *c, c_sqrt)
                        } else {
                            (sum + c_sqr, mx_c, mx_sqrt)
                        }
                    });
                pnote.2 = complexes.0.sqrt() * post_scale; // / I::from_usize(pnote.1.len()).unwrap();
                //pnote.3 = complexes.1.scale(scale).norm_sqr() * I::from_f32(2.0).unwrap();
                //pnote.2 = complexes.0.scale(scale).norm_sqr().sqrt();
                //pnote.3 = complexes.1.scale(scale).norm_sqr().sqrt();
                pnote.3 = complexes.2 * post_scale;
                //pnote.2 = self.out_data[pnote.1.clone()].iter().fold(I::default(), |acc, c| acc + (c.norm_sqr().sqrt()*scale))/*.sqrt()*/;
                //pnote.2 = self.out_data[pnote.1.clone()].iter().map(|c| c.norm_sqr()).max_by(|a, b| { a.partial_cmp(b).unwrap() });
            }
        }

        return updated;
    }
}
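// The fold in process_data reduces each note's bin range to two numbers: the square
// root of the summed, scaled power across the range, and the magnitude of the single
// strongest bin. The test below redoes that reduction on a tiny hand-built slice of
// complex values as a minimal sketch (leaving out the post_scale factor and not
// keeping the max Complex itself); the input values and the 0.5 scale are arbitrary
// and only chosen to keep the arithmetic easy to follow.
#[cfg(test)]
mod fold_sanity {
    use rustfft::num_complex::Complex;

    #[test]
    fn sum_and_max_over_a_bin_range() {
        let bins = [
            Complex::new(0.6f64, 0.0),
            Complex::new(0.0, 0.8),
            Complex::new(0.3, 0.4),
        ];
        let scale = 0.5f64;
        let (power_sum, max_mag) = bins.iter().fold((0.0f64, 0.0f64), |(sum, mx), c| {
            let p = c.scale(scale).norm_sqr();
            (sum + p, mx.max(p.sqrt()))
        });
        // scaled magnitudes are 0.3, 0.4 and 0.25, so the strongest bin reads 0.4
        assert!((max_mag - 0.4).abs() < 1e-12);
        // summed scaled power is 0.09 + 0.16 + 0.0625 = 0.3125
        assert!((power_sum.sqrt() - 0.3125f64.sqrt()).abs() < 1e-12);
    }
}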