import axios from 'axios';
import { Language, OnTranscriptEvent, RecordingState, SpeechToText } from '../speech-to-text';

/**
 * Configuration for the Whisper speech-to-text implementation.
 */
export type WhisperSpeechToTextConfig = {
  /** Callback invoked with the latest transcript text and a flag indicating whether trailing silence was detected. */
  onTranscript: OnTranscriptEvent;
  /** URL the recorded audio is POSTed to; query parameters are appended to it. */
  whisperEndpointUrl: string;
  /** Not referenced by the visible code; presumably a silence-detection tuning value — TODO confirm. */
  minDecibels: number;
  /** Not referenced by the visible code; presumably a silence-detection tuning value — TODO confirm. */
  silenceThreshold: number;
  /** Silence, in milliseconds, that must follow the last transcribed segment before the utterance is flagged as finished. */
  silenceDuration: number;
  /** Transcripts with at most this many words are treated as "short sentences". */
  shortSentenceLength: number;
  /** Multiplier applied to silenceDuration for short sentences. */
  shortSentenceSilenceFactor: number;
  /** Value passed to MediaRecorder.start(); controls how often audio chunks are delivered. */
  timeSlice: number;
  /** Sent as the `language` query parameter when defined. */
  language?: Language;
  /** Sent as the `task` query parameter when defined. */
  task?: string;
  /** NOTE(review): not read by the visible code — `word_timestamps` is always sent as false. */
  wordTimestamps?: boolean;
  /** Sent as the `initial_prompt` query parameter; a single space is sent when empty or undefined. */
  initialPrompt?: string;
  /** Sent as the `encode` query parameter when defined. */
  encodeAudio?: boolean;
  /** NOTE(review): not read by the visible code — `output` is always sent as 'json'. */
  outputFormat?: string;
  /** When set, the microphone with exactly this device id is requested. */
  deviceId?: string;
};

/**
 * Mutable state of one recording/transcription session, shared by the helper functions below.
 */
type WhisperContext = {
  /** Configuration the session was started with. */
  config: WhisperSpeechToTextConfig;
  /** Audio chunks received from the recorder since the last reset; all of them are re-sent on each request. */
  chunks: Blob[];
  /** Current recording state as tracked by this module. */
  recordingState: RecordingState;
  /** True while a transcription request is in flight, so that only one request runs at a time. */
  transcribing: boolean;
  /** Text of the most recent transcription response. */
  transcript: string;
  /** Recorder producing the audio chunks. */
  mediaRecorder: MediaRecorder;
  /** Microphone stream feeding the recorder; its tracks are stopped when the session is stopped. */
  mediaStream: MediaStream;
  /** Date.now() timestamp of the first non-empty chunk since the last reset, or undefined if none arrived yet. */
  startRecordingTime?: number;
};

// Container type; used to build the MIME type and as the extension of the uploaded file name.
const AUDIO_TYPE = 'mp4';
// Codec advertised in the MIME type of the uploaded blob/file.
const AUDIO_CODECS = 'aac';
// Used as the getUserMedia sampleRate constraint (only when a device id is given)
// and also as the MediaRecorder audioBitsPerSecond option.
const AUDIO_SAMPLE_RATE = 16000;
// MIME type requested from the MediaRecorder.
const AUDIO_MIME_TYPE = `audio/${AUDIO_TYPE}`;
// MIME type (including codec) attached to the audio sent to Whisper.
const AUDIO_MIME_TYPE_WITH_CODECS = `${AUDIO_MIME_TYPE};codecs=${AUDIO_CODECS}`;
// Rejection reason used by the controller when the recorder cannot be used.
const ERROR_MEDIA_RECORDER_UNAVAILABLE = 'The media recorder is unavailable. Please check your browser settings.';

/**
 * Count the whitespace-separated words in a string.
 *
 * Leading and trailing whitespace is ignored, and an empty or
 * whitespace-only string contains zero words.
 *
 * @param s The text to inspect.
 * @returns The number of words in the text.
 */
const countWords = (s: string): number => {
  const trimmed = s.trim();
  // Splitting an empty string would yield [''] (length 1), so handle it explicitly.
  return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
};

/**
 * Build a single `name=value` query-string fragment.
 *
 * @param urlParam The parameter name (emitted as-is, not encoded).
 * @param value The parameter value; it is URI-component encoded.
 * @returns The fragment, or undefined when the value is null or undefined.
 */
const encodeURLParameter = (urlParam: string, value: string | boolean | number | undefined | null): string | undefined => {
  if (value === null || value === undefined) {
    return undefined;
  }

  const encodedValue = encodeURIComponent(value);
  return urlParam + '=' + encodedValue;
};

/**
 * Upload an audio file to the configured Whisper endpoint and return the response body.
 *
 * The file is sent as the `audio_file` field of a multipart form; the transcription
 * options are passed as query parameters on the endpoint URL.
 *
 * @param wc The session context supplying the endpoint URL and options.
 * @param file The audio to transcribe.
 * @returns The response payload as returned by axios (`response.data`).
 */
const sendToWhisper = async (wc: WhisperContext, file: File): Promise<any> => {
  const { config } = wc;

  const formData = new FormData();
  formData.append('audio_file', file);

  // Collect only the query parameters that actually have a value.
  const queryParts: string[] = [];
  const addParameter = (name: string, value: string | boolean | number | undefined | null): void => {
    const encoded = encodeURLParameter(name, value);
    if (encoded !== undefined) {
      queryParts.push(encoded);
    }
  };

  addParameter('task', config.task);
  addParameter('language', config.language);
  addParameter('initial_prompt', config.initialPrompt || ' ');
  addParameter('encode', config.encodeAudio);
  addParameter('vad_filter', true);
  addParameter('word_timestamps', false);
  addParameter('output', 'json');

  const queryString = queryParts.length === 0 ? '' : '?' + queryParts.join('&');

  // Submit to Whisper and hand back the body only.
  const response = await axios.post(config.whisperEndpointUrl + queryString, formData, {
    headers: {
      'Content-Type': 'multipart/form-data',
    },
  });
  return response.data;
};

// Detach the chunk handler and clear all per-recording state held in the context.
// The media stream tracks are intentionally left running here.
const resetMediaStream = (wc: WhisperContext): void => {
  // Stop receiving chunks first.
  wc.mediaRecorder.ondataavailable = null;

  // Drop buffered audio in place so the same array instance is kept.
  wc.chunks.splice(0, wc.chunks.length);

  // Clear transcription bookkeeping.
  wc.startRecordingTime = undefined;
  wc.transcribing = false;
  wc.transcript = '';
};

/**
 * Handle an audio chunk from the recorder and, when possible, request a new transcription.
 *
 * Every non-empty chunk is buffered. While the session is recording and no request is
 * already in flight, all buffered audio is sent to Whisper. When the response arrives the
 * configured `onTranscript` callback is invoked with the text and a flag telling whether
 * enough silence followed the last transcribed segment.
 *
 * The in-flight flag is always cleared when the request settles (success, empty response
 * or failure) so that one failed request cannot block all later transcriptions.
 *
 * @param wc The session context.
 * @param chunk The recorder event carrying the audio data, if any.
 */
const onDataAvailable = (wc: WhisperContext, chunk?: BlobEvent): void => {

  // Capture chunk even if we can't process at this time.
  if (chunk?.data && chunk.data.size > 0) {
    if (wc.startRecordingTime === undefined) {
      wc.startRecordingTime = Date.now();
    }
    wc.chunks.push(chunk.data);
  }

  // Only one request at a time, and only while recording with audio available.
  if (wc.recordingState !== RecordingState.RECORDING || wc.chunks.length === 0 || wc.transcribing) {
    return;
  }

  // Build audio file to submit: everything recorded so far.
  const file = new File([new Blob(wc.chunks, {
      type: AUDIO_MIME_TYPE_WITH_CODECS,
    })],
    `speech.${AUDIO_TYPE}`, {
      type: AUDIO_MIME_TYPE_WITH_CODECS,
    },
  );

  wc.transcribing = true;
  const whenDispatchedTime = Date.now();

  sendToWhisper(wc, file)
    .then((data: any) => {
      // Release the in-flight flag before anything else so a bad response cannot wedge the session.
      wc.transcribing = false;
      if (!data) return; // Bad response.

      wc.transcript = data.text;
      const segments = data.segments;
      const startTime = wc.startRecordingTime;

      // If the last segment is within silenceDuration of the elapsed time.
      if (startTime !== undefined && segments && segments.length > 0) {
        const duration = whenDispatchedTime - startTime;
        const lastSegment = segments[segments.length - 1];
        // Segments are either objects with an `end` field or arrays with the end time at index 3 (seconds).
        const lastTime = (lastSegment.end) ? lastSegment.end * 1000 : lastSegment[3] * 1000;
        // Short sentences scale the required silence by the configured factor.
        const silenceFactor = (countWords(wc.transcript) <= wc.config.shortSentenceLength) ? wc.config.shortSentenceSilenceFactor : 1;
        wc.config.onTranscript(wc.transcript, ((lastTime + (silenceFactor * wc.config.silenceDuration)) < duration));
      } else {
        wc.config.onTranscript(wc.transcript, false);
      }
    })
    .catch((e: unknown) => {
      // A failed request must not leave the session stuck in the transcribing state.
      wc.transcribing = false;
      console.error('Whisper transcription failed:', e);
    });
};

/**
 * Request an audio-only media stream from the browser.
 *
 * @param deviceId Optional input device; when given, exactly that device is requested
 *                 together with the module's sample-rate constraint. Otherwise the
 *                 browser's default audio input is used.
 * @returns A promise for the media stream.
 */
const getMediaStream = (deviceId?: string): Promise<MediaStream> => {
  let audio: boolean | MediaTrackConstraints = true;

  if (deviceId) {
    audio = {
      deviceId: { exact: deviceId },
      sampleRate: AUDIO_SAMPLE_RATE,
    };
  }

  return navigator.mediaDevices.getUserMedia({ audio });
};

// Start a new whisper recording and transcription.
// A context is provided giving access to the audio settings.
//
// The returned promise resolves once the recorder has emitted its start event.
// It rejects if the microphone cannot be acquired, if the recorder cannot be
// created or started, or if the recorder reports an error before starting.
// On any failure after the microphone was acquired, its tracks are stopped so
// the device is released again.
const startWhisper = async (config: WhisperSpeechToTextConfig): Promise<WhisperContext> => {
  const mediaStream = await getMediaStream(config.deviceId);

  try {
    const mediaRecorder = new MediaRecorder(mediaStream, {
      mimeType: AUDIO_MIME_TYPE,
      audioBitsPerSecond: AUDIO_SAMPLE_RATE
    });

    return await new Promise<WhisperContext>((resolve, reject) => {
      // Fail the start instead of waiting forever if the recorder errors before starting.
      mediaRecorder.onerror = (event: Event) => {
        mediaRecorder.onstart = null;
        mediaRecorder.onerror = null;
        const cause = (event as Event & { error?: unknown }).error;
        reject(cause ?? new Error(ERROR_MEDIA_RECORDER_UNAVAILABLE));
      };

      // Wait on the start event to trigger before we resolve the promise.
      mediaRecorder.onstart = () => {
        const wc: WhisperContext = {
          chunks: [],
          recordingState: RecordingState.RECORDING,
          transcribing: false,
          transcript: '',
          mediaRecorder,
          mediaStream,
          config,
          startRecordingTime: undefined
        };

        mediaRecorder.ondataavailable = (chunk) => onDataAvailable(wc, chunk);

        mediaRecorder.onstart = null;
        mediaRecorder.onerror = null;
        resolve(wc);
      };

      mediaRecorder.start(config.timeSlice);
    });
  } catch (e) {
    // Release the microphone; nobody else holds a reference to this stream.
    mediaStream.getTracks().forEach(track => track.stop());
    throw e;
  }
};

// Stop a whisper recording and transcription.
//
// The session state is reset and the media stream tracks are stopped in every case,
// so the microphone is always released.
// - While recording, the promise resolves only after the recorder's stop event.
// - Otherwise (e.g. paused) this module has already stopped the recorder, so no stop
//   event is awaited: the stream is released immediately and the promise resolves.
const stopWhisper = (wc: WhisperContext): Promise<void> => {
  return new Promise((resolve, reject) => {
    // Clear the session state and release the microphone.
    const release = (): void => {
      resetMediaStream(wc);
      wc.mediaStream.getTracks().forEach(track => track.stop());
    };

    try {
      switch (wc.recordingState) {
        case RecordingState.UNAVAILABLE:
        case RecordingState.RECORDING:
          wc.mediaRecorder.onstop = () => {
            release();
            wc.mediaRecorder.onstop = null;

            resolve();
          };
          wc.recordingState = RecordingState.NOT_RECORDING;
          wc.mediaRecorder.stop();
          break;

        default:
          // Any onstop handler installed by an in-flight pause/stop is left untouched
          // so that its promise can still settle.
          wc.recordingState = RecordingState.NOT_RECORDING;
          release();
          resolve();
      }

    } catch (e) {
      reject(e);
    }
  });
};

// Pause the whisper recording by stopping the recorder while keeping the media stream.
// The session is marked as paused immediately; the promise resolves once the recorder's
// stop event has fired and the buffered state has been reset.
const pauseWhisper = (wc: WhisperContext): Promise<void> => {
  return new Promise<void>((resolve, reject) => {
    // Runs when the recorder reports that it has stopped.
    const handleStopped = (): void => {
      wc.recordingState = RecordingState.PAUSED;
      wc.mediaRecorder.onstop = null;

      resetMediaStream(wc);
      resolve();
    };

    try {
      wc.mediaRecorder.onstop = handleStopped;

      wc.recordingState = RecordingState.PAUSED;
      wc.mediaRecorder.stop();
    } catch (e) {
      reject(e);
    }
  });
};

// Resume a paused whisper recording. The buffered state is cleared, the recorder is
// restarted, and once its start event fires the chunk handler is re-attached and the
// session is marked as recording again.
const resumeWhisper = (wc: WhisperContext): Promise<void> => {
  return new Promise<void>((resolve, reject) => {
    // Runs when the recorder reports that it has started.
    const handleStarted = (): void => {
      wc.recordingState = RecordingState.RECORDING;
      wc.mediaRecorder.ondataavailable = (chunk) => onDataAvailable(wc, chunk);

      wc.mediaRecorder.onstart = null;

      resolve();
    };

    try {
      // Wait on the start event to trigger before we resolve the promise.
      wc.mediaRecorder.onstart = handleStarted;

      resetMediaStream(wc);
      wc.mediaRecorder.start(wc.config.timeSlice);
    } catch (e) {
      reject(e);
    }
  });
};

/**
 * A Whisper implementation of speech to text transcription.
 *
 * This includes real-time updates of transcription and silence detection.
 *
 * The controller methods in this class manage the state of the recording and transcription,
 * the functions above are the methods that actually control the interaction.
 *
 * The current state is defaulted with `??` rather than `||` so that a recording-state
 * enum member with a falsy runtime value is never mistaken for "no state".
 *
 * @author Jason Waring
 */
export class WhisperSpeechToText implements SpeechToText {
  private config: WhisperSpeechToTextConfig;
  private context?: WhisperContext;

  /**
   * @param config Recording and transcription settings used for every session.
   */
  constructor(config: WhisperSpeechToTextConfig) {
    this.config = config;
  }

  /**
   * Start a new recording session, stopping any existing one first.
   *
   * Rejects with the error's message string when stopping the previous session
   * fails; failures while starting the new session propagate unchanged.
   */
  async start(): Promise<void> {
    // Stop any existing recording.
    if (this.context) {
      try {
        await stopWhisper(this.context);
        this.context = undefined;
      } catch (e: any) {
        return Promise.reject(e.message);
      }
    }

    // Start Whisper
    this.context = await startWhisper(this.config);
  }

  /**
   * Pause the current recording. Does nothing when there is no session, or when the
   * session is not recording or already paused. Rejects when the recorder is unavailable.
   */
  async pause(): Promise<void> {
    const wc = this.context;
    switch (wc?.recordingState ?? RecordingState.NOT_RECORDING) {
      case RecordingState.NOT_RECORDING:
      case RecordingState.PAUSED:
        return Promise.resolve();

      case RecordingState.RECORDING:
        return (wc) ? pauseWhisper(wc) : Promise.reject(ERROR_MEDIA_RECORDER_UNAVAILABLE);

      case RecordingState.UNAVAILABLE:
      default:
        return Promise.reject(ERROR_MEDIA_RECORDER_UNAVAILABLE);
    }
  }

  /**
   * Resume a paused (or not recording) session. Does nothing when already recording.
   * Rejects when there is no session or the recorder is unavailable.
   */
  async resume(): Promise<void> {
    const wc = this.context;
    switch (wc?.recordingState ?? RecordingState.NOT_RECORDING) {
      case RecordingState.NOT_RECORDING:
      case RecordingState.PAUSED:
        return (wc) ? resumeWhisper(wc) : Promise.reject(ERROR_MEDIA_RECORDER_UNAVAILABLE);

      case RecordingState.RECORDING:
        return Promise.resolve();

      case RecordingState.UNAVAILABLE:
      default:
        return Promise.reject(ERROR_MEDIA_RECORDER_UNAVAILABLE);
    }
  }

  /**
   * Stop the current session, if any, and forget its context once it has stopped.
   * Failures while stopping propagate unchanged and leave the context in place.
   */
  async stop(): Promise<void> {
    const wc = this.context;
    if (!wc) {
      return;
    }

    await stopWhisper(wc);
    this.context = undefined;
  }

  /**
   * @returns The state of the current session, or UNAVAILABLE when there is no session.
   */
  getState(): Promise<RecordingState> {
    return Promise.resolve(this.context?.recordingState ?? RecordingState.UNAVAILABLE);
  }

  /**
   * Replace the transcript callback. The config object is shared with any running
   * session, so the new handler is used for subsequent transcripts.
   */
  onTranscript(transcriptionHandler: OnTranscriptEvent) {
    this.config.onTranscript = transcriptionHandler;
  }
}
