// Multi-Method Audio Transcriber

// Name: Multi-Method Audio Transcriber
// Description: Transcribe audio/video via local Whisper or Deepgram with auto ffmpeg extraction, batch, and save/copy options.
// Author: dallascrilley
// GitHub: dallascrilley

import "@johnlindquist/kit"

/**
 * Transcription backend the user can request.
 * "auto" is not a backend itself: it is resolved to "whisper" or "deepgram"
 * before any file is processed.
 */
type Method = "auto" | "whisper" | "deepgram"

/** Lower-case file extensions (without the leading dot) accepted as audio input. */
const audioExts = new Set<string>([
  "mp3", "wav", "m4a", "flac", "aac", "ogg", "oga", "opus", "wma",
  "aiff", "aif", "alac", "amr", "midi", "mid", "caf", "ra", "rm",
  "mp2", "mka", "mp1", "aifc", "pcm", "wavpack", "wv", "ape",
])

/** Lower-case file extensions (without the leading dot) accepted as video input. */
const videoExts = new Set<string>([
  "mp4", "mov", "avi", "mkv", "webm", "wmv", "flv", "m4v",
  "mpg", "mpeg", "3gp", "3g2", "mts", "m2ts", "ts", "vob",
  "ogv", "mxf", "rmvb", "divx", "asf", "f4v", "mpe",
])

/**
 * Guards against a missing ffmpeg binary.
 *
 * @throws Error when `which("ffmpeg")` yields a falsy result.
 */
const ensureFfmpeg = () => {
  const ffmpegLocation = which("ffmpeg")
  if (ffmpegLocation) return
  throw new Error("ffmpeg not found. Please install ffmpeg and ensure it's on your PATH.")
}

/**
 * Prompts for the transcription method.
 *
 * @returns The `value` of the selected choice, typed as a `Method`.
 */
const chooseMethod = async (): Promise<Method> => {
  const methodChoices = [
    { name: "Auto (Prefer Local Whisper, fallback to Deepgram)", value: "auto" },
    { name: "Whisper (Local)", value: "whisper" },
    { name: "Deepgram (API)", value: "deepgram" },
  ]
  const selected = await arg("Choose transcription method", methodChoices)
  return selected as Method
}

/** Lower-cased extension of `p` without the leading dot ("" when there is none). */
const getExt = (p: string) => {
  // extname yields either "" or a string starting with ".", so dropping
  // the first character removes exactly that dot.
  const withDot = path.extname(p)
  return withDot.slice(1).toLowerCase()
}

/** True when the extension of `p` is listed in `audioExts`. */
const isAudio = (p: string) => {
  const ext = getExt(p)
  return audioExts.has(ext)
}

/** True when the extension of `p` is listed in `videoExts`. */
const isVideo = (p: string) => {
  const ext = getExt(p)
  return videoExts.has(ext)
}

/**
 * Converts (or extracts the audio track of) `inputPath` into a 128 kbps mp3
 * placed in the temp directory.
 *
 * @param inputPath Path of the audio/video file to convert.
 * @returns Path of the generated mp3; the caller is responsible for deleting it.
 * @throws Error when ffmpeg is missing, or whatever `exec` rejects with when
 *   the conversion fails (any partial output is removed first).
 */
const toMp3_128 = async (inputPath: string) => {
  ensureFfmpeg()

  // JSON.stringify produces a double-quoted string, inside which a POSIX shell
  // still expands `$`, backticks and backslashes (e.g. "Price $5.mp3").
  // Single quotes disable every expansion; an embedded single quote is written
  // as '\''. The previous quoting is kept on Windows, where single quotes are
  // not a quoting character.
  // NOTE(review): assumes `exec` runs the command string through a shell.
  const quoteArg = (value: string) =>
    process.platform === "win32"
      ? JSON.stringify(value)
      : `'${value.replace(/'/g, `'\\''`)}'`

  const stem = path.basename(inputPath, path.extname(inputPath))
  const out = tmpPath(`${stem}-${uuid().slice(0, 8)}.mp3`)
  const cmd = `ffmpeg -y -i ${quoteArg(inputPath)} -vn -acodec libmp3lame -b:a 128k ${quoteArg(out)}`

  try {
    await exec(cmd)
  } catch (err) {
    // Do not leave a truncated mp3 behind when the conversion fails.
    await remove(out).catch(() => {})
    throw err
  }
  return out
}

/**
 * Uploads an mp3 to Deepgram's `/v1/listen` endpoint and extracts the text.
 *
 * @param mp3Path Path of the mp3 file whose bytes are sent as the request body.
 * @param apiKey Deepgram API key, sent as a `Token` authorization header.
 * @returns The trimmed transcript: the first alternative of the first channel
 *   when non-empty, otherwise the utterance transcripts joined by newlines,
 *   otherwise an empty string.
 */
const deepgramTranscribe = async (mp3Path: string, apiKey: string) => {
  const endpoint = `https://api.deepgram.com/v1/listen?model=nova-3&smart_format=true`
  const audioBytes = await readFile(mp3Path)
  const requestOptions = {
    headers: {
      Authorization: `Token ${apiKey}`,
      "Content-Type": "audio/mp3",
    },
    maxBodyLength: Infinity,
  }
  const response = await post(endpoint, audioBytes, requestOptions)
  const payload = response.data

  const primary = payload?.results?.channels?.[0]?.alternatives?.[0]?.transcript
  if (primary) {
    return String(primary).trim()
  }

  // Fall back to per-utterance text when the channel transcript is empty.
  const fromUtterances = payload?.results?.utterances
    ?.map((u: any) => u?.transcript)
    .filter(Boolean)
    .join("\n")
  return String(fromUtterances || "").trim()
}

/**
 * Runs the local `whisper` CLI on an mp3 and returns the transcript text.
 *
 * Whisper writes its `.txt` result into a fresh temp directory, which is
 * removed again before this function returns or throws.
 *
 * @param mp3Path Path of the mp3 file to transcribe.
 * @param model Whisper model name passed to `--model`.
 * @returns The trimmed content of the generated `.txt` file.
 * @throws Error when the `whisper` binary is not found or no `.txt` output
 *   was produced; also propagates whatever `exec` rejects with.
 */
const whisperTranscribe = async (mp3Path: string, model: string) => {
  const whisperBin = which("whisper")
  if (!whisperBin) throw new Error("Local 'whisper' CLI not found.")

  // JSON.stringify is not shell quoting: inside double quotes a POSIX shell
  // still expands `$`, backticks and backslashes. Single quotes disable every
  // expansion; the previous quoting is kept on Windows.
  // NOTE(review): assumes `exec` runs the command string through a shell.
  const quoteArg = (value: string) =>
    process.platform === "win32"
      ? JSON.stringify(value)
      : `'${value.replace(/'/g, `'\\''`)}'`

  const outDir = tmpPath(`whisper-${uuid().slice(0, 8)}`)
  await ensureDir(outDir)

  try {
    const cmd = [
      quoteArg(String(whisperBin)),
      quoteArg(mp3Path),
      "--model",
      quoteArg(model),
      "--output_format txt",
      "--output_dir",
      quoteArg(outDir),
    ].join(" ")
    await exec(cmd)

    // Glob relative to outDir via `cwd` so that neither path separators nor
    // glob metacharacters in the directory or file name end up in the pattern.
    const txtFiles: string[] = await globby("*.txt", { cwd: outDir, absolute: true })

    // The output may be named after the file stem ("talk.txt") or the full
    // file name ("talk.mp3.txt"); prefer either, else take any .txt present.
    const fullName = path.basename(mp3Path)
    const stem = path.basename(mp3Path, path.extname(mp3Path))
    const expectedNames = new Set([`${stem}.txt`, `${fullName}.txt`])
    const txtPath =
      txtFiles.find(candidate => expectedNames.has(path.basename(candidate))) ?? txtFiles[0]

    if (!txtPath) throw new Error("Whisper output .txt not found.")
    const txt = await readFile(txtPath, "utf8")
    return txt.trim()
  } finally {
    // Best-effort cleanup; a failure to delete must not mask the real result.
    await remove(outDir).catch(() => {})
  }
}

/**
 * Shows a drop zone and returns the paths of the dropped files.
 *
 * Shows a message and calls `exit()` when text was dropped instead of files,
 * or when no dropped entry carries a `path`.
 */
const pickFiles = async (): Promise<string[]> => {
  const dropped = await drop({
    placeholder: "Drop audio/video files to transcribe",
    enter: "Use Dropped Files",
  })

  const droppedText = typeof dropped === "string"
  if (droppedText) {
    await div(md(`Please drop files, not text.`))
    exit()
  }

  // Keep only entries that actually carry a path.
  const candidatePaths = dropped?.map((info: any) => info?.path)
  const paths = candidatePaths?.filter(Boolean) || []

  if (paths.length === 0) {
    await div(md(`No files dropped.`))
    exit()
  }
  return paths
}

/**
 * Reads the Whisper model name from the `WHISPER_MODEL` env value.
 *
 * @returns The trimmed model name, or "base" when the value is missing or blank.
 */
const pickWhisperModel = async () => {
  const entered = await env("WHISPER_MODEL", {
    placeholder: "Whisper model name (e.g., base, small, medium, large-v3)",
    hint: "Press enter for default 'base'",
  })
  const model = entered?.trim()
  if (model) return model
  return "base"
}

/**
 * Reads the Deepgram API key from the `DEEPGRAM_API_KEY` env value.
 *
 * @returns The trimmed, non-empty API key.
 * @throws Error when the value is missing or blank, so the problem surfaces
 *   once up front instead of as a failed request for every file.
 */
const ensureDeepgramKey = async () => {
  const key = await env("DEEPGRAM_API_KEY", {
    secret: true,
    placeholder: "Enter your Deepgram API Key",
    hint: "Required for Deepgram transcription",
  })
  // Tolerate a missing value instead of crashing on `.trim()` of undefined.
  const trimmed = (key ?? "").trim()
  if (!trimmed) {
    throw new Error("Deepgram API key is required for Deepgram transcription.")
  }
  return trimmed
}

/** Resolves to true when `which("whisper")` returns a truthy result. */
const preferWhisper = async () => {
  const located = which("whisper")
  return Boolean(located)
}

/**
 * Turns the requested method into the concrete backend to use.
 *
 * - "deepgram" is returned as-is.
 * - "whisper" requires the local CLI to be available.
 * - anything else ("auto") picks whisper when available, otherwise deepgram.
 *
 * @throws Error when "whisper" is requested but not available.
 */
const resolveMethod = async (method: Method) => {
  switch (method) {
    case "deepgram":
      return "deepgram"
    case "whisper":
      if (await preferWhisper()) return "whisper"
      throw new Error("Local Whisper not available.")
    default:
      // auto
      return (await preferWhisper()) ? "whisper" : "deepgram"
  }
}

/**
 * Shows a "Transcribing current/total" panel for the file being processed.
 *
 * @param current 1-based position of the file in the batch.
 * @param total Number of files in the batch.
 * @param fileName Path of the file; only its base name is displayed.
 */
const updateProgress = async (current: number, total: number, fileName: string) => {
  const panelLines = [
    `### Transcribing ${current}/${total}`,
    `- File: ${path.basename(fileName)}`,
    `- Please wait...`,
  ]
  const panelHtml = md(panelLines.join("\n"))
  await setPanel(panelHtml)
}

/**
 * Renders the batch outcome as markdown: success/failure counts, followed by
 * one line per failed file with the first line of its error message.
 *
 * With no failures the output is just the heading and the two counts.
 *
 * @param results One entry per processed file.
 * @returns Whatever `md` produces for the assembled markdown.
 */
const summarizeResults = (results: { file: string; ok: boolean; error?: string }[]) => {
  const failures = results.filter(r => !r.ok)
  const ok = results.length - failures.length

  const lines = ["### Done", `- Success: ${ok}`, `- Failed: ${failures.length}`]

  if (failures.length) {
    lines.push("", "#### Failures")
    for (const { file, error } of failures) {
      // Error messages can span many lines; only the first non-blank line is
      // shown to keep the summary readable.
      const reason =
        (error ?? "")
          .split("\n")
          .map(line => line.trim())
          .find(line => line.length > 0) ?? "Unknown error"
      lines.push(`- \`${file}\`: ${reason}`)
    }
  }

  return md(lines.join("\n"))
}

/**
 * Interactive entry point.
 *
 * Collects dropped files, resolves the transcription backend, transcribes the
 * files one after another, then copies the combined transcript to the
 * clipboard, optionally saves one `.txt` per input, and optionally opens the
 * combined text in the editor.
 *
 * A failure on one file is recorded in the results and does not stop the
 * batch. When no file could be transcribed, only the summary is shown.
 */
const main = async () => {
  const inputPaths = await pickFiles()
  const methodChoice = await chooseMethod()
  let methodResolved: Method
  try {
    methodResolved = await resolveMethod(methodChoice)
  } catch (e: unknown) {
    const reason = e instanceof Error && e.message ? e.message : String(e)
    await div(md(`Error resolving method: ${reason}`))
    exit()
    return
  }

  let whisperModel = "base"
  let deepgramKey = ""

  if (methodResolved === "whisper") {
    whisperModel = await pickWhisperModel()
  } else {
    deepgramKey = await ensureDeepgramKey()
  }

  const transcripts: Record<string, string> = {}
  const results: { file: string; ok: boolean; error?: string }[] = []

  await setPanel(md(`Preparing files...`))

  for (const [index, filePath] of inputPaths.entries()) {
    // Tracked outside the try so the temp file is removed on failure too.
    let mp3: string | undefined
    try {
      if (!(isAudio(filePath) || isVideo(filePath))) {
        throw new Error(`Unsupported format: ${path.extname(filePath) || "unknown"}`)
      }

      await updateProgress(index + 1, inputPaths.length, filePath)

      // Always convert/extract to 128kbps mp3 for compatibility
      mp3 = await toMp3_128(filePath)

      const text =
        methodResolved === "whisper"
          ? await whisperTranscribe(mp3, whisperModel)
          : await deepgramTranscribe(mp3, deepgramKey)

      transcripts[filePath] = text
      results.push({ file: filePath, ok: true })
    } catch (err: unknown) {
      const message = err instanceof Error && err.message ? err.message : String(err)
      results.push({ file: filePath, ok: false, error: message })
    } finally {
      if (mp3) await remove(mp3).catch(() => {})
    }
  }

  const transcriptEntries = Object.entries(transcripts)

  // Nothing to copy, save or view: show the summary and stop.
  if (transcriptEntries.length === 0) {
    await div(summarizeResults(results))
    return
  }

  await setPanel(summarizeResults(results))

  const combined = transcriptEntries
    .map(([file, text]) => `# ${path.basename(file)}\n\n${text}\n`)
    .join("\n")

  await copy(combined)
  await notify(`Transcripts copied to clipboard`)

  // Offer to save individual transcript files
  const saveChoice = await arg("Save transcripts as .txt files?", ["Yes", "No"])
  if (saveChoice === "Yes") {
    const folder = await selectFolder("Select folder to save transcripts")
    // An empty result means the picker was dismissed; skip saving.
    if (folder) {
      // Inputs such as "talk.mp3" and "talk.mp4" share a base name; suffix
      // later ones so they do not overwrite each other. Compared
      // case-insensitively because common file systems are.
      const usedNames = new Set<string>()
      for (const [file, text] of transcriptEntries) {
        const base = path.basename(file, path.extname(file))
        let outName = `${base}.txt`
        for (let n = 2; usedNames.has(outName.toLowerCase()); n++) {
          outName = `${base}-${n}.txt`
        }
        usedNames.add(outName.toLowerCase())
        await writeFile(path.join(folder, outName), text, "utf8")
      }
      await notify(`Saved ${transcriptEntries.length} transcript(s)`)
      await revealInFinder(folder)
    }
  }

  // Show a quick summary and allow viewing combined
  const view = await arg(
    {
      placeholder: "View results?",
      enter: "Open summary",
      strict: true,
    },
    [
      { name: "Open Combined Transcript", value: "open" },
      { name: "Close", value: "close" },
    ]
  )

  if (view === "open") {
    await editor(combined)
  }
}

// Script entry: run the workflow and show any uncaught error in a panel.
try {
  await main()
} catch (error: any) {
  const message = error?.message || String(error)
  const errorMarkdown = `## Error\n${message}`
  await div(md(errorMarkdown))
}