add tesseract ocr

2026-04-13 19:40:25 -04:00
parent 1350a6f94b
commit 9b2690f639
7 changed files with 299 additions and 9 deletions
--- a/src/lib/ai-tagger.ts
+++ b/src/lib/ai-tagger.ts
@@ -4,7 +4,7 @@ import type { Library, Tag, TagCategory } from '@/types'
 import { getDb } from './db'
 import { getAiConfig, getEffectiveAiConfig, getPreferredLanguage } from './app-settings'
 import { getTags, getCategories, addTagToItem, getActiveCategoryIdsForLibrary, getResolvedTagsForItem } from './tags'
-import { getAiImagePath, getVideoFramePaths } from './thumbnails'
+import { getAiImagePath, getOcrImagePath, getVideoFramePaths } from './thumbnails'
 import { findFile } from './media-utils'
 import { getLibrary, resolveLibraryRoot } from './libraries'

@@ -509,7 +509,31 @@ export async function generateItemDescription(itemKey: string): Promise<string>
 // ─── Text extraction ─────────────────────────────────────────────────────────

 /**
- * Extract text (OCR) from an image using the vision model.
+ * Run Tesseract OCR on a preprocessed image file.
+ * Returns the extracted text and a mean confidence score (0–100).
+ * A confidence of 0 with empty text means no recognisable text was found.
+ */
+async function extractWithTesseract(
+  imagePath: string,
+  languages: string,
+): Promise<{ text: string; confidence: number }> {
+  // `languages` comes from a free-text setting. Accept '+', ',' or whitespace
+  // between codes ("eng+deu", "eng, deu", "eng deu") and hand Tesseract a clean
+  // list; a blank value falls back to English instead of failing worker start-up.
+  const langCodes = languages.split(/[\s,+]+/).filter(Boolean)
+  if (langCodes.length === 0) langCodes.push('eng')
+
+  // Loaded lazily so tesseract.js is only pulled in when local OCR is actually used.
+  const { createWorker } = await import('tesseract.js')
+  // The worker script is addressed explicitly inside node_modules, relative to the
+  // process working directory.
+  // NOTE(review): presumably needed because bundling breaks the default lookup — confirm.
+  const workerPath = path.join(process.cwd(), 'node_modules/tesseract.js/src/worker-script/node/index.js')
+  // Second argument is the OCR engine mode passed through to tesseract.js.
+  const worker = await createWorker(langCodes, 1, { workerPath })
+  try {
+    const { data } = await worker.recognize(imagePath)
+    return { text: data.text.trim(), confidence: data.confidence }
+  } finally {
+    // Release the worker whether recognition succeeded or threw.
+    await worker.terminate()
+  }
+}
+
+/**
+ * Extract text (OCR) from an image using the configured OCR mode:
+ *  - hybrid:    try Tesseract first; fall back to LLM if confidence is below threshold
+ *  - tesseract: local Tesseract only, no LLM call
+ *  - llm:       LLM vision API only (original behaviour)
+ *
 * Only works for images in mixed libraries.
 * Translation is not performed automatically — call translateItemText() separately.
 * Returns { extractedText, translatedText } where translatedText is always null.
@@ -517,10 +541,6 @@ export async function generateItemDescription(itemKey: string): Promise<string>
 export async function extractItemText(itemKey: string): Promise<{ extractedText: string; translatedText: string | null }> {
  const libraryId = itemKey.split(':')[0]
  const config = getEffectiveAiConfig(libraryId)
-  const extractModel = config.modelExtract || config.model
-  if (!config.endpoint || !extractModel) {
-    throw Object.assign(new Error('AI endpoint and model are not configured'), { code: 'NOT_CONFIGURED' })
-  }

  const db = getDb()
  const item = db
@@ -547,12 +567,39 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText:
    throw Object.assign(new Error('Text extraction is only available for images'), { code: 'NO_IMAGE' })
  }

+  const { ocrMode, ocrLanguages, ocrConfidenceThreshold } = config
+
+  // ── Tesseract path ────────────────────────────────────────────────────────
+  if (ocrMode === 'tesseract' || ocrMode === 'hybrid') {
+    const ocrImagePath = await getOcrImagePath(resolvedMedia.path, libraryId)
+    const { text, confidence } = await extractWithTesseract(ocrImagePath, ocrLanguages)
+
+    const useTesseractResult = ocrMode === 'tesseract' || confidence >= ocrConfidenceThreshold
+    if (useTesseractResult) {
+      console.log(`[ocr] tesseract used for ${itemKey} (confidence=${confidence}, mode=${ocrMode})`)
+      if (!text) {
+        db.prepare('UPDATE media_items SET extracted_text = NULL, extracted_text_translated = NULL WHERE item_key = ?').run(itemKey)
+        return { extractedText: '', translatedText: null }
+      }
+      db.prepare('UPDATE media_items SET extracted_text = ?, extracted_text_translated = NULL WHERE item_key = ?').run(text, itemKey)
+      return { extractedText: text, translatedText: null }
+    }
+    console.log(`[ocr] tesseract confidence too low (${confidence} < ${ocrConfidenceThreshold}), falling back to LLM for ${itemKey}`)
+  }
+
+  // ── LLM vision path ───────────────────────────────────────────────────────
+  const extractModel = config.modelExtract || config.model
+  if (!config.endpoint || !extractModel) {
+    throw Object.assign(new Error('AI endpoint and model are not configured'), { code: 'NOT_CONFIGURED' })
+  }
+
  const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId)
  const base64Images = [fs.readFileSync(thumbnailPath, 'base64')]

  const customInstruction = config.promptExtract ? ' ' + config.promptExtract : ''
  const systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction} If there is no text in the image, respond with exactly: [NO TEXT]`

+  console.log(`[ocr] llm used for ${itemKey} (mode=${ocrMode})`)
  const extractedText = await callVisionApiText(config.endpoint, extractModel, base64Images, systemPrompt, config.maxTokensExtract)

  if (!extractedText || extractedText === '[NO TEXT]') {
--- a/src/lib/app-settings.ts
+++ b/src/lib/app-settings.ts
@@ -46,6 +46,8 @@ const DEFAULT_PROMPT_EXTRACT =
  'Be mindful of different colors of text that may indicate different speakers or emphasis.'
 const DEFAULT_PROMPT_TRANSLATE = 'Return ONLY the translated text with no additional commentary.'

+export type OcrMode = 'hybrid' | 'tesseract' | 'llm' // hybrid: Tesseract first, LLM fallback when confidence is below the threshold; tesseract: local Tesseract only; llm: LLM vision API only
+
 export interface AiConfig {
  endpoint: string
  model: string
@@ -62,6 +64,9 @@ export interface AiConfig {
  maxTokensDescribe: number
  maxTokensExtract: number
  maxTokensTranslate: number
+  ocrMode: OcrMode
+  ocrLanguages: string
+  ocrConfidenceThreshold: number
 }

 export function getAiConfig(): AiConfig {
@@ -84,10 +89,15 @@ export function getAiConfig(): AiConfig {
  const maxTokensDescribe = parseInt(getSetting('ai_max_tokens_describe') ?? '8192', 10) || 8192
  const maxTokensExtract = parseInt(getSetting('ai_max_tokens_extract') ?? '8192', 10) || 8192
  const maxTokensTranslate = parseInt(getSetting('ai_max_tokens_translate') ?? '8192', 10) || 8192
+  const rawOcrMode = getSetting('ai_ocr_mode') ?? 'hybrid'
+  // Any stored value other than the two explicit alternatives is treated as 'hybrid'.
+  const ocrMode: OcrMode = rawOcrMode === 'tesseract' || rawOcrMode === 'llm' ? rawOcrMode : 'hybrid'
+  // A blank language setting falls back to English, matching what updateAiConfig() stores.
+  const ocrLanguages = getSetting('ai_ocr_languages')?.trim() || 'eng'
+  // 0 is a legitimate threshold (updateAiConfig() stores any integer in 0–100), so the
+  // default must only apply when the setting is missing or unparsable — `|| 70` would
+  // silently turn a stored 0 back into 70.
+  const parsedOcrThreshold = parseInt(getSetting('ai_ocr_confidence_threshold') ?? '', 10)
+  const ocrConfidenceThreshold = Number.isNaN(parsedOcrThreshold) ? 70 : Math.min(100, Math.max(0, parsedOcrThreshold))
  return {
    endpoint, model, modelTagging, modelDescribe, modelExtract, modelTranslate, enabled,
    promptDescribe, promptTagger, promptExtract, promptTranslate,
    maxTokensTag, maxTokensDescribe, maxTokensExtract, maxTokensTranslate,
+    ocrMode, ocrLanguages, ocrConfidenceThreshold,
  }
 }

@@ -107,6 +117,9 @@ export function updateAiConfig(
  maxTokensDescribe?: number,
  maxTokensExtract?: number,
  maxTokensTranslate?: number,
+  ocrMode?: OcrMode,
+  ocrLanguages?: string,
+  ocrConfidenceThreshold?: number,
 ): void {
  setSetting('ai_endpoint', endpoint)
  setSetting('ai_model', model)
@@ -123,6 +136,9 @@ export function updateAiConfig(
  if (maxTokensDescribe !== undefined) setSetting('ai_max_tokens_describe', String(Math.max(1, Math.floor(maxTokensDescribe))))
  if (maxTokensExtract !== undefined) setSetting('ai_max_tokens_extract', String(Math.max(1, Math.floor(maxTokensExtract))))
  if (maxTokensTranslate !== undefined) setSetting('ai_max_tokens_translate', String(Math.max(1, Math.floor(maxTokensTranslate))))
+  if (ocrMode !== undefined) setSetting('ai_ocr_mode', ocrMode)
+  if (ocrLanguages !== undefined) setSetting('ai_ocr_languages', ocrLanguages.trim() || 'eng')
+  if (ocrConfidenceThreshold !== undefined) setSetting('ai_ocr_confidence_threshold', String(Math.max(0, Math.min(100, Math.floor(ocrConfidenceThreshold)))))
 }

 export function getPreferredLanguage(): string {
@@ -249,6 +265,9 @@ export function getEffectiveAiConfig(libraryId: string): AiConfig {
    maxTokensDescribe: overrides.maxTokensDescribe ?? global.maxTokensDescribe,
    maxTokensExtract: overrides.maxTokensExtract ?? global.maxTokensExtract,
    maxTokensTranslate: overrides.maxTokensTranslate ?? global.maxTokensTranslate,
+    ocrMode: global.ocrMode,
+    ocrLanguages: global.ocrLanguages,
+    ocrConfidenceThreshold: global.ocrConfidenceThreshold,
  }
 }

--- a/src/lib/thumbnails.ts
+++ b/src/lib/thumbnails.ts
@@ -60,6 +60,19 @@ async function generateAiImage(src: string, dest: string): Promise<void> {
  fs.renameSync(tmp, dest)
 }

+/**
+ * Generate a grayscale, contrast-normalised PNG for local OCR (Tesseract).
+ * PNG is lossless and avoids JPEG artefacts that can degrade OCR accuracy.
+ *
+ * The image is written to `dest + '.tmp'` and renamed onto `dest` only after
+ * sharp has finished writing. If any step fails, the temporary file is removed
+ * and the original error is rethrown, so no stale `.tmp` file is left behind.
+ *
+ * @param src  Path of the source image to read.
+ * @param dest Path the finished PNG ends up at.
+ */
+async function generateOcrImage(src: string, dest: string): Promise<void> {
+  const tmp = dest + '.tmp'
+  try {
+    await sharp(src)
+      // Limit the width to AI_IMAGE_WIDTH; images already narrower are not enlarged.
+      .resize(AI_IMAGE_WIDTH, undefined, { withoutEnlargement: true })
+      .grayscale()
+      .normalise()
+      // Select PNG explicitly: the `.tmp` extension gives sharp no format to infer.
+      .png()
+      .toFile(tmp)
+    fs.renameSync(tmp, dest)
+  } catch (err) {
+    // `force` makes this a no-op when the temporary file was never created.
+    fs.rmSync(tmp, { force: true })
+    throw err
+  }
+}
+
 /** Run a child process and collect stderr. Resolves on exit code 0, rejects otherwise. */
 function run(bin: string, args: string[]): Promise<void> {
  return new Promise((resolve, reject) => {
@@ -190,6 +203,24 @@ export async function getAiImagePath(
  return cacheFile
 }

+/**
+ * Resolve the preprocessed OCR image (PNG) for a media file, creating it when needed.
+ *
+ * The cache entry lives in CACHE_DIR under the file's cache key followed by an
+ * `_ocr.png` suffix. When getCachedPath() yields a path, that path is returned
+ * unchanged; otherwise generateOcrImage() builds the grayscale,
+ * contrast-normalised image first and the cache file path is returned.
+ *
+ * @param absoluteFilePath Absolute path of the source image.
+ * @param libraryId        Library id, used together with the file path to derive the cache key.
+ * @returns Absolute path to the OCR-ready PNG.
+ */
+export async function getOcrImagePath(
+  absoluteFilePath: string,
+  libraryId: string
+): Promise<string> {
+  ensureCacheDir()
+  const ocrFile = path.join(CACHE_DIR, `${cacheKey(libraryId, absoluteFilePath)}_ocr.png`)
+  const existing = getCachedPath(ocrFile, absoluteFilePath)
+  if (!existing) {
+    await generateOcrImage(absoluteFilePath, ocrFile)
+    return ocrFile
+  }
+  return existing
+}
+
 /**
 * Returns the absolute path to a cached thumbnail JPEG for the given file.
 * Generates it on first call (or when the source has been modified).