reduce api calls for text extraction

2026-04-13 11:18:39 -04:00
parent 68b1ed94ea
commit 8557c80c52
1 changed files with 59 additions and 5 deletions
--- a/src/lib/ai-tagger.ts
+++ b/src/lib/ai-tagger.ts
@@ -554,6 +554,24 @@ export async function generateItemDescription(itemKey: string): Promise<string>
 * If the extracted text is not in the user's preferred language, auto-translates it.
 * Returns { extractedText, translatedText }.
 */
+/**
+ * Parse a structured extraction response from the AI: the span from the first `{` to the last `}` of `raw` is decoded as JSON.
+ * Returns null if `raw` has no such span, if JSON.parse throws, or if `text` is not a string or `needsTranslation` is not a boolean.
+ */
+function parseStructuredExtraction(raw: string): { text: string; needsTranslation: boolean } | null {
+  const jsonMatch = raw.match(/\{[\s\S]*\}/) // greedy match: anything before the first `{` or after the last `}` is ignored
+  if (!jsonMatch) return null
+  try {
+    const parsed = JSON.parse(jsonMatch[0])
+    if (typeof parsed.text === 'string' && typeof parsed.needsTranslation === 'boolean') {
+      return { text: parsed.text, needsTranslation: parsed.needsTranslation } // copy only the two validated fields
+    }
+  } catch {
+    // JSON.parse threw (matched span is not valid JSON): fall through to the null return below
+  }
+  return null
+}
+
 export async function extractItemText(itemKey: string): Promise<{ extractedText: string; translatedText: string | null }> {
  const libraryId = itemKey.split(':')[0]
  const config = getEffectiveAiConfig(libraryId)
@@ -590,9 +608,46 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText:
  const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId)
  const base64Images = [fs.readFileSync(thumbnailPath, 'base64')]

-  const systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${config.promptExtract ? ' ' + config.promptExtract : ''} If there is no text in the image, respond with exactly: [NO TEXT]`
+  const preferredLanguage = getPreferredLanguage()
+  const customInstruction = config.promptExtract ? ' ' + config.promptExtract : ''

-  const extractedText = await callVisionApiText(config.endpoint, extractModel, base64Images, systemPrompt)
+  // When a preferred language is configured, ask the AI to also flag whether translation is needed.
+  // This avoids a separate translation API call for text already in the target language.
+  let systemPrompt: string
+  if (preferredLanguage) {
+    systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction}
+
+Respond ONLY with a valid JSON object — no markdown, no explanation:
+{"needsTranslation": boolean, "text": "extracted text"}
+
+Rules:
+- Set needsTranslation to true if the text is NOT already written in ${preferredLanguage}.
+- Set needsTranslation to false if the text IS in ${preferredLanguage}, or if there is no text.
+- If there is no text in the image, use exactly: {"needsTranslation": false, "text": "[NO TEXT]"}`
+  } else {
+    systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction} If there is no text in the image, respond with exactly: [NO TEXT]`
+  }
+
+  const rawResponse = await callVisionApiText(config.endpoint, extractModel, base64Images, systemPrompt)
+
+  // Parse the response — structured JSON when a preferred language is set, plain text otherwise
+  let extractedText: string
+  let needsTranslation: boolean
+
+  if (preferredLanguage) {
+    const parsed = parseStructuredExtraction(rawResponse)
+    if (parsed) {
+      extractedText = parsed.text
+      needsTranslation = parsed.needsTranslation
+    } else {
+      // Malformed JSON fallback: treat raw response as plain text and attempt translation
+      extractedText = rawResponse
+      needsTranslation = true
+    }
+  } else {
+    extractedText = rawResponse
+    needsTranslation = false
+  }

  if (!extractedText || extractedText === '[NO TEXT]') {
    db.prepare('UPDATE media_items SET extracted_text = NULL, extracted_text_translated = NULL WHERE item_key = ?').run(itemKey)
@@ -601,10 +656,9 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText:

  db.prepare('UPDATE media_items SET extracted_text = ? WHERE item_key = ?').run(extractedText, itemKey)

-  // Auto-translate if preferred language is set
-  const preferredLanguage = getPreferredLanguage()
+  // Only translate if the extraction step determined the text is not already in the preferred language
  let translatedText: string | null = null
-  if (preferredLanguage) {
+  if (preferredLanguage && needsTranslation) {
    const translateModel = config.modelTranslate || config.model
    try {
      translatedText = await translateText(config.endpoint, translateModel, extractedText, preferredLanguage, config.promptTranslate)