From 8557c80c520ae660cc16f8ff8a77a2f6aa49d572 Mon Sep 17 00:00:00 2001 From: Garret Patti <42485635+garretpatti@users.noreply.github.com> Date: Mon, 13 Apr 2026 11:18:39 -0400 Subject: [PATCH] reduce api calls for text extraction --- src/lib/ai-tagger.ts | 64 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 5 deletions(-) diff --git a/src/lib/ai-tagger.ts b/src/lib/ai-tagger.ts index c9698c4..334a902 100644 --- a/src/lib/ai-tagger.ts +++ b/src/lib/ai-tagger.ts @@ -554,6 +554,24 @@ export async function generateItemDescription(itemKey: string): Promise * If the extracted text is not in the user's preferred language, auto-translates it. * Returns { extractedText, translatedText }. */ +/** + * Parse a structured extraction response from the AI. + * Returns null if the response cannot be parsed as valid JSON with the expected shape. + */ +function parseStructuredExtraction(raw: string): { text: string; needsTranslation: boolean } | null { + const jsonMatch = raw.match(/\{[\s\S]*\}/) + if (!jsonMatch) return null + try { + const parsed = JSON.parse(jsonMatch[0]) + if (typeof parsed.text === 'string' && typeof parsed.needsTranslation === 'boolean') { + return { text: parsed.text, needsTranslation: parsed.needsTranslation } + } + } catch { + // fall through + } + return null +} + export async function extractItemText(itemKey: string): Promise<{ extractedText: string; translatedText: string | null }> { const libraryId = itemKey.split(':')[0] const config = getEffectiveAiConfig(libraryId) @@ -590,9 +608,46 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText: const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId) const base64Images = [fs.readFileSync(thumbnailPath, 'base64')] - const systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${config.promptExtract ? ' ' + config.promptExtract : ''} If there is no text in the image, respond with exactly: [NO TEXT]` + const preferredLanguage = getPreferredLanguage() + const customInstruction = config.promptExtract ? ' ' + config.promptExtract : '' - const extractedText = await callVisionApiText(config.endpoint, extractModel, base64Images, systemPrompt) + // When a preferred language is configured, ask the AI to also flag whether translation is needed. + // This avoids a separate translation API call for text already in the target language. + let systemPrompt: string + if (preferredLanguage) { + systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction} + +Respond ONLY with a valid JSON object — no markdown, no explanation: +{"needsTranslation": boolean, "text": "extracted text"} + +Rules: +- Set needsTranslation to true if the text is NOT already written in ${preferredLanguage}. +- Set needsTranslation to false if the text IS in ${preferredLanguage}, or if there is no text. +- If there is no text in the image, use exactly: {"needsTranslation": false, "text": "[NO TEXT]"}` + } else { + systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction} If there is no text in the image, respond with exactly: [NO TEXT]` + } + + const rawResponse = await callVisionApiText(config.endpoint, extractModel, base64Images, systemPrompt) + + // Parse the response — structured JSON when a preferred language is set, plain text otherwise + let extractedText: string + let needsTranslation: boolean + + if (preferredLanguage) { + const parsed = parseStructuredExtraction(rawResponse) + if (parsed) { + extractedText = parsed.text + needsTranslation = parsed.needsTranslation + } else { + // Malformed JSON fallback: treat raw response as plain text and attempt translation + extractedText = rawResponse + needsTranslation = true + } + } else { + extractedText = rawResponse + needsTranslation = false + } if (!extractedText || extractedText === '[NO TEXT]') { db.prepare('UPDATE media_items SET extracted_text = NULL, extracted_text_translated = NULL WHERE item_key = ?').run(itemKey) @@ -601,10 +656,9 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText: db.prepare('UPDATE media_items SET extracted_text = ? WHERE item_key = ?').run(extractedText, itemKey) - // Auto-translate if preferred language is set - const preferredLanguage = getPreferredLanguage() + // Only translate if the extraction step determined the text is not already in the preferred language let translatedText: string | null = null - if (preferredLanguage) { + if (preferredLanguage && needsTranslation) { const translateModel = config.modelTranslate || config.model try { translatedText = await translateText(config.endpoint, translateModel, extractedText, preferredLanguage, config.promptTranslate)