From 1350a6f94bc0c2da1ba1f6676650271f1acd7ecc Mon Sep 17 00:00:00 2001 From: Garret Patti <42485635+garretpatti@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:45:00 -0400 Subject: [PATCH] separate text extraction and translation --- src/components/mixed/ImageLightbox.tsx | 2 +- src/components/mixed/MixedView.tsx | 58 ++++++++++++++++++- src/lib/ai-tagger.ts | 80 ++------------------------ 3 files changed, 64 insertions(+), 76 deletions(-) diff --git a/src/components/mixed/ImageLightbox.tsx b/src/components/mixed/ImageLightbox.tsx index 5c60986..c9117b2 100644 --- a/src/components/mixed/ImageLightbox.tsx +++ b/src/components/mixed/ImageLightbox.tsx @@ -414,7 +414,7 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item ;(e.currentTarget as HTMLElement).style.color = 'var(--text-secondary)' }} > - {retranslating ? '⟳ Translating…' : '🌐 Re-translate'} + {retranslating ? '⟳ Translating…' : translatedText ? '🌐 Re-translate' : '🌐 Translate'} diff --git a/src/components/mixed/MixedView.tsx b/src/components/mixed/MixedView.tsx index de652f9..07e93db 100644 --- a/src/components/mixed/MixedView.tsx +++ b/src/components/mixed/MixedView.tsx @@ -453,6 +453,18 @@ export default function MixedView({ libraryId, initialPath }: Props) { } } }} + onTranslate={async (e) => { + const itemKey = itemKeyFor(e) + const res = await fetch('/api/ai-tagging/translate', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ itemKey }), + }) + if (!res.ok) { + const data = await res.json().catch(() => ({})) + throw new Error((data as { error?: string }).error ?? 'Translation failed') + } + }} onDelete={(e) => { const rel = filtersActive ? e.name : (currentPath ? `${currentPath}/${e.name}` : e.name) fetch(`/api/browse?libraryId=${encodeURIComponent(libraryId)}&path=${encodeURIComponent(rel)}`, { method: 'DELETE' }) @@ -582,7 +594,7 @@ export default function MixedView({ libraryId, initialPath }: Props) { ) } -function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtractText, onDescribe }: { entry: FileEntry; onOpen: (e: FileEntry) => void; onTag: (e: FileEntry) => void; onDelete?: (e: FileEntry) => void; onRename?: (e: FileEntry, newName: string) => Promise; onAiTag?: (e: FileEntry) => Promise; onExtractText?: (e: FileEntry) => Promise; onDescribe?: (e: FileEntry) => Promise }) { +function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtractText, onDescribe, onTranslate }: { entry: FileEntry; onOpen: (e: FileEntry) => void; onTag: (e: FileEntry) => void; onDelete?: (e: FileEntry) => void; onRename?: (e: FileEntry, newName: string) => Promise; onAiTag?: (e: FileEntry) => Promise; onExtractText?: (e: FileEntry) => Promise; onDescribe?: (e: FileEntry) => Promise; onTranslate?: (e: FileEntry) => Promise }) { type ImgState = 'loading' | 'loaded' | 'error' const [imgState, setImgState] = useState( entry.thumbnailUrl ? 'loading' : 'error' @@ -601,6 +613,8 @@ function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtrac const [textExtractError, setTextExtractError] = useState(null) const [describing, setDescribing] = useState(false) const [describeError, setDescribeError] = useState(null) + const [translating, setTranslating] = useState(false) + const [translateError, setTranslateError] = useState(null) useEffect(() => { if (!menuOpen) return @@ -830,6 +844,26 @@ function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtrac 🔍 Extract Text for Folder )} + {onTranslate && entry.mediaType === 'image' && ( + + )} {onRename && ( + )} + + )} + {/* Delete confirmation overlay */} {confirming && (
/** * Extract text (OCR) from an image using the vision model. * Only works for images in mixed libraries. - * If the extracted text is not in the user's preferred language, auto-translates it. - * Returns { extractedText, translatedText }. + * Translation is not performed automatically — call translateItemText() separately. + * Returns { extractedText, translatedText } where translatedText is always null. */ -/** - * Parse a structured extraction response from the AI. - * Returns null if the response cannot be parsed as valid JSON with the expected shape. - */ -function parseStructuredExtraction(raw: string): { text: string; needsTranslation: boolean } | null { - const jsonMatch = raw.match(/\{[\s\S]*\}/) - if (!jsonMatch) return null - try { - const parsed = JSON.parse(jsonMatch[0]) - if (typeof parsed.text === 'string' && typeof parsed.needsTranslation === 'boolean') { - return { text: parsed.text, needsTranslation: parsed.needsTranslation } - } - } catch { - // fall through - } - return null -} - export async function extractItemText(itemKey: string): Promise<{ extractedText: string; translatedText: string | null }> { const libraryId = itemKey.split(':')[0] const config = getEffectiveAiConfig(libraryId) @@ -568,69 +550,19 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText: const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId) const base64Images = [fs.readFileSync(thumbnailPath, 'base64')] - const preferredLanguage = getPreferredLanguage() const customInstruction = config.promptExtract ? ' ' + config.promptExtract : '' + const systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction} If there is no text in the image, respond with exactly: [NO TEXT]` - // When a preferred language is configured, ask the AI to also flag whether translation is needed. - // This avoids a separate translation API call for text already in the target language. - let systemPrompt: string - if (preferredLanguage) { - systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction} - -Respond ONLY with a valid JSON object — no markdown, no explanation: -{"needsTranslation": boolean, "text": "extracted text"} - -Rules: -- Set needsTranslation to true if the text is NOT already written in ${preferredLanguage}. -- Set needsTranslation to false if the text IS in ${preferredLanguage}, or if there is no text. -- If there is no text in the image, use exactly: {"needsTranslation": false, "text": "[NO TEXT]"}` - } else { - systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction} If there is no text in the image, respond with exactly: [NO TEXT]` - } - - const rawResponse = await callVisionApiText(config.endpoint, extractModel, base64Images, systemPrompt, config.maxTokensExtract) - - // Parse the response — structured JSON when a preferred language is set, plain text otherwise - let extractedText: string - let needsTranslation: boolean - - if (preferredLanguage) { - const parsed = parseStructuredExtraction(rawResponse) - if (parsed) { - extractedText = parsed.text - needsTranslation = parsed.needsTranslation - } else { - // Malformed JSON fallback: treat raw response as plain text and attempt translation - extractedText = rawResponse - needsTranslation = true - } - } else { - extractedText = rawResponse - needsTranslation = false - } + const extractedText = await callVisionApiText(config.endpoint, extractModel, base64Images, systemPrompt, config.maxTokensExtract) if (!extractedText || extractedText === '[NO TEXT]') { db.prepare('UPDATE media_items SET extracted_text = NULL, extracted_text_translated = NULL WHERE item_key = ?').run(itemKey) return { extractedText: '', translatedText: null } } - db.prepare('UPDATE media_items SET extracted_text = ? WHERE item_key = ?').run(extractedText, itemKey) + db.prepare('UPDATE media_items SET extracted_text = ?, extracted_text_translated = NULL WHERE item_key = ?').run(extractedText, itemKey) - // Only translate if the extraction step determined the text is not already in the preferred language - let translatedText: string | null = null - if (preferredLanguage && needsTranslation) { - const translateModel = config.modelTranslate || config.model - try { - translatedText = await translateText(config.endpoint, translateModel, extractedText, preferredLanguage, config.promptTranslate, config.maxTokensTranslate) - if (translatedText) { - db.prepare('UPDATE media_items SET extracted_text_translated = ? WHERE item_key = ?').run(translatedText, itemKey) - } - } catch (err) { - console.warn(`[ai-tagger] Translation failed for "${itemKey}":`, err instanceof Error ? err.message : err) - } - } - - return { extractedText, translatedText } + return { extractedText, translatedText: null } } /**