separate text extraction and translation
This commit is contained in:
@@ -414,7 +414,7 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item
|
|||||||
;(e.currentTarget as HTMLElement).style.color = 'var(--text-secondary)'
|
;(e.currentTarget as HTMLElement).style.color = 'var(--text-secondary)'
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
{retranslating ? '⟳ Translating…' : '🌐 Re-translate'}
|
{retranslating ? '⟳ Translating…' : translatedText ? '🌐 Re-translate' : '🌐 Translate'}
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -453,6 +453,18 @@ export default function MixedView({ libraryId, initialPath }: Props) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}}
|
}}
|
||||||
|
onTranslate={async (e) => {
|
||||||
|
const itemKey = itemKeyFor(e)
|
||||||
|
const res = await fetch('/api/ai-tagging/translate', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify({ itemKey }),
|
||||||
|
})
|
||||||
|
if (!res.ok) {
|
||||||
|
const data = await res.json().catch(() => ({}))
|
||||||
|
throw new Error((data as { error?: string }).error ?? 'Translation failed')
|
||||||
|
}
|
||||||
|
}}
|
||||||
onDelete={(e) => {
|
onDelete={(e) => {
|
||||||
const rel = filtersActive ? e.name : (currentPath ? `${currentPath}/${e.name}` : e.name)
|
const rel = filtersActive ? e.name : (currentPath ? `${currentPath}/${e.name}` : e.name)
|
||||||
fetch(`/api/browse?libraryId=${encodeURIComponent(libraryId)}&path=${encodeURIComponent(rel)}`, { method: 'DELETE' })
|
fetch(`/api/browse?libraryId=${encodeURIComponent(libraryId)}&path=${encodeURIComponent(rel)}`, { method: 'DELETE' })
|
||||||
@@ -582,7 +594,7 @@ export default function MixedView({ libraryId, initialPath }: Props) {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtractText, onDescribe }: { entry: FileEntry; onOpen: (e: FileEntry) => void; onTag: (e: FileEntry) => void; onDelete?: (e: FileEntry) => void; onRename?: (e: FileEntry, newName: string) => Promise<boolean>; onAiTag?: (e: FileEntry) => Promise<void>; onExtractText?: (e: FileEntry) => Promise<void>; onDescribe?: (e: FileEntry) => Promise<void> }) {
|
function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtractText, onDescribe, onTranslate }: { entry: FileEntry; onOpen: (e: FileEntry) => void; onTag: (e: FileEntry) => void; onDelete?: (e: FileEntry) => void; onRename?: (e: FileEntry, newName: string) => Promise<boolean>; onAiTag?: (e: FileEntry) => Promise<void>; onExtractText?: (e: FileEntry) => Promise<void>; onDescribe?: (e: FileEntry) => Promise<void>; onTranslate?: (e: FileEntry) => Promise<void> }) {
|
||||||
type ImgState = 'loading' | 'loaded' | 'error'
|
type ImgState = 'loading' | 'loaded' | 'error'
|
||||||
const [imgState, setImgState] = useState<ImgState>(
|
const [imgState, setImgState] = useState<ImgState>(
|
||||||
entry.thumbnailUrl ? 'loading' : 'error'
|
entry.thumbnailUrl ? 'loading' : 'error'
|
||||||
@@ -601,6 +613,8 @@ function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtrac
|
|||||||
const [textExtractError, setTextExtractError] = useState<string | null>(null)
|
const [textExtractError, setTextExtractError] = useState<string | null>(null)
|
||||||
const [describing, setDescribing] = useState(false)
|
const [describing, setDescribing] = useState(false)
|
||||||
const [describeError, setDescribeError] = useState<string | null>(null)
|
const [describeError, setDescribeError] = useState<string | null>(null)
|
||||||
|
const [translating, setTranslating] = useState(false)
|
||||||
|
const [translateError, setTranslateError] = useState<string | null>(null)
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!menuOpen) return
|
if (!menuOpen) return
|
||||||
@@ -830,6 +844,26 @@ function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtrac
|
|||||||
🔍 Extract Text for Folder
|
🔍 Extract Text for Folder
|
||||||
</button>
|
</button>
|
||||||
)}
|
)}
|
||||||
|
{onTranslate && entry.mediaType === 'image' && (
|
||||||
|
<button
|
||||||
|
onClick={(e) => {
|
||||||
|
e.stopPropagation()
|
||||||
|
setMenuOpen(false)
|
||||||
|
setTranslating(true)
|
||||||
|
setTranslateError(null)
|
||||||
|
onTranslate(entry)
|
||||||
|
.catch((err) => setTranslateError(err instanceof Error ? err.message : 'Translation failed'))
|
||||||
|
.finally(() => setTranslating(false))
|
||||||
|
}}
|
||||||
|
disabled={translating}
|
||||||
|
className="flex items-center gap-2 w-full px-4 py-2 text-sm text-left transition-colors disabled:opacity-50"
|
||||||
|
style={{ color: 'var(--text-primary)' }}
|
||||||
|
onMouseEnter={(e) => ((e.currentTarget as HTMLElement).style.backgroundColor = 'var(--border)')}
|
||||||
|
onMouseLeave={(e) => ((e.currentTarget as HTMLElement).style.backgroundColor = 'transparent')}
|
||||||
|
>
|
||||||
|
🌐 Translate
|
||||||
|
</button>
|
||||||
|
)}
|
||||||
{onRename && (
|
{onRename && (
|
||||||
<button
|
<button
|
||||||
onClick={(e) => {
|
onClick={(e) => {
|
||||||
@@ -929,6 +963,28 @@ function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtrac
|
|||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
{/* Translation status overlay */}
|
||||||
|
{(translating || translateError) && (
|
||||||
|
<div
|
||||||
|
className="absolute inset-x-0 bottom-0 z-10 px-2 py-1.5 text-xs"
|
||||||
|
style={{ backgroundColor: translateError ? 'rgba(127,29,29,0.9)' : 'rgba(0,0,0,0.75)' }}
|
||||||
|
onClick={(e) => e.stopPropagation()}
|
||||||
|
>
|
||||||
|
<span style={{ color: translateError ? '#fca5a5' : 'var(--text-secondary)' }}>
|
||||||
|
{translateError ?? 'Translating…'}
|
||||||
|
</span>
|
||||||
|
{translateError && (
|
||||||
|
<button
|
||||||
|
onClick={() => setTranslateError(null)}
|
||||||
|
className="ml-2 underline text-xs"
|
||||||
|
style={{ color: '#fca5a5' }}
|
||||||
|
>
|
||||||
|
dismiss
|
||||||
|
</button>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Delete confirmation overlay */}
|
{/* Delete confirmation overlay */}
|
||||||
{confirming && (
|
{confirming && (
|
||||||
<div
|
<div
|
||||||
|
|||||||
@@ -511,27 +511,9 @@ export async function generateItemDescription(itemKey: string): Promise<string>
|
|||||||
/**
|
/**
|
||||||
* Extract text (OCR) from an image using the vision model.
|
* Extract text (OCR) from an image using the vision model.
|
||||||
* Only works for images in mixed libraries.
|
* Only works for images in mixed libraries.
|
||||||
* If the extracted text is not in the user's preferred language, auto-translates it.
|
* Translation is not performed automatically — call translateItemText() separately.
|
||||||
* Returns { extractedText, translatedText }.
|
* Returns { extractedText, translatedText } where translatedText is always null.
|
||||||
*/
|
*/
|
||||||
/**
|
|
||||||
* Parse a structured extraction response from the AI.
|
|
||||||
* Returns null if the response cannot be parsed as valid JSON with the expected shape.
|
|
||||||
*/
|
|
||||||
function parseStructuredExtraction(raw: string): { text: string; needsTranslation: boolean } | null {
|
|
||||||
const jsonMatch = raw.match(/\{[\s\S]*\}/)
|
|
||||||
if (!jsonMatch) return null
|
|
||||||
try {
|
|
||||||
const parsed = JSON.parse(jsonMatch[0])
|
|
||||||
if (typeof parsed.text === 'string' && typeof parsed.needsTranslation === 'boolean') {
|
|
||||||
return { text: parsed.text, needsTranslation: parsed.needsTranslation }
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
// fall through
|
|
||||||
}
|
|
||||||
return null
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function extractItemText(itemKey: string): Promise<{ extractedText: string; translatedText: string | null }> {
|
export async function extractItemText(itemKey: string): Promise<{ extractedText: string; translatedText: string | null }> {
|
||||||
const libraryId = itemKey.split(':')[0]
|
const libraryId = itemKey.split(':')[0]
|
||||||
const config = getEffectiveAiConfig(libraryId)
|
const config = getEffectiveAiConfig(libraryId)
|
||||||
@@ -568,69 +550,19 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText:
|
|||||||
const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId)
|
const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId)
|
||||||
const base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
|
const base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
|
||||||
|
|
||||||
const preferredLanguage = getPreferredLanguage()
|
|
||||||
const customInstruction = config.promptExtract ? ' ' + config.promptExtract : ''
|
const customInstruction = config.promptExtract ? ' ' + config.promptExtract : ''
|
||||||
|
const systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction} If there is no text in the image, respond with exactly: [NO TEXT]`
|
||||||
|
|
||||||
// When a preferred language is configured, ask the AI to also flag whether translation is needed.
|
const extractedText = await callVisionApiText(config.endpoint, extractModel, base64Images, systemPrompt, config.maxTokensExtract)
|
||||||
// This avoids a separate translation API call for text already in the target language.
|
|
||||||
let systemPrompt: string
|
|
||||||
if (preferredLanguage) {
|
|
||||||
systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction}
|
|
||||||
|
|
||||||
Respond ONLY with a valid JSON object — no markdown, no explanation:
|
|
||||||
{"needsTranslation": boolean, "text": "extracted text"}
|
|
||||||
|
|
||||||
Rules:
|
|
||||||
- Set needsTranslation to true if the text is NOT already written in ${preferredLanguage}.
|
|
||||||
- Set needsTranslation to false if the text IS in ${preferredLanguage}, or if there is no text.
|
|
||||||
- If there is no text in the image, use exactly: {"needsTranslation": false, "text": "[NO TEXT]"}`
|
|
||||||
} else {
|
|
||||||
systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction} If there is no text in the image, respond with exactly: [NO TEXT]`
|
|
||||||
}
|
|
||||||
|
|
||||||
const rawResponse = await callVisionApiText(config.endpoint, extractModel, base64Images, systemPrompt, config.maxTokensExtract)
|
|
||||||
|
|
||||||
// Parse the response — structured JSON when a preferred language is set, plain text otherwise
|
|
||||||
let extractedText: string
|
|
||||||
let needsTranslation: boolean
|
|
||||||
|
|
||||||
if (preferredLanguage) {
|
|
||||||
const parsed = parseStructuredExtraction(rawResponse)
|
|
||||||
if (parsed) {
|
|
||||||
extractedText = parsed.text
|
|
||||||
needsTranslation = parsed.needsTranslation
|
|
||||||
} else {
|
|
||||||
// Malformed JSON fallback: treat raw response as plain text and attempt translation
|
|
||||||
extractedText = rawResponse
|
|
||||||
needsTranslation = true
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
extractedText = rawResponse
|
|
||||||
needsTranslation = false
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!extractedText || extractedText === '[NO TEXT]') {
|
if (!extractedText || extractedText === '[NO TEXT]') {
|
||||||
db.prepare('UPDATE media_items SET extracted_text = NULL, extracted_text_translated = NULL WHERE item_key = ?').run(itemKey)
|
db.prepare('UPDATE media_items SET extracted_text = NULL, extracted_text_translated = NULL WHERE item_key = ?').run(itemKey)
|
||||||
return { extractedText: '', translatedText: null }
|
return { extractedText: '', translatedText: null }
|
||||||
}
|
}
|
||||||
|
|
||||||
db.prepare('UPDATE media_items SET extracted_text = ? WHERE item_key = ?').run(extractedText, itemKey)
|
db.prepare('UPDATE media_items SET extracted_text = ?, extracted_text_translated = NULL WHERE item_key = ?').run(extractedText, itemKey)
|
||||||
|
|
||||||
// Only translate if the extraction step determined the text is not already in the preferred language
|
return { extractedText, translatedText: null }
|
||||||
let translatedText: string | null = null
|
|
||||||
if (preferredLanguage && needsTranslation) {
|
|
||||||
const translateModel = config.modelTranslate || config.model
|
|
||||||
try {
|
|
||||||
translatedText = await translateText(config.endpoint, translateModel, extractedText, preferredLanguage, config.promptTranslate, config.maxTokensTranslate)
|
|
||||||
if (translatedText) {
|
|
||||||
db.prepare('UPDATE media_items SET extracted_text_translated = ? WHERE item_key = ?').run(translatedText, itemKey)
|
|
||||||
}
|
|
||||||
} catch (err) {
|
|
||||||
console.warn(`[ai-tagger] Translation failed for "${itemKey}":`, err instanceof Error ? err.message : err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return { extractedText, translatedText }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user