feat: per-extraction OCR language override

Allow users to specify a Tesseract language string (e.g. jpn+jpn_vert)
on a per-extraction basis, overriding the global OCR language setting.

- Add payload column to ai_jobs table (migration) to carry per-call data
- Thread ocrLanguages payload through enqueueJob → processNextJob → extractItemText
- New GET /api/ai-settings/ocr endpoint (requireAuth) returns { ocrMode, ocrLanguages }
- ImageLightbox fetches OCR settings and shows a language input next to the
  Extract Text button when mode is hybrid or tesseract (hidden for llm-only)
- MixedView fetches OCR settings and passes them down to EntryTile; kebab
  Extract Text on images shows an inline language prompt before dispatching the job

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Garret Patti
2026-04-13 21:55:07 -04:00
parent 96cfb8aae7
commit db2e446ef4
8 changed files with 206 additions and 70 deletions

View File

@@ -0,0 +1,11 @@
import { NextRequest, NextResponse } from 'next/server'
import { requireAuth } from '@/lib/auth'
import { getAiConfig } from '@/lib/app-settings'
/**
 * GET /api/ai-settings/ocr
 *
 * Responds with a JSON object `{ ocrMode, ocrLanguages }` taken from the value
 * returned by `getAiConfig()`.
 *
 * When `requireAuth` yields a `NextResponse`, that response is returned
 * unchanged and the config is never read.
 */
export async function GET(request: NextRequest) {
  const authResult = await requireAuth(request)
  if (authResult instanceof NextResponse) {
    return authResult
  }

  // Only these two config fields are included in the response body.
  const aiConfig = getAiConfig()
  const { ocrMode, ocrLanguages } = aiConfig
  const responseBody = { ocrMode, ocrLanguages }
  return NextResponse.json(responseBody)
}

View File

@@ -3,14 +3,14 @@ import { requireLibraryAccess } from '@/lib/auth'
import { enqueueJob } from '@/lib/ai-jobs' import { enqueueJob } from '@/lib/ai-jobs'
export async function POST(request: NextRequest) { export async function POST(request: NextRequest) {
let body: { itemKey?: string } let body: { itemKey?: string; ocrLanguages?: string }
try { try {
body = await request.json() body = await request.json()
} catch { } catch {
return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 }) return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 })
} }
const { itemKey } = body const { itemKey, ocrLanguages } = body
if (!itemKey || typeof itemKey !== 'string') { if (!itemKey || typeof itemKey !== 'string') {
return NextResponse.json({ error: 'itemKey is required' }, { status: 400 }) return NextResponse.json({ error: 'itemKey is required' }, { status: 400 })
} }
@@ -19,6 +19,12 @@ export async function POST(request: NextRequest) {
const auth = await requireLibraryAccess(request, libraryId) const auth = await requireLibraryAccess(request, libraryId)
if (auth instanceof NextResponse) return auth if (auth instanceof NextResponse) return auth
const jobId = enqueueJob(itemKey, 'extract', libraryId) const jobId = enqueueJob(
itemKey,
'extract',
libraryId,
undefined,
ocrLanguages ? { ocrLanguages } : undefined,
)
return NextResponse.json({ jobId }, { status: 202 }) return NextResponse.json({ jobId }, { status: 202 })
} }

View File

@@ -336,7 +336,7 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose,
{/* Text overlay */} {/* Text overlay */}
{showTextOverlay && displayText && ( {showTextOverlay && displayText && (
<div <div
className="absolute bottom-16 left-4 right-4 z-20 rounded-xl p-4" className="absolute bottom-4 left-4 right-4 z-20 rounded-xl p-4 max-w-fit"
style={{ backgroundColor: 'rgba(0,0,0,0.75)' }} style={{ backgroundColor: 'rgba(0,0,0,0.75)' }}
onClick={(e) => e.stopPropagation()} onClick={(e) => e.stopPropagation()}
> >

View File

@@ -39,6 +39,11 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item
const [descPending, setDescPending] = useState(false) const [descPending, setDescPending] = useState(false)
const [descError, setDescError] = useState<string | null>(null) const [descError, setDescError] = useState<string | null>(null)
// OCR settings
const [ocrMode, setOcrMode] = useState<string | null>(null)
const [defaultOcrLanguages, setDefaultOcrLanguages] = useState('eng')
const [ocrLanguageInput, setOcrLanguageInput] = useState('')
// Text overlay state // Text overlay state
const [showTextOverlay, setShowTextOverlay] = useState(false) const [showTextOverlay, setShowTextOverlay] = useState(false)
const [showOriginal, setShowOriginal] = useState(false) const [showOriginal, setShowOriginal] = useState(false)
@@ -68,6 +73,13 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item
useEffect(() => { useEffect(() => {
fetchAiFields() fetchAiFields()
fetch('/api/ai-settings/ocr')
.then((r) => r.json())
.then((d: { ocrMode: string; ocrLanguages: string }) => {
setOcrMode(d.ocrMode)
setDefaultOcrLanguages(d.ocrLanguages)
})
.catch(() => {})
return () => { return () => {
if (pollRef.current) clearInterval(pollRef.current) if (pollRef.current) clearInterval(pollRef.current)
} }
@@ -439,58 +451,79 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item
Text Extraction Text Extraction
</p> </p>
<button <div className="flex items-center gap-2 flex-wrap">
onClick={async () => { <button
setExtracting(true) onClick={async () => {
setExtractError(null) setExtracting(true)
setExtractPending(false) setExtractError(null)
try { setExtractPending(false)
const res = await fetch('/api/ai-tagging/extract-text', { try {
method: 'POST', const res = await fetch('/api/ai-tagging/extract-text', {
headers: { 'Content-Type': 'application/json' }, method: 'POST',
body: JSON.stringify({ itemKey }), headers: { 'Content-Type': 'application/json' },
}) body: JSON.stringify({
if (res.status === 202) { itemKey,
setExtractPending(true) ...(ocrLanguageInput.trim() && { ocrLanguages: ocrLanguageInput.trim() }),
startPolling(extractedText, translatedText, aiDescription) }),
return })
if (res.status === 202) {
setExtractPending(true)
startPolling(extractedText, translatedText, aiDescription)
return
}
if (!res.ok) {
const data = await res.json().catch(() => ({}))
throw new Error((data as { error?: string }).error ?? 'Failed to extract text')
}
const result = await res.json()
setExtractedText(result.extractedText || null)
setEditedExtractedText(result.extractedText || '')
setTranslatedText(result.translatedText || null)
} catch (err) {
setExtractError(err instanceof Error ? err.message : 'Failed to extract text')
setTimeout(() => setExtractError(null), 4000)
} finally {
setExtracting(false)
} }
if (!res.ok) { }}
const data = await res.json().catch(() => ({})) disabled={extracting || extractPending}
throw new Error((data as { error?: string }).error ?? 'Failed to extract text') className="text-xs px-2 py-1 rounded-lg transition-colors disabled:opacity-50 self-start flex-shrink-0"
style={{
backgroundColor: extractPending ? 'var(--accent)' : 'var(--border)',
color: extractPending ? '#fff' : 'var(--text-secondary)',
}}
onMouseEnter={(e) => {
if (!extracting && !extractPending) {
;(e.currentTarget as HTMLElement).style.backgroundColor = 'var(--text-secondary)'
;(e.currentTarget as HTMLElement).style.color = 'var(--background)'
} }
const result = await res.json() }}
setExtractedText(result.extractedText || null) onMouseLeave={(e) => {
setEditedExtractedText(result.extractedText || '') if (!extractPending) {
setTranslatedText(result.translatedText || null) ;(e.currentTarget as HTMLElement).style.backgroundColor = 'var(--border)'
} catch (err) { ;(e.currentTarget as HTMLElement).style.color = 'var(--text-secondary)'
setExtractError(err instanceof Error ? err.message : 'Failed to extract text') }
setTimeout(() => setExtractError(null), 4000) }}
} finally { >
setExtracting(false) {extracting ? '⟳ Extracting…' : extractPending ? '⟳ Queued…' : extractedText ? '🔍 Re-extract Text' : '🔍 Extract Text'}
} </button>
}} {ocrMode && ocrMode !== 'llm' && (
disabled={extracting || extractPending} <input
className="text-xs px-2 py-1 rounded-lg transition-colors disabled:opacity-50 self-start" type="text"
style={{ value={ocrLanguageInput}
backgroundColor: extractPending ? 'var(--accent)' : 'var(--border)', onChange={(e) => setOcrLanguageInput(e.target.value)}
color: extractPending ? '#fff' : 'var(--text-secondary)', placeholder={defaultOcrLanguages}
}} className="text-xs px-2 py-0.5 rounded-full outline-none"
onMouseEnter={(e) => { style={{
if (!extracting && !extractPending) { backgroundColor: 'var(--background)',
;(e.currentTarget as HTMLElement).style.backgroundColor = 'var(--text-secondary)' border: '1px solid var(--border)',
;(e.currentTarget as HTMLElement).style.color = 'var(--background)' color: 'var(--text-primary)',
} width: 120,
}} }}
onMouseLeave={(e) => { title="Tesseract language(s) for this extraction (e.g. jpn+jpn_vert). Leave blank to use the configured default."
if (!extractPending) { />
;(e.currentTarget as HTMLElement).style.backgroundColor = 'var(--border)' )}
;(e.currentTarget as HTMLElement).style.color = 'var(--text-secondary)' </div>
}
}}
>
{extracting ? '⟳ Extracting…' : extractPending ? '⟳ Queued…' : extractedText ? '🔍 Re-extract Text' : '🔍 Extract Text'}
</button>
{extractError && ( {extractError && (
<p className="text-xs" style={{ color: '#f87171' }}>{extractError}</p> <p className="text-xs" style={{ color: '#f87171' }}>{extractError}</p>

View File

@@ -83,6 +83,9 @@ export default function MixedView({ libraryId, initialPath }: Props) {
setDoomScrollLoading(false) setDoomScrollLoading(false)
}, [currentPath]) }, [currentPath])
const [ocrMode, setOcrMode] = useState<string | null>(null)
const [defaultOcrLanguages, setDefaultOcrLanguages] = useState('eng')
const fetchAssignments = useCallback(() => { const fetchAssignments = useCallback(() => {
fetch(`/api/tags/library-assignments?libraryId=${encodeURIComponent(libraryId)}`) fetch(`/api/tags/library-assignments?libraryId=${encodeURIComponent(libraryId)}`)
.then((r) => r.json()) .then((r) => r.json())
@@ -92,6 +95,16 @@ export default function MixedView({ libraryId, initialPath }: Props) {
useEffect(() => { fetchAssignments() }, [fetchAssignments]) useEffect(() => { fetchAssignments() }, [fetchAssignments])
useEffect(() => {
fetch('/api/ai-settings/ocr')
.then((r) => r.json())
.then((d: { ocrMode: string; ocrLanguages: string }) => {
setOcrMode(d.ocrMode)
setDefaultOcrLanguages(d.ocrLanguages)
})
.catch(() => {})
}, [])
const filtersActive = search !== '' || selectedTagIds.size > 0 const filtersActive = search !== '' || selectedTagIds.size > 0
const fetchRecursive = useCallback(() => { const fetchRecursive = useCallback(() => {
@@ -387,6 +400,8 @@ export default function MixedView({ libraryId, initialPath }: Props) {
entry={entry} entry={entry}
onOpen={handleEntry} onOpen={handleEntry}
onTag={handleTagEntry} onTag={handleTagEntry}
ocrMode={ocrMode}
defaultOcrLanguages={defaultOcrLanguages}
onAiTag={async (e) => { onAiTag={async (e) => {
const itemKey = itemKeyFor(e) const itemKey = itemKeyFor(e)
const res = await fetch('/api/ai-tagging', { const res = await fetch('/api/ai-tagging', {
@@ -401,7 +416,7 @@ export default function MixedView({ libraryId, initialPath }: Props) {
fetchAssignments() fetchAssignments()
setFilterRefreshKey((k) => k + 1) setFilterRefreshKey((k) => k + 1)
}} }}
onExtractText={async (e) => { onExtractText={async (e, ocrLanguages) => {
if (e.type === 'directory') { if (e.type === 'directory') {
// Bulk extract for directory // Bulk extract for directory
const dirRel = filtersActive ? e.name : (currentPath ? `${currentPath}/${e.name}` : e.name) const dirRel = filtersActive ? e.name : (currentPath ? `${currentPath}/${e.name}` : e.name)
@@ -420,7 +435,7 @@ export default function MixedView({ libraryId, initialPath }: Props) {
const res = await fetch('/api/ai-tagging/extract-text', { const res = await fetch('/api/ai-tagging/extract-text', {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ itemKey }), body: JSON.stringify({ itemKey, ...(ocrLanguages && { ocrLanguages }) }),
}) })
if (!res.ok) { if (!res.ok) {
const data = await res.json().catch(() => ({})) const data = await res.json().catch(() => ({}))
@@ -594,7 +609,7 @@ export default function MixedView({ libraryId, initialPath }: Props) {
) )
} }
function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtractText, onDescribe, onTranslate }: { entry: FileEntry; onOpen: (e: FileEntry) => void; onTag: (e: FileEntry) => void; onDelete?: (e: FileEntry) => void; onRename?: (e: FileEntry, newName: string) => Promise<boolean>; onAiTag?: (e: FileEntry) => Promise<void>; onExtractText?: (e: FileEntry) => Promise<void>; onDescribe?: (e: FileEntry) => Promise<void>; onTranslate?: (e: FileEntry) => Promise<void> }) { function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtractText, onDescribe, onTranslate, ocrMode, defaultOcrLanguages }: { entry: FileEntry; onOpen: (e: FileEntry) => void; onTag: (e: FileEntry) => void; onDelete?: (e: FileEntry) => void; onRename?: (e: FileEntry, newName: string) => Promise<boolean>; onAiTag?: (e: FileEntry) => Promise<void>; onExtractText?: (e: FileEntry, ocrLanguages?: string) => Promise<void>; onDescribe?: (e: FileEntry) => Promise<void>; onTranslate?: (e: FileEntry) => Promise<void>; ocrMode?: string | null; defaultOcrLanguages?: string }) {
type ImgState = 'loading' | 'loaded' | 'error' type ImgState = 'loading' | 'loaded' | 'error'
const [imgState, setImgState] = useState<ImgState>( const [imgState, setImgState] = useState<ImgState>(
entry.thumbnailUrl ? 'loading' : 'error' entry.thumbnailUrl ? 'loading' : 'error'
@@ -615,6 +630,8 @@ function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtrac
const [describeError, setDescribeError] = useState<string | null>(null) const [describeError, setDescribeError] = useState<string | null>(null)
const [translating, setTranslating] = useState(false) const [translating, setTranslating] = useState(false)
const [translateError, setTranslateError] = useState<string | null>(null) const [translateError, setTranslateError] = useState<string | null>(null)
const [showOcrPrompt, setShowOcrPrompt] = useState(false)
const [ocrLanguageInput, setOcrLanguageInput] = useState('')
useEffect(() => { useEffect(() => {
if (!menuOpen) return if (!menuOpen) return
@@ -804,16 +821,21 @@ function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtrac
📝 Describe Folder 📝 Describe Folder
</button> </button>
)} )}
{onExtractText && entry.mediaType === 'image' && ( {onExtractText && entry.mediaType === 'image' && !showOcrPrompt && (
<button <button
onClick={(e) => { onClick={(e) => {
e.stopPropagation() e.stopPropagation()
setMenuOpen(false) if (ocrMode && ocrMode !== 'llm') {
setTextExtracting(true) setOcrLanguageInput('')
setTextExtractError(null) setShowOcrPrompt(true)
onExtractText(entry) } else {
.catch((err) => setTextExtractError(err instanceof Error ? err.message : 'Text extraction failed')) setMenuOpen(false)
.finally(() => setTextExtracting(false)) setTextExtracting(true)
setTextExtractError(null)
onExtractText(entry)
.catch((err) => setTextExtractError(err instanceof Error ? err.message : 'Text extraction failed'))
.finally(() => setTextExtracting(false))
}
}} }}
disabled={textExtracting} disabled={textExtracting}
className="flex items-center gap-2 w-full px-4 py-2 text-sm text-left transition-colors disabled:opacity-50" className="flex items-center gap-2 w-full px-4 py-2 text-sm text-left transition-colors disabled:opacity-50"
@@ -824,6 +846,57 @@ function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtrac
🔍 Extract Text 🔍 Extract Text
</button> </button>
)} )}
{onExtractText && entry.mediaType === 'image' && showOcrPrompt && (
<div className="px-4 py-2 flex flex-col gap-2" onClick={(e) => e.stopPropagation()}>
<p className="text-xs" style={{ color: 'var(--text-secondary)' }}>OCR language</p>
<input
autoFocus
type="text"
value={ocrLanguageInput}
onChange={(e) => setOcrLanguageInput(e.target.value)}
onKeyDown={(e) => {
if (e.key === 'Escape') { setShowOcrPrompt(false) }
if (e.key === 'Enter') {
setShowOcrPrompt(false)
setMenuOpen(false)
setTextExtracting(true)
setTextExtractError(null)
onExtractText(entry, ocrLanguageInput.trim() || undefined)
.catch((err) => setTextExtractError(err instanceof Error ? err.message : 'Text extraction failed'))
.finally(() => setTextExtracting(false))
}
}}
placeholder={defaultOcrLanguages ?? 'eng'}
className="text-xs px-2 py-1 rounded-lg outline-none w-full"
style={{ backgroundColor: 'var(--background)', border: '1px solid var(--border)', color: 'var(--text-primary)' }}
title="Tesseract language(s) for this extraction (e.g. jpn+jpn_vert). Leave blank to use the configured default."
/>
<div className="flex gap-2">
<button
onClick={() => {
setShowOcrPrompt(false)
setMenuOpen(false)
setTextExtracting(true)
setTextExtractError(null)
onExtractText(entry, ocrLanguageInput.trim() || undefined)
.catch((err) => setTextExtractError(err instanceof Error ? err.message : 'Text extraction failed'))
.finally(() => setTextExtracting(false))
}}
className="text-xs px-2 py-1 rounded-lg"
style={{ backgroundColor: 'var(--accent)', color: '#fff' }}
>
Extract
</button>
<button
onClick={() => setShowOcrPrompt(false)}
className="text-xs px-2 py-1"
style={{ color: 'var(--text-secondary)' }}
>
Cancel
</button>
</div>
</div>
)}
{onExtractText && entry.type === 'directory' && ( {onExtractText && entry.type === 'directory' && (
<button <button
onClick={(e) => { onClick={(e) => {

View File

@@ -34,6 +34,7 @@ interface AiJobRow {
started_at: number | null started_at: number | null
completed_at: number | null completed_at: number | null
item_title: string | null item_title: string | null
payload: string | null
} }
function rowToJob(row: AiJobRow): AiJob { function rowToJob(row: AiJobRow): AiJob {
@@ -75,6 +76,7 @@ export function enqueueJob(
jobType: AiJobType, jobType: AiJobType,
libraryId: string, libraryId: string,
sourceLanguage?: string, sourceLanguage?: string,
payload?: Record<string, string>,
): string { ): string {
const db = getDb() const db = getDb()
@@ -96,9 +98,9 @@ export function enqueueJob(
const metadata = jobType === 'translate' && sourceLanguage ? sourceLanguage : null const metadata = jobType === 'translate' && sourceLanguage ? sourceLanguage : null
db.prepare( db.prepare(
`INSERT INTO ai_jobs (id, item_key, library_id, job_type, status, error, attempt, max_retries, created_at, item_title) `INSERT INTO ai_jobs (id, item_key, library_id, job_type, status, error, attempt, max_retries, created_at, item_title, payload)
VALUES (?, ?, ?, ?, 'queued', ?, 0, ?, ?, ?)` VALUES (?, ?, ?, ?, 'queued', ?, 0, ?, ?, ?, ?)`
).run(id, itemKey, libraryId, jobType, metadata, maxRetries, Date.now(), title) ).run(id, itemKey, libraryId, jobType, metadata, maxRetries, Date.now(), title, payload ? JSON.stringify(payload) : null)
// Wake the processor // Wake the processor
wakeProcessor() wakeProcessor()
@@ -251,6 +253,8 @@ async function processNextJob(): Promise<boolean> {
// Extract sourceLanguage for translate jobs (stored in error field at enqueue) // Extract sourceLanguage for translate jobs (stored in error field at enqueue)
const sourceLanguage = row.job_type === 'translate' ? row.error : null const sourceLanguage = row.job_type === 'translate' ? row.error : null
// Parse job payload (carries per-call overrides, e.g. ocrLanguages for extract jobs)
const jobPayload = row.payload ? (JSON.parse(row.payload) as Record<string, string>) : null
db.prepare( db.prepare(
"UPDATE ai_jobs SET status = 'running', started_at = ?, error = NULL WHERE id = ?" "UPDATE ai_jobs SET status = 'running', started_at = ?, error = NULL WHERE id = ?"
@@ -265,7 +269,7 @@ async function processNextJob(): Promise<boolean> {
await generateItemDescription(row.item_key) await generateItemDescription(row.item_key)
break break
case 'extract': case 'extract':
await extractItemText(row.item_key) await extractItemText(row.item_key, jobPayload?.ocrLanguages)
break break
case 'translate': case 'translate':
await translateItemText(row.item_key, sourceLanguage || undefined) await translateItemText(row.item_key, sourceLanguage || undefined)

View File

@@ -538,7 +538,7 @@ async function extractWithTesseract(
* Translation is not performed automatically — call translateItemText() separately. * Translation is not performed automatically — call translateItemText() separately.
* Returns { extractedText, translatedText } where translatedText is always null. * Returns { extractedText, translatedText } where translatedText is always null.
*/ */
export async function extractItemText(itemKey: string): Promise<{ extractedText: string; translatedText: string | null }> { export async function extractItemText(itemKey: string, ocrLanguagesOverride?: string): Promise<{ extractedText: string; translatedText: string | null }> {
const libraryId = itemKey.split(':')[0] const libraryId = itemKey.split(':')[0]
const config = getEffectiveAiConfig(libraryId) const config = getEffectiveAiConfig(libraryId)
@@ -567,7 +567,8 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText:
throw Object.assign(new Error('Text extraction is only available for images'), { code: 'NO_IMAGE' }) throw Object.assign(new Error('Text extraction is only available for images'), { code: 'NO_IMAGE' })
} }
const { ocrMode, ocrLanguages, ocrConfidenceThreshold } = config const { ocrMode, ocrLanguages: configOcrLanguages, ocrConfidenceThreshold } = config
const ocrLanguages = ocrLanguagesOverride?.trim() || configOcrLanguages
// ── Tesseract path ──────────────────────────────────────────────────────── // ── Tesseract path ────────────────────────────────────────────────────────
if (ocrMode === 'tesseract' || ocrMode === 'hybrid') { if (ocrMode === 'tesseract' || ocrMode === 'hybrid') {

View File

@@ -338,4 +338,12 @@ function migrateAiJobs(db: Database.Database): void {
CREATE INDEX IF NOT EXISTS ai_jobs_status ON ai_jobs(status); CREATE INDEX IF NOT EXISTS ai_jobs_status ON ai_jobs(status);
CREATE INDEX IF NOT EXISTS ai_jobs_created_at ON ai_jobs(created_at); CREATE INDEX IF NOT EXISTS ai_jobs_created_at ON ai_jobs(created_at);
`) `)
// Add payload column if not present
const aiJobsRow = db
.prepare("SELECT sql FROM sqlite_master WHERE type='table' AND name='ai_jobs'")
.get() as { sql: string } | undefined
if (aiJobsRow && !aiJobsRow.sql.includes('payload')) {
db.exec('ALTER TABLE ai_jobs ADD COLUMN payload TEXT')
}
} }