From db2e446ef499ca035320ffa2b418145aed0734f6 Mon Sep 17 00:00:00 2001 From: Garret Patti <42485635+garretpatti@users.noreply.github.com> Date: Mon, 13 Apr 2026 21:55:07 -0400 Subject: [PATCH] feat: per-extraction OCR language override MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow users to specify a Tesseract language string (e.g. jpn+jpn_vert) on a per-extraction basis, overriding the global OCR language setting. - Add payload column to ai_jobs table (migration) to carry per-call data - Thread ocrLanguages payload through enqueueJob → processNextJob → extractItemText - New GET /api/ai-settings/ocr endpoint (requireAuth) returns { ocrMode, ocrLanguages } - ImageLightbox fetches OCR settings and shows a language input next to the Extract Text button when mode is hybrid or tesseract (hidden for llm-only) - MixedView fetches OCR settings and passes them down to EntryTile; kebab Extract Text on images shows an inline language prompt before dispatching the job Co-Authored-By: Claude Sonnet 4.6 --- src/app/api/ai-settings/ocr/route.ts | 11 ++ src/app/api/ai-tagging/extract-text/route.ts | 12 +- src/components/DoomScrollView.tsx | 2 +- src/components/mixed/ImageLightbox.tsx | 133 ++++++++++++------- src/components/mixed/MixedView.tsx | 93 +++++++++++-- src/lib/ai-jobs.ts | 12 +- src/lib/ai-tagger.ts | 5 +- src/lib/db.ts | 8 ++ 8 files changed, 206 insertions(+), 70 deletions(-) create mode 100644 src/app/api/ai-settings/ocr/route.ts diff --git a/src/app/api/ai-settings/ocr/route.ts b/src/app/api/ai-settings/ocr/route.ts new file mode 100644 index 0000000..2ccf3ce --- /dev/null +++ b/src/app/api/ai-settings/ocr/route.ts @@ -0,0 +1,11 @@ +import { NextRequest, NextResponse } from 'next/server' +import { requireAuth } from '@/lib/auth' +import { getAiConfig } from '@/lib/app-settings' + +export async function GET(request: NextRequest) { + const auth = await requireAuth(request) + if (auth instanceof NextResponse) return auth + + const { ocrMode, ocrLanguages } = getAiConfig() + return NextResponse.json({ ocrMode, ocrLanguages }) +} diff --git a/src/app/api/ai-tagging/extract-text/route.ts b/src/app/api/ai-tagging/extract-text/route.ts index 5b6ad22..b213555 100644 --- a/src/app/api/ai-tagging/extract-text/route.ts +++ b/src/app/api/ai-tagging/extract-text/route.ts @@ -3,14 +3,14 @@ import { requireLibraryAccess } from '@/lib/auth' import { enqueueJob } from '@/lib/ai-jobs' export async function POST(request: NextRequest) { - let body: { itemKey?: string } + let body: { itemKey?: string; ocrLanguages?: string } try { body = await request.json() } catch { return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 }) } - const { itemKey } = body + const { itemKey, ocrLanguages } = body if (!itemKey || typeof itemKey !== 'string') { return NextResponse.json({ error: 'itemKey is required' }, { status: 400 }) } @@ -19,6 +19,12 @@ export async function POST(request: NextRequest) { const auth = await requireLibraryAccess(request, libraryId) if (auth instanceof NextResponse) return auth - const jobId = enqueueJob(itemKey, 'extract', libraryId) + const jobId = enqueueJob( + itemKey, + 'extract', + libraryId, + undefined, + ocrLanguages ? { ocrLanguages } : undefined, + ) return NextResponse.json({ jobId }, { status: 202 }) } diff --git a/src/components/DoomScrollView.tsx b/src/components/DoomScrollView.tsx index daa5f96..d750160 100644 --- a/src/components/DoomScrollView.tsx +++ b/src/components/DoomScrollView.tsx @@ -336,7 +336,7 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose, {/* Text overlay */} {showTextOverlay && displayText && (
e.stopPropagation()} > diff --git a/src/components/mixed/ImageLightbox.tsx b/src/components/mixed/ImageLightbox.tsx index baa3742..d059616 100644 --- a/src/components/mixed/ImageLightbox.tsx +++ b/src/components/mixed/ImageLightbox.tsx @@ -39,6 +39,11 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item const [descPending, setDescPending] = useState(false) const [descError, setDescError] = useState(null) + // OCR settings + const [ocrMode, setOcrMode] = useState(null) + const [defaultOcrLanguages, setDefaultOcrLanguages] = useState('eng') + const [ocrLanguageInput, setOcrLanguageInput] = useState('') + // Text overlay state const [showTextOverlay, setShowTextOverlay] = useState(false) const [showOriginal, setShowOriginal] = useState(false) @@ -68,6 +73,13 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item useEffect(() => { fetchAiFields() + fetch('/api/ai-settings/ocr') + .then((r) => r.json()) + .then((d: { ocrMode: string; ocrLanguages: string }) => { + setOcrMode(d.ocrMode) + setDefaultOcrLanguages(d.ocrLanguages) + }) + .catch(() => {}) return () => { if (pollRef.current) clearInterval(pollRef.current) } @@ -439,58 +451,79 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item Text Extraction

- + }} + onMouseLeave={(e) => { + if (!extractPending) { + ;(e.currentTarget as HTMLElement).style.backgroundColor = 'var(--border)' + ;(e.currentTarget as HTMLElement).style.color = 'var(--text-secondary)' + } + }} + > + {extracting ? '⟳ Extracting…' : extractPending ? '⟳ Queued…' : extractedText ? '🔍 Re-extract Text' : '🔍 Extract Text'} + + {ocrMode && ocrMode !== 'llm' && ( + setOcrLanguageInput(e.target.value)} + placeholder={defaultOcrLanguages} + className="text-xs px-2 py-0.5 rounded-full outline-none" + style={{ + backgroundColor: 'var(--background)', + border: '1px solid var(--border)', + color: 'var(--text-primary)', + width: 120, + }} + title="Tesseract language(s) for this extraction (e.g. jpn+jpn_vert). Leave blank to use the configured default." + /> + )} +
{extractError && (

{extractError}

diff --git a/src/components/mixed/MixedView.tsx b/src/components/mixed/MixedView.tsx index e51735d..ebd856d 100644 --- a/src/components/mixed/MixedView.tsx +++ b/src/components/mixed/MixedView.tsx @@ -83,6 +83,9 @@ export default function MixedView({ libraryId, initialPath }: Props) { setDoomScrollLoading(false) }, [currentPath]) + const [ocrMode, setOcrMode] = useState(null) + const [defaultOcrLanguages, setDefaultOcrLanguages] = useState('eng') + const fetchAssignments = useCallback(() => { fetch(`/api/tags/library-assignments?libraryId=${encodeURIComponent(libraryId)}`) .then((r) => r.json()) @@ -92,6 +95,16 @@ export default function MixedView({ libraryId, initialPath }: Props) { useEffect(() => { fetchAssignments() }, [fetchAssignments]) + useEffect(() => { + fetch('/api/ai-settings/ocr') + .then((r) => r.json()) + .then((d: { ocrMode: string; ocrLanguages: string }) => { + setOcrMode(d.ocrMode) + setDefaultOcrLanguages(d.ocrLanguages) + }) + .catch(() => {}) + }, []) + const filtersActive = search !== '' || selectedTagIds.size > 0 const fetchRecursive = useCallback(() => { @@ -387,6 +400,8 @@ export default function MixedView({ libraryId, initialPath }: Props) { entry={entry} onOpen={handleEntry} onTag={handleTagEntry} + ocrMode={ocrMode} + defaultOcrLanguages={defaultOcrLanguages} onAiTag={async (e) => { const itemKey = itemKeyFor(e) const res = await fetch('/api/ai-tagging', { @@ -401,7 +416,7 @@ export default function MixedView({ libraryId, initialPath }: Props) { fetchAssignments() setFilterRefreshKey((k) => k + 1) }} - onExtractText={async (e) => { + onExtractText={async (e, ocrLanguages) => { if (e.type === 'directory') { // Bulk extract for directory const dirRel = filtersActive ? e.name : (currentPath ? `${currentPath}/${e.name}` : e.name) @@ -420,7 +435,7 @@ export default function MixedView({ libraryId, initialPath }: Props) { const res = await fetch('/api/ai-tagging/extract-text', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ itemKey }), + body: JSON.stringify({ itemKey, ...(ocrLanguages && { ocrLanguages }) }), }) if (!res.ok) { const data = await res.json().catch(() => ({})) @@ -594,7 +609,7 @@ export default function MixedView({ libraryId, initialPath }: Props) { ) } -function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtractText, onDescribe, onTranslate }: { entry: FileEntry; onOpen: (e: FileEntry) => void; onTag: (e: FileEntry) => void; onDelete?: (e: FileEntry) => void; onRename?: (e: FileEntry, newName: string) => Promise; onAiTag?: (e: FileEntry) => Promise; onExtractText?: (e: FileEntry) => Promise; onDescribe?: (e: FileEntry) => Promise; onTranslate?: (e: FileEntry) => Promise }) { +function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtractText, onDescribe, onTranslate, ocrMode, defaultOcrLanguages }: { entry: FileEntry; onOpen: (e: FileEntry) => void; onTag: (e: FileEntry) => void; onDelete?: (e: FileEntry) => void; onRename?: (e: FileEntry, newName: string) => Promise; onAiTag?: (e: FileEntry) => Promise; onExtractText?: (e: FileEntry, ocrLanguages?: string) => Promise; onDescribe?: (e: FileEntry) => Promise; onTranslate?: (e: FileEntry) => Promise; ocrMode?: string | null; defaultOcrLanguages?: string }) { type ImgState = 'loading' | 'loaded' | 'error' const [imgState, setImgState] = useState( entry.thumbnailUrl ? 'loading' : 'error' @@ -615,6 +630,8 @@ function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtrac const [describeError, setDescribeError] = useState(null) const [translating, setTranslating] = useState(false) const [translateError, setTranslateError] = useState(null) + const [showOcrPrompt, setShowOcrPrompt] = useState(false) + const [ocrLanguageInput, setOcrLanguageInput] = useState('') useEffect(() => { if (!menuOpen) return @@ -804,16 +821,21 @@ function EntryTile({ entry, onOpen, onTag, onDelete, onRename, onAiTag, onExtrac 📝 Describe Folder )} - {onExtractText && entry.mediaType === 'image' && ( + {onExtractText && entry.mediaType === 'image' && !showOcrPrompt && ( )} + {onExtractText && entry.mediaType === 'image' && showOcrPrompt && ( +
e.stopPropagation()}> +

OCR language

+ setOcrLanguageInput(e.target.value)} + onKeyDown={(e) => { + if (e.key === 'Escape') { setShowOcrPrompt(false) } + if (e.key === 'Enter') { + setShowOcrPrompt(false) + setMenuOpen(false) + setTextExtracting(true) + setTextExtractError(null) + onExtractText(entry, ocrLanguageInput.trim() || undefined) + .catch((err) => setTextExtractError(err instanceof Error ? err.message : 'Text extraction failed')) + .finally(() => setTextExtracting(false)) + } + }} + placeholder={defaultOcrLanguages ?? 'eng'} + className="text-xs px-2 py-1 rounded-lg outline-none w-full" + style={{ backgroundColor: 'var(--background)', border: '1px solid var(--border)', color: 'var(--text-primary)' }} + title="Tesseract language(s) for this extraction (e.g. jpn+jpn_vert). Leave blank to use the configured default." + /> +
+ + +
+
+ )} {onExtractText && entry.type === 'directory' && (