add tesseract ocr
This commit is contained in:
117
package-lock.json
generated
117
package-lock.json
generated
@@ -17,7 +17,8 @@
|
||||
"node-cron": "^4.2.1",
|
||||
"react": "^19.2.4",
|
||||
"react-dom": "^19.2.4",
|
||||
"sharp": "^0.34.5"
|
||||
"sharp": "^0.34.5",
|
||||
"tesseract.js": "^7.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@tailwindcss/postcss": "^4.2.2",
|
||||
@@ -2950,6 +2951,12 @@
|
||||
"readable-stream": "^3.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/bmp-js": {
|
||||
"version": "0.1.0",
|
||||
"resolved": "https://registry.npmjs.org/bmp-js/-/bmp-js-0.1.0.tgz",
|
||||
"integrity": "sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/brace-expansion": {
|
||||
"version": "1.1.12",
|
||||
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
|
||||
@@ -4803,6 +4810,12 @@
|
||||
"hermes-estree": "0.25.1"
|
||||
}
|
||||
},
|
||||
"node_modules/idb-keyval": {
|
||||
"version": "6.2.2",
|
||||
"resolved": "https://registry.npmjs.org/idb-keyval/-/idb-keyval-6.2.2.tgz",
|
||||
"integrity": "sha512-yjD9nARJ/jb1g+CvD0tlhUHOrJ9Sy0P8T9MF3YaLlHnSRpwPfpTX0XIvpmw3gAJUmEu3FiICLBDPXVwyEvrleg==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/ieee754": {
|
||||
"version": "1.2.1",
|
||||
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
|
||||
@@ -5288,6 +5301,12 @@
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/is-url": {
|
||||
"version": "1.2.4",
|
||||
"resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz",
|
||||
"integrity": "sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/is-weakmap": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/is-weakmap/-/is-weakmap-2.0.2.tgz",
|
||||
@@ -6167,6 +6186,26 @@
|
||||
"semver": "bin/semver.js"
|
||||
}
|
||||
},
|
||||
"node_modules/node-fetch": {
|
||||
"version": "2.7.0",
|
||||
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
|
||||
"integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"whatwg-url": "^5.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": "4.x || >=6.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"encoding": "^0.1.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"encoding": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/node-releases": {
|
||||
"version": "2.0.36",
|
||||
"resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.36.tgz",
|
||||
@@ -6315,6 +6354,15 @@
|
||||
"wrappy": "1"
|
||||
}
|
||||
},
|
||||
"node_modules/opencollective-postinstall": {
|
||||
"version": "2.0.3",
|
||||
"resolved": "https://registry.npmjs.org/opencollective-postinstall/-/opencollective-postinstall-2.0.3.tgz",
|
||||
"integrity": "sha512-8AV/sCtuzUeTo8gQK5qDZzARrulB3egtLzFgteqB2tcT4Mw7B8Kt7JcDHmltjz6FOAHsvTevk70gZEbhM4ZS9Q==",
|
||||
"license": "MIT",
|
||||
"bin": {
|
||||
"opencollective-postinstall": "index.js"
|
||||
}
|
||||
},
|
||||
"node_modules/optionator": {
|
||||
"version": "0.9.4",
|
||||
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz",
|
||||
@@ -6747,6 +6795,12 @@
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/regenerator-runtime": {
|
||||
"version": "0.13.11",
|
||||
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz",
|
||||
"integrity": "sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/regexp.prototype.flags": {
|
||||
"version": "1.5.4",
|
||||
"resolved": "https://registry.npmjs.org/regexp.prototype.flags/-/regexp.prototype.flags-1.5.4.tgz",
|
||||
@@ -7585,6 +7639,30 @@
|
||||
"streamx": "^2.12.5"
|
||||
}
|
||||
},
|
||||
"node_modules/tesseract.js": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/tesseract.js/-/tesseract.js-7.0.0.tgz",
|
||||
"integrity": "sha512-exPBkd+z+wM1BuMkx/Bjv43OeLBxhL5kKWsz/9JY+DXcXdiBjiAch0V49QR3oAJqCaL5qURE0vx9Eo+G5YE7mA==",
|
||||
"hasInstallScript": true,
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"bmp-js": "^0.1.0",
|
||||
"idb-keyval": "^6.2.0",
|
||||
"is-url": "^1.2.4",
|
||||
"node-fetch": "^2.6.9",
|
||||
"opencollective-postinstall": "^2.0.3",
|
||||
"regenerator-runtime": "^0.13.3",
|
||||
"tesseract.js-core": "^7.0.0",
|
||||
"wasm-feature-detect": "^1.8.0",
|
||||
"zlibjs": "^0.3.1"
|
||||
}
|
||||
},
|
||||
"node_modules/tesseract.js-core": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-7.0.0.tgz",
|
||||
"integrity": "sha512-WnNH518NzmbSq9zgTPeoF8c+xmilS8rFIl1YKbk/ptuuc7p6cLNELNuPAzcmsYw450ca6bLa8j3t0VAtq435Vw==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/text-decoder": {
|
||||
"version": "1.2.7",
|
||||
"resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.7.tgz",
|
||||
@@ -7655,6 +7733,12 @@
|
||||
"node": ">=8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/tr46": {
|
||||
"version": "0.0.3",
|
||||
"resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
|
||||
"integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/ts-api-utils": {
|
||||
"version": "2.5.0",
|
||||
"resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.5.0.tgz",
|
||||
@@ -7955,6 +8039,28 @@
|
||||
"integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/wasm-feature-detect": {
|
||||
"version": "1.8.0",
|
||||
"resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz",
|
||||
"integrity": "sha512-zksaLKM2fVlnB5jQQDqKXXwYHLQUVH9es+5TOOHwGOVJOCeRBCiPjwSg+3tN2AdTCzjgli4jijCH290kXb/zWQ==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/webidl-conversions": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
|
||||
"integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==",
|
||||
"license": "BSD-2-Clause"
|
||||
},
|
||||
"node_modules/whatwg-url": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
|
||||
"integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"tr46": "~0.0.3",
|
||||
"webidl-conversions": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/which": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
|
||||
@@ -8237,6 +8343,15 @@
|
||||
"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/zlibjs": {
|
||||
"version": "0.3.1",
|
||||
"resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz",
|
||||
"integrity": "sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/zod": {
|
||||
"version": "4.3.6",
|
||||
"resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz",
|
||||
|
||||
@@ -20,7 +20,8 @@
|
||||
"node-cron": "^4.2.1",
|
||||
"react": "^19.2.4",
|
||||
"react-dom": "^19.2.4",
|
||||
"sharp": "^0.34.5"
|
||||
"sharp": "^0.34.5",
|
||||
"tesseract.js": "^7.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@tailwindcss/postcss": "^4.2.2",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { NextRequest, NextResponse } from 'next/server'
|
||||
import { requireAdmin } from '@/lib/auth'
|
||||
import { getAiConfig, updateAiConfig, getPreferredLanguage, setPreferredLanguage, getAiMaxRetries, setAiMaxRetries } from '@/lib/app-settings'
|
||||
import { getAiConfig, updateAiConfig, getPreferredLanguage, setPreferredLanguage, getAiMaxRetries, setAiMaxRetries, type OcrMode } from '@/lib/app-settings'
|
||||
|
||||
export async function GET(request: NextRequest) {
|
||||
const auth = await requireAdmin(request)
|
||||
@@ -34,6 +34,9 @@ export async function PUT(request: NextRequest) {
|
||||
maxTokensDescribe?: number
|
||||
maxTokensExtract?: number
|
||||
maxTokensTranslate?: number
|
||||
ocrMode?: string
|
||||
ocrLanguages?: string
|
||||
ocrConfidenceThreshold?: number
|
||||
}
|
||||
try {
|
||||
body = await request.json()
|
||||
@@ -47,6 +50,7 @@ export async function PUT(request: NextRequest) {
|
||||
promptDescribe, promptTagger, promptExtract, promptTranslate,
|
||||
maxRetries,
|
||||
maxTokensTag, maxTokensDescribe, maxTokensExtract, maxTokensTranslate,
|
||||
ocrMode, ocrLanguages, ocrConfidenceThreshold,
|
||||
} = body
|
||||
|
||||
if (typeof endpoint !== 'string') {
|
||||
@@ -75,6 +79,9 @@ export async function PUT(request: NextRequest) {
|
||||
typeof maxTokensDescribe === 'number' ? maxTokensDescribe : undefined,
|
||||
typeof maxTokensExtract === 'number' ? maxTokensExtract : undefined,
|
||||
typeof maxTokensTranslate === 'number' ? maxTokensTranslate : undefined,
|
||||
(ocrMode === 'hybrid' || ocrMode === 'tesseract' || ocrMode === 'llm') ? (ocrMode as OcrMode) : undefined,
|
||||
typeof ocrLanguages === 'string' ? ocrLanguages : undefined,
|
||||
typeof ocrConfidenceThreshold === 'number' ? ocrConfidenceThreshold : undefined,
|
||||
)
|
||||
|
||||
if (typeof preferredLanguage === 'string' && preferredLanguage.trim()) {
|
||||
|
||||
@@ -20,6 +20,9 @@ interface AiSettings {
|
||||
maxTokensDescribe: number
|
||||
maxTokensExtract: number
|
||||
maxTokensTranslate: number
|
||||
ocrMode: 'hybrid' | 'tesseract' | 'llm'
|
||||
ocrLanguages: string
|
||||
ocrConfidenceThreshold: number
|
||||
}
|
||||
|
||||
interface AiJob {
|
||||
@@ -76,6 +79,7 @@ export default function AiTaggingPage() {
|
||||
promptDescribe: '', promptTagger: '', promptExtract: '', promptTranslate: '',
|
||||
maxRetries: 3,
|
||||
maxTokensTag: 8192, maxTokensDescribe: 8192, maxTokensExtract: 8192, maxTokensTranslate: 8192,
|
||||
ocrMode: 'hybrid', ocrLanguages: 'eng', ocrConfidenceThreshold: 70,
|
||||
})
|
||||
const [loading, setLoading] = useState(true)
|
||||
const [saving, setSaving] = useState(false)
|
||||
@@ -644,6 +648,72 @@ export default function AiTaggingPage() {
|
||||
/>
|
||||
</Field>
|
||||
|
||||
<Field label="OCR Mode">
|
||||
<div className="flex gap-2">
|
||||
{(['hybrid', 'tesseract', 'llm'] as const).map((mode) => (
|
||||
<button
|
||||
key={mode}
|
||||
type="button"
|
||||
onClick={() => setSettings((s) => ({ ...s, ocrMode: mode }))}
|
||||
className="px-3 py-1.5 rounded-lg text-sm transition-colors"
|
||||
style={{
|
||||
backgroundColor: settings.ocrMode === mode ? 'var(--accent)' : 'var(--surface)',
|
||||
color: settings.ocrMode === mode ? '#fff' : 'var(--text-secondary)',
|
||||
border: '1px solid var(--border)',
|
||||
}}
|
||||
>
|
||||
{mode === 'hybrid' ? 'Hybrid' : mode === 'tesseract' ? 'Tesseract only' : 'LLM only'}
|
||||
</button>
|
||||
))}
|
||||
</div>
|
||||
<p className="mt-1 text-xs" style={{ color: 'var(--text-secondary)' }}>
|
||||
Hybrid runs local OCR first and falls back to the LLM when confidence is low. Tesseract only never calls the LLM. LLM only uses the original behaviour.
|
||||
</p>
|
||||
</Field>
|
||||
|
||||
<Field label="OCR Languages">
|
||||
<input
|
||||
type="text"
|
||||
value={settings.ocrLanguages}
|
||||
onChange={(e) => setSettings((s) => ({ ...s, ocrLanguages: e.target.value }))}
|
||||
placeholder="eng"
|
||||
className="w-full rounded-lg px-3 py-2 text-sm font-mono outline-none focus:ring-2"
|
||||
style={{
|
||||
backgroundColor: 'var(--background)',
|
||||
border: '1px solid var(--border)',
|
||||
color: 'var(--text-primary)',
|
||||
}}
|
||||
onFocus={(e) => ((e.currentTarget as HTMLElement).style.borderColor = 'var(--accent)')}
|
||||
onBlur={(e) => ((e.currentTarget as HTMLElement).style.borderColor = 'var(--border)')}
|
||||
/>
|
||||
<p className="mt-1 text-xs" style={{ color: 'var(--text-secondary)' }}>
|
||||
{`Tesseract language packs to use, joined with '+'. For Japanese manga use jpn+jpn_vert. Language data is downloaded automatically on first use.`}
|
||||
</p>
|
||||
</Field>
|
||||
|
||||
<Field label="OCR Confidence Threshold">
|
||||
<input
|
||||
type="number"
|
||||
min={0}
|
||||
max={100}
|
||||
value={settings.ocrConfidenceThreshold}
|
||||
onChange={(e) =>
|
||||
setSettings((s) => ({ ...s, ocrConfidenceThreshold: Math.max(0, Math.min(100, parseInt(e.target.value) || 70)) }))
|
||||
}
|
||||
className="w-24 rounded-lg px-3 py-2 text-sm outline-none focus:ring-2"
|
||||
style={{
|
||||
backgroundColor: 'var(--background)',
|
||||
border: '1px solid var(--border)',
|
||||
color: 'var(--text-primary)',
|
||||
}}
|
||||
onFocus={(e) => ((e.currentTarget as HTMLElement).style.borderColor = 'var(--accent)')}
|
||||
onBlur={(e) => ((e.currentTarget as HTMLElement).style.borderColor = 'var(--border)')}
|
||||
/>
|
||||
<p className="mt-1 text-xs" style={{ color: 'var(--text-secondary)' }}>
|
||||
In hybrid mode, Tesseract results below this confidence score (0–100) fall back to the LLM. Default is 70.
|
||||
</p>
|
||||
</Field>
|
||||
|
||||
<Field label="Translation Model">
|
||||
<input
|
||||
type="text"
|
||||
|
||||
@@ -4,7 +4,7 @@ import type { Library, Tag, TagCategory } from '@/types'
|
||||
import { getDb } from './db'
|
||||
import { getAiConfig, getEffectiveAiConfig, getPreferredLanguage } from './app-settings'
|
||||
import { getTags, getCategories, addTagToItem, getActiveCategoryIdsForLibrary, getResolvedTagsForItem } from './tags'
|
||||
import { getAiImagePath, getVideoFramePaths } from './thumbnails'
|
||||
import { getAiImagePath, getOcrImagePath, getVideoFramePaths } from './thumbnails'
|
||||
import { findFile } from './media-utils'
|
||||
import { getLibrary, resolveLibraryRoot } from './libraries'
|
||||
|
||||
@@ -509,7 +509,31 @@ export async function generateItemDescription(itemKey: string): Promise<string>
|
||||
// ─── Text extraction ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Extract text (OCR) from an image using the vision model.
|
||||
* Run Tesseract OCR on a preprocessed image file.
|
||||
* Returns the extracted text and a mean confidence score (0–100).
|
||||
* A confidence of 0 with empty text means no recognisable text was found.
|
||||
*/
|
||||
async function extractWithTesseract(
|
||||
imagePath: string,
|
||||
languages: string,
|
||||
): Promise<{ text: string; confidence: number }> {
|
||||
const { createWorker } = await import('tesseract.js')
|
||||
const workerPath = path.join(process.cwd(), 'node_modules/tesseract.js/src/worker-script/node/index.js')
|
||||
const worker = await createWorker(languages, 1, { workerPath })
|
||||
try {
|
||||
const { data } = await worker.recognize(imagePath)
|
||||
return { text: data.text.trim(), confidence: data.confidence }
|
||||
} finally {
|
||||
await worker.terminate()
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract text (OCR) from an image using the configured OCR mode:
|
||||
* - hybrid: try Tesseract first; fall back to LLM if confidence is below threshold
|
||||
* - tesseract: local Tesseract only, no LLM call
|
||||
* - llm: LLM vision API only (original behaviour)
|
||||
*
|
||||
* Only works for images in mixed libraries.
|
||||
* Translation is not performed automatically — call translateItemText() separately.
|
||||
* Returns { extractedText, translatedText } where translatedText is always null.
|
||||
@@ -517,10 +541,6 @@ export async function generateItemDescription(itemKey: string): Promise<string>
|
||||
export async function extractItemText(itemKey: string): Promise<{ extractedText: string; translatedText: string | null }> {
|
||||
const libraryId = itemKey.split(':')[0]
|
||||
const config = getEffectiveAiConfig(libraryId)
|
||||
const extractModel = config.modelExtract || config.model
|
||||
if (!config.endpoint || !extractModel) {
|
||||
throw Object.assign(new Error('AI endpoint and model are not configured'), { code: 'NOT_CONFIGURED' })
|
||||
}
|
||||
|
||||
const db = getDb()
|
||||
const item = db
|
||||
@@ -547,12 +567,39 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText:
|
||||
throw Object.assign(new Error('Text extraction is only available for images'), { code: 'NO_IMAGE' })
|
||||
}
|
||||
|
||||
const { ocrMode, ocrLanguages, ocrConfidenceThreshold } = config
|
||||
|
||||
// ── Tesseract path ────────────────────────────────────────────────────────
|
||||
if (ocrMode === 'tesseract' || ocrMode === 'hybrid') {
|
||||
const ocrImagePath = await getOcrImagePath(resolvedMedia.path, libraryId)
|
||||
const { text, confidence } = await extractWithTesseract(ocrImagePath, ocrLanguages)
|
||||
|
||||
const useTesseractResult = ocrMode === 'tesseract' || confidence >= ocrConfidenceThreshold
|
||||
if (useTesseractResult) {
|
||||
console.log(`[ocr] tesseract used for ${itemKey} (confidence=${confidence}, mode=${ocrMode})`)
|
||||
if (!text) {
|
||||
db.prepare('UPDATE media_items SET extracted_text = NULL, extracted_text_translated = NULL WHERE item_key = ?').run(itemKey)
|
||||
return { extractedText: '', translatedText: null }
|
||||
}
|
||||
db.prepare('UPDATE media_items SET extracted_text = ?, extracted_text_translated = NULL WHERE item_key = ?').run(text, itemKey)
|
||||
return { extractedText: text, translatedText: null }
|
||||
}
|
||||
console.log(`[ocr] tesseract confidence too low (${confidence} < ${ocrConfidenceThreshold}), falling back to LLM for ${itemKey}`)
|
||||
}
|
||||
|
||||
// ── LLM vision path ───────────────────────────────────────────────────────
|
||||
const extractModel = config.modelExtract || config.model
|
||||
if (!config.endpoint || !extractModel) {
|
||||
throw Object.assign(new Error('AI endpoint and model are not configured'), { code: 'NOT_CONFIGURED' })
|
||||
}
|
||||
|
||||
const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId)
|
||||
const base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
|
||||
|
||||
const customInstruction = config.promptExtract ? ' ' + config.promptExtract : ''
|
||||
const systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction} If there is no text in the image, respond with exactly: [NO TEXT]`
|
||||
|
||||
console.log(`[ocr] llm used for ${itemKey} (mode=${ocrMode})`)
|
||||
const extractedText = await callVisionApiText(config.endpoint, extractModel, base64Images, systemPrompt, config.maxTokensExtract)
|
||||
|
||||
if (!extractedText || extractedText === '[NO TEXT]') {
|
||||
|
||||
@@ -46,6 +46,8 @@ const DEFAULT_PROMPT_EXTRACT =
|
||||
'Be mindful of different colors of text that may indicate different speakers or emphasis.'
|
||||
const DEFAULT_PROMPT_TRANSLATE = 'Return ONLY the translated text with no additional commentary.'
|
||||
|
||||
export type OcrMode = 'hybrid' | 'tesseract' | 'llm'
|
||||
|
||||
export interface AiConfig {
|
||||
endpoint: string
|
||||
model: string
|
||||
@@ -62,6 +64,9 @@ export interface AiConfig {
|
||||
maxTokensDescribe: number
|
||||
maxTokensExtract: number
|
||||
maxTokensTranslate: number
|
||||
ocrMode: OcrMode
|
||||
ocrLanguages: string
|
||||
ocrConfidenceThreshold: number
|
||||
}
|
||||
|
||||
export function getAiConfig(): AiConfig {
|
||||
@@ -84,10 +89,15 @@ export function getAiConfig(): AiConfig {
|
||||
const maxTokensDescribe = parseInt(getSetting('ai_max_tokens_describe') ?? '8192', 10) || 8192
|
||||
const maxTokensExtract = parseInt(getSetting('ai_max_tokens_extract') ?? '8192', 10) || 8192
|
||||
const maxTokensTranslate = parseInt(getSetting('ai_max_tokens_translate') ?? '8192', 10) || 8192
|
||||
const rawOcrMode = getSetting('ai_ocr_mode') ?? 'hybrid'
|
||||
const ocrMode: OcrMode = rawOcrMode === 'tesseract' || rawOcrMode === 'llm' ? rawOcrMode : 'hybrid'
|
||||
const ocrLanguages = getSetting('ai_ocr_languages') ?? 'eng'
|
||||
const ocrConfidenceThreshold = parseInt(getSetting('ai_ocr_confidence_threshold') ?? '70', 10) || 70
|
||||
return {
|
||||
endpoint, model, modelTagging, modelDescribe, modelExtract, modelTranslate, enabled,
|
||||
promptDescribe, promptTagger, promptExtract, promptTranslate,
|
||||
maxTokensTag, maxTokensDescribe, maxTokensExtract, maxTokensTranslate,
|
||||
ocrMode, ocrLanguages, ocrConfidenceThreshold,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,6 +117,9 @@ export function updateAiConfig(
|
||||
maxTokensDescribe?: number,
|
||||
maxTokensExtract?: number,
|
||||
maxTokensTranslate?: number,
|
||||
ocrMode?: OcrMode,
|
||||
ocrLanguages?: string,
|
||||
ocrConfidenceThreshold?: number,
|
||||
): void {
|
||||
setSetting('ai_endpoint', endpoint)
|
||||
setSetting('ai_model', model)
|
||||
@@ -123,6 +136,9 @@ export function updateAiConfig(
|
||||
if (maxTokensDescribe !== undefined) setSetting('ai_max_tokens_describe', String(Math.max(1, Math.floor(maxTokensDescribe))))
|
||||
if (maxTokensExtract !== undefined) setSetting('ai_max_tokens_extract', String(Math.max(1, Math.floor(maxTokensExtract))))
|
||||
if (maxTokensTranslate !== undefined) setSetting('ai_max_tokens_translate', String(Math.max(1, Math.floor(maxTokensTranslate))))
|
||||
if (ocrMode !== undefined) setSetting('ai_ocr_mode', ocrMode)
|
||||
if (ocrLanguages !== undefined) setSetting('ai_ocr_languages', ocrLanguages.trim() || 'eng')
|
||||
if (ocrConfidenceThreshold !== undefined) setSetting('ai_ocr_confidence_threshold', String(Math.max(0, Math.min(100, Math.floor(ocrConfidenceThreshold)))))
|
||||
}
|
||||
|
||||
export function getPreferredLanguage(): string {
|
||||
@@ -249,6 +265,9 @@ export function getEffectiveAiConfig(libraryId: string): AiConfig {
|
||||
maxTokensDescribe: overrides.maxTokensDescribe ?? global.maxTokensDescribe,
|
||||
maxTokensExtract: overrides.maxTokensExtract ?? global.maxTokensExtract,
|
||||
maxTokensTranslate: overrides.maxTokensTranslate ?? global.maxTokensTranslate,
|
||||
ocrMode: global.ocrMode,
|
||||
ocrLanguages: global.ocrLanguages,
|
||||
ocrConfidenceThreshold: global.ocrConfidenceThreshold,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -60,6 +60,19 @@ async function generateAiImage(src: string, dest: string): Promise<void> {
|
||||
fs.renameSync(tmp, dest)
|
||||
}
|
||||
|
||||
/** Generate a grayscale, contrast-normalised PNG for local OCR (Tesseract).
|
||||
* PNG is lossless and avoids JPEG artefacts that can degrade OCR accuracy. */
|
||||
async function generateOcrImage(src: string, dest: string): Promise<void> {
|
||||
const tmp = dest + '.tmp'
|
||||
await sharp(src)
|
||||
.resize(AI_IMAGE_WIDTH, undefined, { withoutEnlargement: true })
|
||||
.grayscale()
|
||||
.normalise()
|
||||
.png()
|
||||
.toFile(tmp)
|
||||
fs.renameSync(tmp, dest)
|
||||
}
|
||||
|
||||
/** Run a child process and collect stderr. Resolves on exit code 0, rejects otherwise. */
|
||||
function run(bin: string, args: string[]): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
@@ -190,6 +203,24 @@ export async function getAiImagePath(
|
||||
return cacheFile
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the absolute path to a preprocessed PNG suitable for local OCR.
|
||||
* The image is converted to grayscale and contrast-normalised for better
|
||||
* Tesseract accuracy. Cached with an `_ocr` suffix.
|
||||
*/
|
||||
export async function getOcrImagePath(
|
||||
absoluteFilePath: string,
|
||||
libraryId: string
|
||||
): Promise<string> {
|
||||
ensureCacheDir()
|
||||
const key = cacheKey(libraryId, absoluteFilePath)
|
||||
const cacheFile = path.join(CACHE_DIR, key + '_ocr.png')
|
||||
const cached = getCachedPath(cacheFile, absoluteFilePath)
|
||||
if (cached) return cached
|
||||
await generateOcrImage(absoluteFilePath, cacheFile)
|
||||
return cacheFile
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the absolute path to a cached thumbnail JPEG for the given file.
|
||||
* Generates it on first call (or when the source has been modified).
|
||||
|
||||
Reference in New Issue
Block a user