add tesseract ocr

This commit is contained in:
Garret Patti
2026-04-13 19:40:25 -04:00
parent 1350a6f94b
commit 9b2690f639
7 changed files with 299 additions and 9 deletions

View File

@@ -4,7 +4,7 @@ import type { Library, Tag, TagCategory } from '@/types'
import { getDb } from './db'
import { getAiConfig, getEffectiveAiConfig, getPreferredLanguage } from './app-settings'
import { getTags, getCategories, addTagToItem, getActiveCategoryIdsForLibrary, getResolvedTagsForItem } from './tags'
import { getAiImagePath, getVideoFramePaths } from './thumbnails'
import { getAiImagePath, getOcrImagePath, getVideoFramePaths } from './thumbnails'
import { findFile } from './media-utils'
import { getLibrary, resolveLibraryRoot } from './libraries'
@@ -509,7 +509,31 @@ export async function generateItemDescription(itemKey: string): Promise<string>
// ─── Text extraction ─────────────────────────────────────────────────────────
/**
 * Recognise text in a preprocessed image file with a local Tesseract worker.
 *
 * A fresh worker is created for every call and is always terminated before
 * the function settles, whether recognition succeeds or throws.
 *
 * @param imagePath - Path of the image file handed to the worker.
 * @param languages - Language specification passed straight to `createWorker`.
 * @returns The recognised text (trimmed) and the mean confidence score (0–100).
 *   A confidence of 0 with empty text means no recognisable text was found.
 */
async function extractWithTesseract(
  imagePath: string,
  languages: string,
): Promise<{ text: string; confidence: number }> {
  // Loaded lazily so the module is only pulled in when OCR actually runs.
  const tesseract = await import('tesseract.js')

  // Point the library at its Node worker script explicitly, resolved from the
  // process working directory.
  // NOTE(review): presumably needed because bundling breaks the default lookup — confirm.
  const workerPath = path.join(process.cwd(), 'node_modules/tesseract.js/src/worker-script/node/index.js')

  // NOTE(review): the second argument (1) is presumably the OCR engine mode — verify against the tesseract.js API.
  const ocrWorker = await tesseract.createWorker(languages, 1, { workerPath })
  try {
    const recognition = await ocrWorker.recognize(imagePath)
    const text = recognition.data.text.trim()
    const confidence = recognition.data.confidence
    return { text, confidence }
  } finally {
    await ocrWorker.terminate()
  }
}
/**
* Extract text (OCR) from an image using the configured OCR mode:
* - hybrid: try Tesseract first; fall back to LLM if confidence is below threshold
* - tesseract: local Tesseract only, no LLM call
* - llm: LLM vision API only (original behaviour)
*
* Only works for images in mixed libraries.
* Translation is not performed automatically — call translateItemText() separately.
* Returns { extractedText, translatedText } where translatedText is always null.
@@ -517,10 +541,6 @@ export async function generateItemDescription(itemKey: string): Promise<string>
export async function extractItemText(itemKey: string): Promise<{ extractedText: string; translatedText: string | null }> {
const libraryId = itemKey.split(':')[0]
const config = getEffectiveAiConfig(libraryId)
const extractModel = config.modelExtract || config.model
if (!config.endpoint || !extractModel) {
throw Object.assign(new Error('AI endpoint and model are not configured'), { code: 'NOT_CONFIGURED' })
}
const db = getDb()
const item = db
@@ -547,12 +567,39 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText:
throw Object.assign(new Error('Text extraction is only available for images'), { code: 'NO_IMAGE' })
}
const { ocrMode, ocrLanguages, ocrConfidenceThreshold } = config
// ── Tesseract path ────────────────────────────────────────────────────────
if (ocrMode === 'tesseract' || ocrMode === 'hybrid') {
const ocrImagePath = await getOcrImagePath(resolvedMedia.path, libraryId)
const { text, confidence } = await extractWithTesseract(ocrImagePath, ocrLanguages)
const useTesseractResult = ocrMode === 'tesseract' || confidence >= ocrConfidenceThreshold
if (useTesseractResult) {
console.log(`[ocr] tesseract used for ${itemKey} (confidence=${confidence}, mode=${ocrMode})`)
if (!text) {
db.prepare('UPDATE media_items SET extracted_text = NULL, extracted_text_translated = NULL WHERE item_key = ?').run(itemKey)
return { extractedText: '', translatedText: null }
}
db.prepare('UPDATE media_items SET extracted_text = ?, extracted_text_translated = NULL WHERE item_key = ?').run(text, itemKey)
return { extractedText: text, translatedText: null }
}
console.log(`[ocr] tesseract confidence too low (${confidence} < ${ocrConfidenceThreshold}), falling back to LLM for ${itemKey}`)
}
// ── LLM vision path ───────────────────────────────────────────────────────
const extractModel = config.modelExtract || config.model
if (!config.endpoint || !extractModel) {
throw Object.assign(new Error('AI endpoint and model are not configured'), { code: 'NOT_CONFIGURED' })
}
const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId)
const base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
const customInstruction = config.promptExtract ? ' ' + config.promptExtract : ''
const systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${customInstruction} If there is no text in the image, respond with exactly: [NO TEXT]`
console.log(`[ocr] llm used for ${itemKey} (mode=${ocrMode})`)
const extractedText = await callVisionApiText(config.endpoint, extractModel, base64Images, systemPrompt, config.maxTokensExtract)
if (!extractedText || extractedText === '[NO TEXT]') {

View File

@@ -46,6 +46,8 @@ const DEFAULT_PROMPT_EXTRACT =
'Be mindful of different colors of text that may indicate different speakers or emphasis.'
const DEFAULT_PROMPT_TRANSLATE = 'Return ONLY the translated text with no additional commentary.'
/**
 * Strategy used when extracting text from an image:
 * - `hybrid`: try Tesseract first; fall back to the LLM if confidence is below the threshold
 * - `tesseract`: local Tesseract only, no LLM call
 * - `llm`: LLM vision API only
 */
export type OcrMode =
  | 'hybrid'
  | 'tesseract'
  | 'llm'
export interface AiConfig {
endpoint: string
model: string
@@ -62,6 +64,9 @@ export interface AiConfig {
maxTokensDescribe: number
maxTokensExtract: number
maxTokensTranslate: number
ocrMode: OcrMode
ocrLanguages: string
ocrConfidenceThreshold: number
}
export function getAiConfig(): AiConfig {
@@ -84,10 +89,15 @@ export function getAiConfig(): AiConfig {
const maxTokensDescribe = parseInt(getSetting('ai_max_tokens_describe') ?? '8192', 10) || 8192
const maxTokensExtract = parseInt(getSetting('ai_max_tokens_extract') ?? '8192', 10) || 8192
const maxTokensTranslate = parseInt(getSetting('ai_max_tokens_translate') ?? '8192', 10) || 8192
const rawOcrMode = getSetting('ai_ocr_mode') ?? 'hybrid'
const ocrMode: OcrMode = rawOcrMode === 'tesseract' || rawOcrMode === 'llm' ? rawOcrMode : 'hybrid'
const ocrLanguages = getSetting('ai_ocr_languages') ?? 'eng'
const ocrConfidenceThreshold = parseInt(getSetting('ai_ocr_confidence_threshold') ?? '70', 10) || 70
return {
endpoint, model, modelTagging, modelDescribe, modelExtract, modelTranslate, enabled,
promptDescribe, promptTagger, promptExtract, promptTranslate,
maxTokensTag, maxTokensDescribe, maxTokensExtract, maxTokensTranslate,
ocrMode, ocrLanguages, ocrConfidenceThreshold,
}
}
@@ -107,6 +117,9 @@ export function updateAiConfig(
maxTokensDescribe?: number,
maxTokensExtract?: number,
maxTokensTranslate?: number,
ocrMode?: OcrMode,
ocrLanguages?: string,
ocrConfidenceThreshold?: number,
): void {
setSetting('ai_endpoint', endpoint)
setSetting('ai_model', model)
@@ -123,6 +136,9 @@ export function updateAiConfig(
if (maxTokensDescribe !== undefined) setSetting('ai_max_tokens_describe', String(Math.max(1, Math.floor(maxTokensDescribe))))
if (maxTokensExtract !== undefined) setSetting('ai_max_tokens_extract', String(Math.max(1, Math.floor(maxTokensExtract))))
if (maxTokensTranslate !== undefined) setSetting('ai_max_tokens_translate', String(Math.max(1, Math.floor(maxTokensTranslate))))
if (ocrMode !== undefined) setSetting('ai_ocr_mode', ocrMode)
if (ocrLanguages !== undefined) setSetting('ai_ocr_languages', ocrLanguages.trim() || 'eng')
if (ocrConfidenceThreshold !== undefined) setSetting('ai_ocr_confidence_threshold', String(Math.max(0, Math.min(100, Math.floor(ocrConfidenceThreshold)))))
}
export function getPreferredLanguage(): string {
@@ -249,6 +265,9 @@ export function getEffectiveAiConfig(libraryId: string): AiConfig {
maxTokensDescribe: overrides.maxTokensDescribe ?? global.maxTokensDescribe,
maxTokensExtract: overrides.maxTokensExtract ?? global.maxTokensExtract,
maxTokensTranslate: overrides.maxTokensTranslate ?? global.maxTokensTranslate,
ocrMode: global.ocrMode,
ocrLanguages: global.ocrLanguages,
ocrConfidenceThreshold: global.ocrConfidenceThreshold,
}
}

View File

@@ -60,6 +60,19 @@ async function generateAiImage(src: string, dest: string): Promise<void> {
fs.renameSync(tmp, dest)
}
/**
 * Produce a grayscale, contrast-normalised PNG of `src` at `dest` for local
 * OCR (Tesseract). PNG is lossless, which avoids JPEG artefacts that can
 * degrade OCR accuracy.
 *
 * The image is resized to at most AI_IMAGE_WIDTH wide (never enlarged),
 * written to `dest + '.tmp'`, and then renamed onto `dest`.
 *
 * @param src - Path of the source image.
 * @param dest - Path where the finished PNG should end up.
 */
async function generateOcrImage(src: string, dest: string): Promise<void> {
  const tmp = `${dest}.tmp`
  const pipeline = sharp(src)
    .resize(AI_IMAGE_WIDTH, undefined, { withoutEnlargement: true })
    .grayscale()
    .normalise()
    .png()
  await pipeline.toFile(tmp)
  // Move into place only once the temporary file has been written in full.
  fs.renameSync(tmp, dest)
}
/** Run a child process and collect stderr. Resolves on exit code 0, rejects otherwise. */
function run(bin: string, args: string[]): Promise<void> {
return new Promise((resolve, reject) => {
@@ -190,6 +203,24 @@ export async function getAiImagePath(
return cacheFile
}
/**
 * Resolve the absolute path of the preprocessed PNG used for local OCR,
 * generating it when no usable cached copy is reported.
 *
 * The cached file lives in the cache directory and is named after the
 * library/file cache key with an `_ocr` suffix. The image itself is grayscale
 * and contrast-normalised for better Tesseract accuracy.
 *
 * @param absoluteFilePath - Absolute path of the source media file.
 * @param libraryId - Library the file belongs to; part of the cache key.
 * @returns Absolute path of the OCR-ready PNG.
 */
export async function getOcrImagePath(
  absoluteFilePath: string,
  libraryId: string
): Promise<string> {
  ensureCacheDir()
  const ocrCacheFile = path.join(CACHE_DIR, `${cacheKey(libraryId, absoluteFilePath)}_ocr.png`)

  // NOTE(review): getCachedPath presumably checks the cached file against the source — confirm.
  const existing = getCachedPath(ocrCacheFile, absoluteFilePath)
  if (!existing) {
    await generateOcrImage(absoluteFilePath, ocrCacheFile)
    return ocrCacheFile
  }
  return existing
}
/**
* Returns the absolute path to a cached thumbnail JPEG for the given file.
* Generates it on first call (or when the source has been modified).