add ai descriptions and extracted text

This commit is contained in:
Garret Patti
2026-04-12 18:18:59 -04:00
parent 60790a3af1
commit 7e284383b4
13 changed files with 879 additions and 11 deletions

View File

@@ -2,7 +2,7 @@ import fs from 'fs'
import path from 'path'
import type { Library, Tag, TagCategory } from '@/types'
import { getDb } from './db'
import { getAiConfig } from './app-settings'
import { getAiConfig, getPreferredLanguage } from './app-settings'
import { getTags, getCategories, addTagToItem, getActiveCategoryIdsForLibrary, getResolvedTagsForItem } from './tags'
import { getThumbnailPath, getVideoFramePaths } from './thumbnails'
import { findFile } from './media-utils'
@@ -351,3 +351,343 @@ export async function tagSingleItem(itemKey: string): Promise<string[]> {
return validIds
}
// ─── Vision / Chat text helpers ──────────────────────────────────────────────
/**
 * POST one or more images to the `/chat/completions` route and return the
 * model's reply as plain text. The reply content is not parsed as JSON.
 *
 * @param endpoint - API base URL; trailing slashes are removed before the route is appended.
 * @param model - Model identifier forwarded in the request body.
 * @param base64Images - Base64 payloads; each one is sent as a `data:image/jpeg` URL part.
 * @param systemPrompt - Instruction sent as the system message.
 * @returns The first choice's message content, trimmed, or `''` when the response has none.
 * @throws Error when the HTTP status is not OK (the message carries the status and at most
 *   200 characters of the response body). The request is aborted after `REQUEST_TIMEOUT_MS`,
 *   in which case the rejection from `fetch` propagates.
 */
async function callVisionApiText(
  endpoint: string,
  model: string,
  base64Images: string[],
  systemPrompt: string
): Promise<string> {
  const completionsUrl = `${endpoint.replace(/\/+$/, '')}/chat/completions`
  const aborter = new AbortController()
  const abortTimer = setTimeout(() => aborter.abort(), REQUEST_TIMEOUT_MS)

  try {
    // The user turn consists solely of image parts; the instruction lives in the system turn.
    const imageParts = base64Images.map((encoded) => ({
      type: 'image_url',
      image_url: { url: `data:image/jpeg;base64,${encoded}` },
    }))
    const requestBody = JSON.stringify({
      model,
      messages: [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: imageParts },
      ],
      max_tokens: 8192,
      temperature: 0.1,
    })

    const response = await fetch(completionsUrl, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      signal: aborter.signal,
      body: requestBody,
    })

    if (response.ok) {
      const payload = (await response.json()) as {
        choices?: Array<{ message?: { content?: string } }>
      }
      const reply = payload.choices?.[0]?.message?.content
      return reply?.trim() ?? ''
    }

    // A body that cannot be read must not mask the status code.
    const failureBody = await response.text().catch(() => '')
    throw new Error(`LLM API returned ${response.status}: ${failureBody.slice(0, 200)}`)
  } finally {
    // Always release the timer so it cannot fire after the call has settled.
    clearTimeout(abortTimer)
  }
}
/**
 * POST a text-only conversation (system + user message, no images) to the
 * `/chat/completions` route and return the model's reply as plain text.
 *
 * @param endpoint - API base URL; trailing slashes are removed before the route is appended.
 * @param model - Model identifier forwarded in the request body.
 * @param systemPrompt - Instruction sent as the system message.
 * @param userMessage - Text sent as the user message.
 * @returns The first choice's message content, trimmed, or `''` when the response has none.
 * @throws Error when the HTTP status is not OK (the message carries the status and at most
 *   200 characters of the response body). The request is aborted after `REQUEST_TIMEOUT_MS`,
 *   in which case the rejection from `fetch` propagates.
 */
async function callChatApiText(
  endpoint: string,
  model: string,
  systemPrompt: string,
  userMessage: string
): Promise<string> {
  const baseUrl = endpoint.replace(/\/+$/, '')
  const abortHandle = new AbortController()
  const timerId = setTimeout(() => abortHandle.abort(), REQUEST_TIMEOUT_MS)

  try {
    const conversation = [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: userMessage },
    ]
    const httpResponse = await fetch(baseUrl + '/chat/completions', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      signal: abortHandle.signal,
      body: JSON.stringify({
        model,
        messages: conversation,
        max_tokens: 8192,
        temperature: 0.1,
      }),
    })

    if (httpResponse.ok) {
      const decoded = (await httpResponse.json()) as {
        choices?: Array<{ message?: { content?: string } }>
      }
      const answer = decoded.choices?.[0]?.message?.content
      return answer?.trim() ?? ''
    }

    // A body that cannot be read must not mask the status code.
    const rawError = await httpResponse.text().catch(() => '')
    throw new Error(`LLM API returned ${httpResponse.status}: ${rawError.slice(0, 200)}`)
  } finally {
    // Always release the timer so it cannot fire after the call has settled.
    clearTimeout(timerId)
  }
}
// ─── AI description ──────────────────────────────────────────────────────────
/**
 * Generate an AI description for a media item using a vision model.
 *
 * Videos are described from their extracted frames; every other item is
 * described from its thumbnail. A non-empty description is stored in the
 * `ai_description` column and returned.
 *
 * @param itemKey - Item key; the segment before the first `:` is used as the library id.
 * @returns The description text. An empty string means the model returned no
 *   content; in that case the stored description is left untouched so that a
 *   blank reply cannot erase an earlier result.
 * @throws Error with `code: 'NOT_CONFIGURED'` when the AI endpoint or model is unset.
 * @throws Error with `code: 'NOT_FOUND'` when the item or its library does not exist.
 * @throws Error with `code: 'NO_IMAGE'` when no image (or no video frame) can be resolved.
 */
export async function generateItemDescription(itemKey: string): Promise<string> {
  const config = getAiConfig()
  if (!config.endpoint || !config.model) {
    throw Object.assign(new Error('AI endpoint and model are not configured'), { code: 'NOT_CONFIGURED' })
  }

  const libraryId = itemKey.split(':')[0]
  const db = getDb()
  const item = db
    .prepare('SELECT item_key, item_type, file_path, metadata FROM media_items WHERE item_key = ?')
    .get(itemKey) as MediaItemRow | undefined
  if (!item) {
    throw Object.assign(new Error(`Item not found: ${itemKey}`), { code: 'NOT_FOUND' })
  }

  const library = getLibrary(libraryId)
  if (!library) {
    throw Object.assign(new Error(`Library not found: ${libraryId}`), { code: 'NOT_FOUND' })
  }

  const libraryRoot = resolveLibraryRoot(library)
  const resolvedMedia = resolveItemImage(libraryRoot, item)
  if (!resolvedMedia) {
    throw Object.assign(new Error('No image available for this item'), { code: 'NO_IMAGE' })
  }

  // Read the image data without blocking the event loop.
  let base64Images: string[]
  if (resolvedMedia.mediaType === 'video') {
    const framePaths = await getVideoFramePaths(resolvedMedia.path, libraryId, VIDEO_FRAME_PERCENTAGES)
    base64Images = await Promise.all(framePaths.map((p) => fs.promises.readFile(p, 'base64')))
  } else {
    const thumbnailPath = await getThumbnailPath(resolvedMedia.path, libraryId, 'image')
    base64Images = [await fs.promises.readFile(thumbnailPath, 'base64')]
  }

  // Without at least one image the request would carry an empty user message.
  if (base64Images.length === 0) {
    throw Object.assign(new Error('No image available for this item'), { code: 'NO_IMAGE' })
  }

  const systemPrompt = 'You are a media cataloging assistant. Describe the given image briefly and objectively in 1-3 sentences. Focus on the visual content, subjects, setting, and mood. Do not speculate about context outside the image. Do not preface the description with any phrases like "This image shows" or "This image features". Return only the description text with no additional commentary.'
  const description = await callVisionApiText(config.endpoint, config.model, base64Images, systemPrompt)

  // Persist only real content; a blank reply must not overwrite an existing description.
  if (description) {
    db.prepare('UPDATE media_items SET ai_description = ? WHERE item_key = ?').run(description, itemKey)
  }
  return description
}
// ─── Text extraction ─────────────────────────────────────────────────────────
/**
 * Extract text (OCR) from an image using the vision model.
 *
 * Only `mixed_file` items in `mixed` libraries that resolve to an image are
 * accepted. The extracted text is stored in `extracted_text`. When a preferred
 * language is set, a translation is attempted and stored in
 * `extracted_text_translated`; translation failures are logged and do not fail
 * the extraction.
 *
 * Any previously stored translation is cleared whenever new text is written,
 * so a translation of older text can never remain attached to the new text.
 *
 * @param itemKey - Item key; the segment before the first `:` is used as the library id.
 * @returns `{ extractedText, translatedText }`. `extractedText` is `''` when the
 *   model reports no text; `translatedText` is `null` when no translation was produced.
 * @throws Error with `code: 'NOT_CONFIGURED'` when the AI endpoint or model is unset.
 * @throws Error with `code: 'NOT_FOUND'` when the item or its library does not exist.
 * @throws Error with `code: 'INVALID_TYPE'` for non-mixed items or libraries.
 * @throws Error with `code: 'NO_IMAGE'` when the item does not resolve to an image.
 */
export async function extractItemText(itemKey: string): Promise<{ extractedText: string; translatedText: string | null }> {
  const config = getAiConfig()
  if (!config.endpoint || !config.model) {
    throw Object.assign(new Error('AI endpoint and model are not configured'), { code: 'NOT_CONFIGURED' })
  }

  const libraryId = itemKey.split(':')[0]
  const db = getDb()
  const item = db
    .prepare('SELECT item_key, item_type, file_path, metadata FROM media_items WHERE item_key = ?')
    .get(itemKey) as MediaItemRow | undefined
  if (!item) {
    throw Object.assign(new Error(`Item not found: ${itemKey}`), { code: 'NOT_FOUND' })
  }
  if (item.item_type !== 'mixed_file') {
    throw Object.assign(new Error('Text extraction is only available for mixed library items'), { code: 'INVALID_TYPE' })
  }

  const library = getLibrary(libraryId)
  if (!library) {
    throw Object.assign(new Error(`Library not found: ${libraryId}`), { code: 'NOT_FOUND' })
  }
  if (library.type !== 'mixed') {
    throw Object.assign(new Error('Text extraction is only available for mixed libraries'), { code: 'INVALID_TYPE' })
  }

  const libraryRoot = resolveLibraryRoot(library)
  const resolvedMedia = resolveItemImage(libraryRoot, item)
  if (!resolvedMedia || resolvedMedia.mediaType !== 'image') {
    throw Object.assign(new Error('Text extraction is only available for images'), { code: 'NO_IMAGE' })
  }

  const thumbnailPath = await getThumbnailPath(resolvedMedia.path, libraryId, 'image')
  // Read the thumbnail without blocking the event loop.
  const base64Images = [await fs.promises.readFile(thumbnailPath, 'base64')]

  const systemPrompt = 'You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting. Be mindful of different colors of text that may indicate different speakers or emphasis. If there is no text in the image, respond with exactly: [NO TEXT]'
  const extractedText = await callVisionApiText(config.endpoint, config.model, base64Images, systemPrompt)

  if (!extractedText || extractedText === '[NO TEXT]') {
    db.prepare('UPDATE media_items SET extracted_text = NULL, extracted_text_translated = NULL WHERE item_key = ?').run(itemKey)
    return { extractedText: '', translatedText: null }
  }

  // Store the new text and drop any translation that belonged to the previous
  // text; it is re-populated below only if a fresh translation succeeds.
  db.prepare('UPDATE media_items SET extracted_text = ?, extracted_text_translated = NULL WHERE item_key = ?').run(extractedText, itemKey)

  // Auto-translate if preferred language is set
  const preferredLanguage = getPreferredLanguage()
  let translatedText: string | null = null
  if (preferredLanguage) {
    try {
      translatedText = await translateText(config.endpoint, config.model, extractedText, preferredLanguage)
      if (translatedText) {
        db.prepare('UPDATE media_items SET extracted_text_translated = ? WHERE item_key = ?').run(translatedText, itemKey)
      }
    } catch (err) {
      // Best effort: the extraction itself succeeded, so only report the failure.
      console.warn(`[ai-tagger] Translation failed for "${itemKey}":`, err instanceof Error ? err.message : err)
    }
  }

  return { extractedText, translatedText }
}
/**
 * Translate an item's stored `extracted_text` into the preferred language and
 * save a non-empty result in `extracted_text_translated`.
 *
 * @param itemKey - Key of the media item whose extracted text should be translated.
 * @returns The translation, or `null` when the item has no extracted text, no
 *   preferred language is set, or `translateText` yields nothing.
 * @throws Error with `code: 'NOT_CONFIGURED'` when the AI endpoint or model is unset.
 * @throws Error with `code: 'NOT_FOUND'` when no row exists for `itemKey`.
 */
export async function translateItemText(itemKey: string): Promise<string | null> {
  const aiConfig = getAiConfig()
  const isConfigured = Boolean(aiConfig.endpoint && aiConfig.model)
  if (!isConfigured) {
    throw Object.assign(new Error('AI endpoint and model are not configured'), { code: 'NOT_CONFIGURED' })
  }

  const database = getDb()
  const selectText = database.prepare('SELECT extracted_text FROM media_items WHERE item_key = ?')
  const record = selectText.get(itemKey) as { extracted_text: string | null } | undefined
  if (record === undefined || record === null) {
    throw Object.assign(new Error(`Item not found: ${itemKey}`), { code: 'NOT_FOUND' })
  }

  // Nothing to translate.
  const sourceText = record.extracted_text
  if (!sourceText) return null

  // No target language configured.
  const targetLanguage = getPreferredLanguage()
  if (!targetLanguage) {
    return null
  }

  const translation = await translateText(aiConfig.endpoint, aiConfig.model, sourceText, targetLanguage)
  if (!translation) {
    return translation
  }

  database
    .prepare('UPDATE media_items SET extracted_text_translated = ? WHERE item_key = ?')
    .run(translation, itemKey)
  return translation
}
/**
 * Translate text to a target language using the chat API.
 *
 * The model is asked to answer with the sentinel `[ALREADY_TARGET_LANGUAGE]`
 * when no translation is needed. Model replies are not always verbatim, so the
 * sentinel is recognised case-insensitively and even when wrapped in quotes or
 * markdown or followed by extra punctuation or commentary. A reply that merely
 * repeats the input is treated the same way.
 *
 * @param endpoint - API base URL passed through to `callChatApiText`.
 * @param model - Model identifier passed through to `callChatApiText`.
 * @param text - Text to translate.
 * @param targetLanguage - Language name interpolated into the system prompt.
 * @returns The translation, or `null` when the reply is empty, signals that the
 *   text is already in the target language, or is identical to the input.
 */
async function translateText(
  endpoint: string,
  model: string,
  text: string,
  targetLanguage: string
): Promise<string | null> {
  const systemPrompt = `You are a translator. Determine if the following text is already in ${targetLanguage}. If it is, respond with exactly: [ALREADY_TARGET_LANGUAGE]. If it is not, translate it to ${targetLanguage}. Return ONLY the translated text with no additional commentary.`
  const result = await callChatApiText(endpoint, model, systemPrompt, text)
  if (!result) {
    return null
  }

  // Strip leading whitespace/quotes/markdown so a decorated sentinel such as
  // `"[ALREADY_TARGET_LANGUAGE]."` is not mistaken for a translation.
  const undecorated = result.replace(/^[\s"'`*]+/, '').toUpperCase()
  if (undecorated.startsWith('[ALREADY_TARGET_LANGUAGE]')) {
    return null
  }

  // An unchanged echo of the input carries no translation.
  if (result.trim() === text.trim()) {
    return null
  }

  return result
}
/**
 * Extract text from all images in a directory within a mixed library.
 *
 * Items are selected by item-key prefix
 * (`<libraryId>:mixed_file:<encodeURIComponent(dirPath + '/')>`), then filtered
 * to files whose extension is in `IMAGE_EXTENSIONS`. Each image is processed
 * sequentially through `extractItemText`; per-item failures are logged and
 * skipped.
 *
 * @param libraryId - Id of the library to scan; it must be of type `mixed`.
 * @param dirPath - Directory path relative to the library; an empty value selects
 *   every `mixed_file` item of the library. Trailing slashes are ignored.
 * @returns The number of items whose extraction completed without throwing.
 * @throws Error with `code: 'NOT_CONFIGURED'` when the AI endpoint or model is unset.
 * @throws Error with `code: 'NOT_FOUND'` when the library does not exist.
 * @throws Error with `code: 'INVALID_TYPE'` when the library is not a mixed library.
 */
export async function extractDirectoryText(libraryId: string, dirPath: string): Promise<number> {
  const config = getAiConfig()
  if (!config.endpoint || !config.model) {
    throw Object.assign(new Error('AI endpoint and model are not configured'), { code: 'NOT_CONFIGURED' })
  }

  const library = getLibrary(libraryId)
  if (!library) {
    throw Object.assign(new Error(`Library not found: ${libraryId}`), { code: 'NOT_FOUND' })
  }
  if (library.type !== 'mixed') {
    throw Object.assign(new Error('Text extraction is only available for mixed libraries'), { code: 'INVALID_TYPE' })
  }

  const db = getDb()
  // Ignore trailing slashes so "photos/" and "photos" select the same directory.
  const normalizedDir = dirPath ? dirPath.replace(/\/+$/, '') : ''
  const prefix = normalizedDir
    ? `${libraryId}:mixed_file:${encodeURIComponent(normalizedDir + '/')}`
    : `${libraryId}:mixed_file:`

  const candidates = db
    .prepare('SELECT item_key, item_type, file_path, metadata FROM media_items WHERE item_key LIKE ? AND item_type = ?')
    .all(`${prefix}%`, 'mixed_file') as MediaItemRow[]

  // LIKE is only a coarse pre-filter: the URI-encoded prefix contains '%' (and
  // possibly '_'), which LIKE treats as wildcards, and LIKE ignores ASCII case.
  // Without this exact check a request for "a" would also pick up "ab/…".
  const items = candidates.filter((item) => item.item_key.startsWith(prefix))

  // NOTE(review): the resolved root is not used here. The call is kept only in
  // case it validates the library path by throwing — confirm, and drop it if pure.
  resolveLibraryRoot(library)

  let processed = 0
  for (const item of items) {
    // Only process images
    if (!item.file_path) continue
    const ext = path.extname(item.file_path).toLowerCase()
    if (!IMAGE_EXTENSIONS.has(ext)) continue

    try {
      await extractItemText(item.item_key)
      processed++
    } catch (err) {
      // Best effort: one failing image must not abort the whole directory.
      console.warn(
        `[ai-tagger] Failed to extract text from "${item.item_key}":`,
        err instanceof Error ? err.message : err
      )
    }
  }

  return processed
}
/**
 * Read the AI-generated columns of a media item.
 *
 * @param itemKey - Key of the media item to look up.
 * @returns The item's `ai_description`, `extracted_text` and
 *   `extracted_text_translated` values under camelCase names; all three are
 *   `null` when no row exists for `itemKey`.
 */
export function getAiFields(itemKey: string): { aiDescription: string | null; extractedText: string | null; extractedTextTranslated: string | null } {
  type AiColumns = {
    ai_description: string | null
    extracted_text: string | null
    extracted_text_translated: string | null
  }

  const lookup = getDb().prepare(
    'SELECT ai_description, extracted_text, extracted_text_translated FROM media_items WHERE item_key = ?'
  )
  const found = lookup.get(itemKey) as AiColumns | undefined

  // Unknown item: report every field as absent rather than throwing.
  return {
    aiDescription: found ? found.ai_description : null,
    extractedText: found ? found.extracted_text : null,
    extractedTextTranslated: found ? found.extracted_text_translated : null,
  }
}

View File

@@ -57,3 +57,11 @@ export function updateAiConfig(endpoint: string, model: string, enabled: boolean
setSetting('ai_model', model)
setSetting('ai_enabled', enabled ? 'true' : 'false')
}
/**
 * Read the `preferred_language` setting.
 *
 * @returns The stored value, or `'English'` when `getSetting` yields `null` or
 *   `undefined`. An empty stored string is returned unchanged.
 */
export function getPreferredLanguage(): string {
  const stored = getSetting('preferred_language')
  if (stored == null) {
    return 'English'
  }
  return stored
}
/**
 * Store the `preferred_language` setting.
 *
 * @param language - Value to store; it is passed to `setSetting` unchanged.
 */
export function setPreferredLanguage(language: string): void {
  const settingKey = 'preferred_language'
  setSetting(settingKey, language)
}

View File

@@ -103,6 +103,7 @@ function initDb(db: Database.Database): void {
migrateMediaItemsFingerprint(db)
migrateMediaTagsToItemKey(db)
migrateMediaItemsAiTagged(db)
migrateMediaItemsAiFields(db)
seedAppSettings(db)
}
@@ -114,6 +115,7 @@ function seedAppSettings(db: Database.Database): void {
ai_enabled: 'false',
ai_endpoint: '',
ai_model: '',
preferred_language: 'English',
}
const insert = db.prepare(
'INSERT OR IGNORE INTO app_settings (key, value) VALUES (?, ?)'
@@ -241,6 +243,22 @@ function migrateMediaItemsAiTagged(db: Database.Database): void {
}
}
/**
 * Add the AI columns (`ai_description`, `extracted_text`,
 * `extracted_text_translated`) to `media_items` when they are missing.
 *
 * Presence is decided from the table's SQL text in `sqlite_master`, read once
 * before any column is added. Column names are matched as whole identifiers:
 * a plain substring test would treat `extracted_text` as present whenever
 * `extracted_text_translated` exists.
 *
 * @param db - Open database handle; nothing happens when `media_items` does not exist.
 */
function migrateMediaItemsAiFields(db: Database.Database): void {
  const row = db
    .prepare("SELECT sql FROM sqlite_master WHERE type='table' AND name='media_items'")
    .get() as { sql: string } | undefined
  if (!row) return

  const tableSql = row.sql
  // '_' counts as a word character, so \b does not match inside a longer column name.
  const hasColumn = (column: string): boolean => new RegExp(`\\b${column}\\b`).test(tableSql)

  for (const column of ['ai_description', 'extracted_text', 'extracted_text_translated']) {
    if (!hasColumn(column)) {
      db.exec(`ALTER TABLE media_items ADD COLUMN ${column} TEXT`)
    }
  }
}
function migrateLibrariesType(db: Database.Database): void {
const row = db
.prepare("SELECT sql FROM sqlite_master WHERE type='table' AND name='libraries'")