send higher resolution images to AI vision endpoints

Add getAiImagePath() to thumbnails.ts (at most 1920px wide, JPEG quality 90,
no upscaling), cached separately from display thumbnails via an _ai suffix.
Swap all four image-to-AI code paths in ai-tagger.ts (extract text,
describe, library-wide tagging, and single-item tagging) to use the new
high-res image instead of the 400px display thumbnail, improving OCR
accuracy on dense text.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Garret Patti
2026-04-13 09:08:43 -04:00
parent 5ba73b2e56
commit cd9a83ea90
2 changed files with 37 additions and 5 deletions

View File

@@ -4,7 +4,7 @@ import type { Library, Tag, TagCategory } from '@/types'
import { getDb } from './db' import { getDb } from './db'
import { getAiConfig, getEffectiveAiConfig, getPreferredLanguage } from './app-settings' import { getAiConfig, getEffectiveAiConfig, getPreferredLanguage } from './app-settings'
import { getTags, getCategories, addTagToItem, getActiveCategoryIdsForLibrary, getResolvedTagsForItem } from './tags' import { getTags, getCategories, addTagToItem, getActiveCategoryIdsForLibrary, getResolvedTagsForItem } from './tags'
import { getThumbnailPath, getVideoFramePaths } from './thumbnails' import { getAiImagePath, getVideoFramePaths } from './thumbnails'
import { findFile } from './media-utils' import { findFile } from './media-utils'
import { getLibrary, resolveLibraryRoot } from './libraries' import { getLibrary, resolveLibraryRoot } from './libraries'
@@ -279,7 +279,7 @@ export async function runAiTagging(library: Library, libraryRoot: string): Promi
const framePaths = await getVideoFramePaths(resolvedMedia.path, library.id, VIDEO_FRAME_PERCENTAGES) const framePaths = await getVideoFramePaths(resolvedMedia.path, library.id, VIDEO_FRAME_PERCENTAGES)
base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64')) base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64'))
} else { } else {
const thumbnailPath = await getThumbnailPath(resolvedMedia.path, library.id, 'image') const thumbnailPath = await getAiImagePath(resolvedMedia.path, library.id)
base64Images = [fs.readFileSync(thumbnailPath, 'base64')] base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
} }
@@ -367,7 +367,7 @@ export async function tagSingleItem(itemKey: string): Promise<string[]> {
const framePaths = await getVideoFramePaths(imagePath.path, libraryId, VIDEO_FRAME_PERCENTAGES) const framePaths = await getVideoFramePaths(imagePath.path, libraryId, VIDEO_FRAME_PERCENTAGES)
base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64')) base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64'))
} else { } else {
const thumbnailPath = await getThumbnailPath(imagePath.path, libraryId, 'image') const thumbnailPath = await getAiImagePath(imagePath.path, libraryId)
base64Images = [fs.readFileSync(thumbnailPath, 'base64')] base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
} }
@@ -529,7 +529,7 @@ export async function generateItemDescription(itemKey: string): Promise<string>
const framePaths = await getVideoFramePaths(resolvedMedia.path, libraryId, VIDEO_FRAME_PERCENTAGES) const framePaths = await getVideoFramePaths(resolvedMedia.path, libraryId, VIDEO_FRAME_PERCENTAGES)
base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64')) base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64'))
} else { } else {
const thumbnailPath = await getThumbnailPath(resolvedMedia.path, libraryId, 'image') const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId)
base64Images = [fs.readFileSync(thumbnailPath, 'base64')] base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
} }
@@ -587,7 +587,7 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText:
throw Object.assign(new Error('Text extraction is only available for images'), { code: 'NO_IMAGE' }) throw Object.assign(new Error('Text extraction is only available for images'), { code: 'NO_IMAGE' })
} }
const thumbnailPath = await getThumbnailPath(resolvedMedia.path, libraryId, 'image') const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId)
const base64Images = [fs.readFileSync(thumbnailPath, 'base64')] const base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
const systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${config.promptExtract ? ' ' + config.promptExtract : ''} If there is no text in the image, respond with exactly: [NO TEXT]` const systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${config.promptExtract ? ' ' + config.promptExtract : ''} If there is no text in the image, respond with exactly: [NO TEXT]`

View File

@@ -7,6 +7,8 @@ import sharp from 'sharp'
const CACHE_DIR = path.resolve(process.cwd(), '.thumbnails') const CACHE_DIR = path.resolve(process.cwd(), '.thumbnails')
const THUMBNAIL_WIDTH = 400 const THUMBNAIL_WIDTH = 400
const JPEG_QUALITY = 75 const JPEG_QUALITY = 75
/** Target width, in pixels, passed to sharp's resize when rendering images for AI vision. */
const AI_IMAGE_WIDTH = 1920
/** JPEG quality setting used when encoding images for AI vision. */
const AI_JPEG_QUALITY = 90
/** Ensure the cache directory exists. */ /** Ensure the cache directory exists. */
function ensureCacheDir(): void { function ensureCacheDir(): void {
@@ -47,6 +49,17 @@ async function generateImageThumbnail(src: string, dest: string): Promise<void>
fs.renameSync(tmp, dest) fs.renameSync(tmp, dest)
} }
/**
 * Generate a high-resolution JPEG for AI vision use.
 *
 * The image is auto-oriented from its EXIF Orientation tag, scaled down to at
 * most AI_IMAGE_WIDTH pixels wide (images smaller than that are not upscaled —
 * they are converted at their native size) and encoded at AI_JPEG_QUALITY.
 *
 * Output is written to `dest + '.tmp'` and then renamed onto `dest`, so `dest`
 * is only replaced once encoding has finished. If encoding or the rename
 * fails, the temporary file is removed and the original error is rethrown.
 *
 * @param src  Path of the source image to read.
 * @param dest Path the finished JPEG is written to.
 * @throws Whatever sharp or the rename throws (unreadable/unsupported source, I/O failure).
 */
async function generateAiImage(src: string, dest: string): Promise<void> {
  const tmp = dest + '.tmp'
  try {
    await sharp(src)
      // sharp drops EXIF metadata on output by default, so the orientation must
      // be baked into the pixels; otherwise rotated photos reach the model sideways.
      .rotate()
      .resize(AI_IMAGE_WIDTH, undefined, { withoutEnlargement: true })
      .jpeg({ quality: AI_JPEG_QUALITY })
      .toFile(tmp)
    fs.renameSync(tmp, dest)
  } catch (err) {
    // Do not leave a partial or orphaned temp file behind in the cache directory.
    fs.rmSync(tmp, { force: true })
    throw err
  }
}
/** Run a child process and collect stderr. Resolves on exit code 0, rejects otherwise. */ /** Run a child process and collect stderr. Resolves on exit code 0, rejects otherwise. */
function run(bin: string, args: string[]): Promise<void> { function run(bin: string, args: string[]): Promise<void> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
@@ -158,6 +171,25 @@ export async function getVideoFramePaths(
return framePaths return framePaths
} }
/**
 * Resolve the absolute path of the AI-vision rendition of an image
 * (a JPEG at most 1920px wide, quality 90).
 *
 * The rendition lives in the same cache directory as the display thumbnails
 * but under its own `_ai` suffix, so the two never overwrite each other.
 * A usable cached copy is returned as-is; otherwise the rendition is
 * generated (first call, or when the source file has been modified).
 *
 * @param absoluteFilePath Absolute path of the source image.
 * @param libraryId        Id of the library the image belongs to; part of the cache key.
 * @returns Absolute path of the cached high-resolution JPEG.
 */
export async function getAiImagePath(
  absoluteFilePath: string,
  libraryId: string
): Promise<string> {
  ensureCacheDir()
  const aiCachePath = path.join(CACHE_DIR, `${cacheKey(libraryId, absoluteFilePath)}_ai.jpg`)
  const existing = getCachedPath(aiCachePath, absoluteFilePath)
  if (!existing) {
    // Nothing usable in the cache: render the rendition now.
    await generateAiImage(absoluteFilePath, aiCachePath)
    return aiCachePath
  }
  return existing
}
/** /**
* Returns the absolute path to a cached thumbnail JPEG for the given file. * Returns the absolute path to a cached thumbnail JPEG for the given file.
* Generates it on first call (or when the source has been modified). * Generates it on first call (or when the source has been modified).