handle video tagging

This commit is contained in:
Garret Patti
2026-04-12 17:24:39 -04:00
parent ad9920a448
commit 6c769b457f
6 changed files with 178 additions and 52 deletions

View File

@@ -4,7 +4,7 @@ import type { Library, Tag, TagCategory } from '@/types'
import { getDb } from './db'
import { getAiConfig } from './app-settings'
import { getTags, getCategories, addTagToItem, getActiveCategoryIdsForLibrary, getResolvedTagsForItem } from './tags'
import { getThumbnailPath } from './thumbnails'
import { getThumbnailPath, getVideoFramePaths } from './thumbnails'
import { findFile } from './media-utils'
import { getLibrary, resolveLibraryRoot } from './libraries'
@@ -13,6 +13,14 @@ const REQUEST_TIMEOUT_MS = 30_000
// NOTE(review): the use site is outside this view — presumably the number of back-to-back
// failures tolerated before a tagging run gives up; confirm against runAiTagging.
const MAX_CONSECUTIVE_FAILURES = 3
// File extensions (lower-case, leading dot) that are treated as still images when resolving
// the media for a mixed-file item.
const IMAGE_EXTENSIONS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.tif'])
// File extensions (lower-case, leading dot) that are treated as videos when resolving the
// media for a mixed-file item.
const VIDEO_EXTENSIONS = new Set(['.mp4', '.mkv', '.avi', '.mov', '.wmv', '.m4v', '.webm', '.flv', '.ts', '.mpg', '.mpeg'])
// Fractions of a video's duration at which frames are extracted; each one is multiplied by
// the duration to get a seek offset, and the resulting frames are sent together for tagging.
const VIDEO_FRAME_PERCENTAGES = [0.10, 0.25, 0.50, 0.75, 0.90]
/** The file chosen to represent a media item for tagging, plus how it has to be processed. */
interface ResolvedMedia {
/** Path to the chosen file, built by joining the library root with the item's relative path. */
path: string
/** 'image' when the file is sent as a single thumbnail; 'video' when frames are extracted from it first. */
mediaType: 'image' | 'video'
}
interface MediaItemRow {
item_key: string
@@ -22,10 +30,10 @@ interface MediaItemRow {
}
/**
* Resolve the absolute path to the best image for a media item.
* Returns null if no suitable image is found.
* Resolve the absolute path to the best image (or video) for a media item.
* Returns null if no suitable media is found.
*/
function resolveItemImage(libraryRoot: string, item: MediaItemRow): string | null {
function resolveItemImage(libraryRoot: string, item: MediaItemRow): ResolvedMedia | null {
switch (item.item_type) {
case 'movie':
case 'tv_series': {
@@ -40,7 +48,7 @@ function resolveItemImage(libraryRoot: string, item: MediaItemRow): string | nul
)
if (!relPath) return null
const absPath = path.join(libraryRoot, relPath)
if (fs.existsSync(absPath)) return absPath
if (fs.existsSync(absPath)) return { path: absPath, mediaType: 'image' }
} catch {
return null
}
@@ -58,7 +66,7 @@ function resolveItemImage(libraryRoot: string, item: MediaItemRow): string | nul
)
if (!relPath) return null
const absPath = path.join(libraryRoot, relPath)
if (fs.existsSync(absPath)) return absPath
if (fs.existsSync(absPath)) return { path: absPath, mediaType: 'image' }
} catch {
return null
}
@@ -70,16 +78,16 @@ function resolveItemImage(libraryRoot: string, item: MediaItemRow): string | nul
if (!item.file_path) return null
const seasonDir = path.join(libraryRoot, item.file_path)
const posterFile = findFile(seasonDir, /^(poster|cover|folder)$/i)
if (posterFile) return path.join(seasonDir, posterFile)
if (posterFile) return { path: path.join(seasonDir, posterFile), mediaType: 'image' }
return null
}
case 'mixed_file': {
// For mixed files, tag only actual images (not videos or other files)
if (!item.file_path) return null
const ext = path.extname(item.file_path).toLowerCase()
if (!IMAGE_EXTENSIONS.has(ext)) return null
return path.join(libraryRoot, item.file_path)
if (IMAGE_EXTENSIONS.has(ext)) return { path: path.join(libraryRoot, item.file_path), mediaType: 'image' }
if (VIDEO_EXTENSIONS.has(ext)) return { path: path.join(libraryRoot, item.file_path), mediaType: 'video' }
return null
}
default:
@@ -90,9 +98,9 @@ function resolveItemImage(libraryRoot: string, item: MediaItemRow): string | nul
/**
* Build the system prompt that instructs the LLM to select matching tags.
* If currentTags are provided they are included as context to help the model
* understand the image before selecting additional tags.
* understand the content before selecting additional tags.
*/
function buildTagPrompt(tags: Tag[], categories: TagCategory[], currentTags?: Tag[]): string {
function buildTagPrompt(tags: Tag[], categories: TagCategory[], currentTags?: Tag[], mediaContext: 'image' | 'video' = 'image'): string {
const categoryMap = new Map(categories.map((c) => [c.id, c.name]))
const grouped: Record<string, { id: string; name: string }[]> = {}
@@ -107,17 +115,20 @@ function buildTagPrompt(tags: Tag[], categories: TagCategory[], currentTags?: Ta
lines.push(`[${catName}] ${tagList}`)
}
const isVideo = mediaContext === 'video'
const contentWord = isVideo ? 'video frames' : 'image'
const parts: string[] = [
'You are an image tagger. Given the image, select which of the following tags apply.',
'Return ONLY a JSON array of tag IDs that match the image (e.g., ["tag-apple", "tag-orange"]). Do not invent new tags. Do not return any text other than what is inside the JSON array.',
`You are a media tagger. Given the ${contentWord}, select which of the following tags apply.`,
'Return ONLY a JSON array of tag IDs that match (e.g., ["tag-apple", "tag-orange"]). Do not invent new tags. Do not return any text other than what is inside the JSON array.',
'If no tags match, return an empty array (e.i., [])',
]
if (currentTags && currentTags.length > 0) {
const currentTagNames = currentTags.map((t) => t.name).join(', ')
parts.push('')
parts.push(`This image already has the following tags applied: ${currentTagNames}`)
parts.push('Use these as context to better understand the image when selecting tags.')
parts.push(`This content already has the following tags applied: ${currentTagNames}`)
parts.push('Use these as context to better understand the content when selecting tags.')
}
parts.push('')
@@ -128,12 +139,12 @@ function buildTagPrompt(tags: Tag[], categories: TagCategory[], currentTags?: Ta
}
/**
* Call the OpenAI-compatible vision API to get tag suggestions for an image.
* Call the OpenAI-compatible vision API to get tag suggestions for one or more images.
*/
async function callVisionApi(
endpoint: string,
model: string,
base64Image: string,
base64Images: string[],
systemPrompt: string
): Promise<string[]> {
const url = endpoint.replace(/\/+$/, '') + '/chat/completions'
@@ -152,12 +163,10 @@ async function callVisionApi(
{ role: 'system', content: systemPrompt },
{
role: 'user',
content: [
{
type: 'image_url',
image_url: { url: `data:image/jpeg;base64,${base64Image}` },
},
],
content: base64Images.map((b64) => ({
type: 'image_url',
image_url: { url: `data:image/jpeg;base64,${b64}` },
})),
},
],
max_tokens: 8192,
@@ -230,22 +239,27 @@ export async function runAiTagging(library: Library, libraryRoot: string): Promi
break
}
const imagePath = resolveItemImage(libraryRoot, item)
if (!imagePath) {
// No image available — mark as tagged so we don't retry every scan
const resolvedMedia = resolveItemImage(libraryRoot, item)
if (!resolvedMedia) {
// No image or video available — mark as tagged so we don't retry every scan
markTagged.run(Date.now(), item.item_key)
continue
}
try {
// Use the thumbnail cache for a smaller image
const thumbnailPath = await getThumbnailPath(imagePath, library.id, 'image')
const base64 = fs.readFileSync(thumbnailPath, 'base64')
let base64Images: string[]
if (resolvedMedia.mediaType === 'video') {
const framePaths = await getVideoFramePaths(resolvedMedia.path, library.id, VIDEO_FRAME_PERCENTAGES)
base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64'))
} else {
const thumbnailPath = await getThumbnailPath(resolvedMedia.path, library.id, 'image')
base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
}
const { tags: currentItemTags } = getResolvedTagsForItem(item.item_key)
const systemPrompt = buildTagPrompt(tags, categories, currentItemTags)
const systemPrompt = buildTagPrompt(tags, categories, currentItemTags, resolvedMedia.mediaType)
const suggestedIds = await callVisionApi(config.endpoint, config.model, base64, systemPrompt)
const suggestedIds = await callVisionApi(config.endpoint, config.model, base64Images, systemPrompt)
// Filter to valid tags only
const validIds = suggestedIds.filter((id) => validTagIds.has(id))
@@ -314,13 +328,19 @@ export async function tagSingleItem(itemKey: string): Promise<string[]> {
throw Object.assign(new Error('No image available for this item'), { code: 'NO_IMAGE' })
}
const thumbnailPath = await getThumbnailPath(imagePath, libraryId, 'image')
const base64 = fs.readFileSync(thumbnailPath, 'base64')
let base64Images: string[]
if (imagePath.mediaType === 'video') {
const framePaths = await getVideoFramePaths(imagePath.path, libraryId, VIDEO_FRAME_PERCENTAGES)
base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64'))
} else {
const thumbnailPath = await getThumbnailPath(imagePath.path, libraryId, 'image')
base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
}
const { tags: currentItemTags } = getResolvedTagsForItem(itemKey)
const systemPromptWithContext = buildTagPrompt(tags, categories, currentItemTags)
const systemPromptWithContext = buildTagPrompt(tags, categories, currentItemTags, imagePath.mediaType)
const suggestedIds = await callVisionApi(config.endpoint, config.model, base64, systemPromptWithContext)
const suggestedIds = await callVisionApi(config.endpoint, config.model, base64Images, systemPromptWithContext)
const validIds = suggestedIds.filter((id) => validTagIds.has(id))
for (const tagId of validIds) {

View File

@@ -87,22 +87,13 @@ async function getVideoDuration(src: string): Promise<number> {
})
}
/** Generate a thumbnail from a video using ffmpeg. */
async function generateVideoThumbnail(src: string, dest: string): Promise<void> {
/** Extract a single frame from a video at the given offset (seconds) and write to dest. */
async function generateVideoFrameAtOffset(src: string, dest: string, offsetSeconds: number): Promise<void> {
const tmp = dest + '.tmp'
// Seek to 10% of the video duration for a representative frame
let offset = 0
try {
const duration = await getVideoDuration(src)
offset = Math.max(0, duration * 0.1)
} catch {
// If ffprobe fails, fall back to seeking to 0
}
const args = [
'-y', // overwrite output
'-ss', String(offset), // seek before input (fast)
'-ss', String(offsetSeconds), // seek before input (fast)
'-i', src,
'-frames:v', '1',
'-q:v', '5',
@@ -115,6 +106,58 @@ async function generateVideoThumbnail(src: string, dest: string): Promise<void>
fs.renameSync(tmp, dest)
}
/**
 * Generate a thumbnail from a video using ffmpeg.
 *
 * The frame is taken at 10% of the video's duration. When the duration cannot be
 * determined — the probe rejects, or it yields a non-finite or non-positive value —
 * the frame is taken from the very start of the video instead.
 *
 * @param src - Path of the source video.
 * @param dest - Path the extracted frame is written to.
 */
async function generateVideoThumbnail(src: string, dest: string): Promise<void> {
  let offset = 0
  try {
    const duration = await getVideoDuration(src)
    // Math.max(0, NaN) is NaN, so a bad probe result would reach ffmpeg as `-ss NaN`.
    // Only a finite, positive duration is allowed to move the offset away from 0.
    if (Number.isFinite(duration) && duration > 0) {
      offset = duration * 0.1
    }
  } catch {
    // If ffprobe fails, fall back to seeking to 0
  }
  await generateVideoFrameAtOffset(src, dest, offset)
}
/**
 * Extract frames from a video at each given percentage of its duration.
 *
 * Each frame has its own cache file, keyed by library id, source path and percentage, so
 * frames are cached independently of one another. A frame is only extracted when
 * `getCachedPath` reports no usable cached copy (NOTE(review): its exact freshness rule is
 * defined elsewhere — confirm).
 *
 * When the duration cannot be determined — the probe rejects, or it yields a non-finite or
 * non-positive value — every frame is taken from position 0.
 *
 * @param absoluteFilePath - Path of the source video.
 * @param libraryId - Id of the owning library; part of the cache key.
 * @param percentages - Fractions of the duration to sample (e.g. 0.25 for 25%).
 * @returns Paths to the cached frame JPEGs, in the same order as `percentages`.
 */
export async function getVideoFramePaths(
  absoluteFilePath: string,
  libraryId: string,
  percentages: number[]
): Promise<string[]> {
  ensureCacheDir()

  let duration = 0
  try {
    const probed = await getVideoDuration(absoluteFilePath)
    // Ignore NaN / Infinity / negative results so they cannot leak into the seek offset.
    if (Number.isFinite(probed) && probed > 0) {
      duration = probed
    }
  } catch {
    // Fall back to 0; all frames will seek to position 0
  }

  const framePaths: string[] = []
  // Frames are extracted one at a time rather than spawning several ffmpeg runs at once.
  for (const pct of percentages) {
    // Math.max(0, NaN) is NaN, so guard explicitly: anything that is not a finite,
    // positive offset (including a NaN percentage) seeks to the start.
    const scaled = duration * pct
    const offset = Number.isFinite(scaled) && scaled > 0 ? scaled : 0

    // The key is derived from the raw percentage, not the computed offset, so it stays
    // stable even when the duration probe fails on one call and succeeds on another.
    const key = crypto
      .createHash('sha1')
      .update(libraryId + ':' + absoluteFilePath + ':' + pct)
      .digest('hex')
    const cacheFile = path.join(CACHE_DIR, key + '.jpg')

    const cached = getCachedPath(cacheFile, absoluteFilePath)
    if (!cached) {
      await generateVideoFrameAtOffset(absoluteFilePath, cacheFile, offset)
    }
    framePaths.push(cacheFile)
  }

  return framePaths
}
/**
* Returns the absolute path to a cached thumbnail JPEG for the given file.
* Generates it on first call (or when the source has been modified).