handle video tagging

2026-04-12 17:24:39 -04:00
parent ad9920a448
commit 6c769b457f
6 changed files with 178 additions and 52 deletions
--- a/src/components/mixed/ImageLightbox.tsx
+++ b/src/components/mixed/ImageLightbox.tsx
@@ -21,6 +21,7 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item
  )
  const [aiTagging, setAiTagging] = useState(false)
  const [aiTagError, setAiTagError] = useState<string | null>(null)
+  const [tagRefreshKey, setTagRefreshKey] = useState(0)

  useEffect(() => {
    const handleKey = (e: KeyboardEvent) => {
@@ -82,6 +83,7 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item
                setAiTagError(null)
                try {
                  await onAiTag()
+                  setTagRefreshKey((k) => k + 1)
                  onTagsChanged?.()
                } catch (err) {
                  setAiTagError(err instanceof Error ? err.message : 'AI tagging failed')
@@ -165,7 +167,7 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item
            <p className="text-xs font-semibold uppercase tracking-wider mb-3" style={{ color: 'var(--text-secondary)' }}>
              Tags
            </p>
-            <TagSelector itemKey={itemKey!} onTagsChanged={onTagsChanged} />
+            <TagSelector itemKey={itemKey!} onTagsChanged={onTagsChanged} refreshKey={tagRefreshKey} />
          </div>
        </div>
      ) : (
--- a/src/components/mixed/MixedView.tsx
+++ b/src/components/mixed/MixedView.tsx
@@ -378,6 +378,19 @@ export default function MixedView({ libraryId, initialPath }: Props) {
          onClose={() => setModal(null)}
          onPrev={modal.mediaIndex > 0 ? () => navigateModal(-1) : undefined}
          onNext={modal.mediaIndex < mediaEntries.length - 1 ? () => navigateModal(1) : undefined}
+          onAiTag={modal.itemKey ? async () => {
+            const res = await fetch('/api/ai-tagging', {
+              method: 'POST',
+              headers: { 'Content-Type': 'application/json' },
+              body: JSON.stringify({ itemKey: modal.itemKey }),
+            })
+            if (!res.ok) {
+              const data = await res.json().catch(() => ({}))
+              throw new Error((data as { error?: string }).error ?? 'AI tagging failed')
+            }
+            fetchAssignments()
+            setFilterRefreshKey((k) => k + 1)
+          } : undefined}
        />
      )}
      {modal?.type === 'image' && (
--- a/src/components/mixed/VideoPlayerModal.tsx
+++ b/src/components/mixed/VideoPlayerModal.tsx
@@ -12,10 +12,11 @@ interface Props {
  onNext?: () => void
  itemKey?: string
  onTagsChanged?: () => void
+  onAiTag?: () => Promise<void>
  context?: 'mixed' | 'movies' | 'tv'
 }

-export default function VideoPlayerModal({ url, name, onClose, onPrev, onNext, itemKey, onTagsChanged, context = 'mixed' }: Props) {
+export default function VideoPlayerModal({ url, name, onClose, onPrev, onNext, itemKey, onTagsChanged, onAiTag, context = 'mixed' }: Props) {
  const settings = useUserSettings()
  const autoPlay = context === 'mixed' ? settings.mixedAutoplay : context === 'movies' ? settings.moviesAutoplay : settings.tvAutoplay
  const loop     = context === 'mixed' ? settings.mixedLoop     : context === 'movies' ? settings.moviesLoop     : settings.tvLoop
@@ -24,6 +25,9 @@ export default function VideoPlayerModal({ url, name, onClose, onPrev, onNext, i
  const [showTags, setShowTags] = useState(
    () => !!itemKey && typeof window !== 'undefined' && window.innerWidth >= 1280
  )
+  const [aiTagging, setAiTagging] = useState(false)
+  const [aiTagError, setAiTagError] = useState<string | null>(null)
+  const [tagRefreshKey, setTagRefreshKey] = useState(0)

  useEffect(() => {
    const handleKey = (e: KeyboardEvent) => {
@@ -76,6 +80,43 @@ export default function VideoPlayerModal({ url, name, onClose, onPrev, onNext, i
              🏷
            </button>
          )}
+          {onAiTag && (
+            <button
+              onClick={async (e) => {
+                e.stopPropagation()
+                setAiTagging(true)
+                setAiTagError(null)
+                try {
+                  await onAiTag()
+                  setTagRefreshKey((k) => k + 1)
+                  onTagsChanged?.()
+                } catch (err) {
+                  setAiTagError(err instanceof Error ? err.message : 'AI tagging failed')
+                  setTimeout(() => setAiTagError(null), 4000)
+                } finally {
+                  setAiTagging(false)
+                }
+              }}
+              disabled={aiTagging}
+              className="w-8 h-8 rounded-full flex items-center justify-center text-sm transition-colors disabled:opacity-50"
+              style={{
+                backgroundColor: aiTagError ? '#7f1d1d' : 'var(--surface)',
+                color: aiTagError ? '#fca5a5' : 'var(--text-primary)',
+              }}
+              onMouseEnter={(e) => {
+                if (!aiTagging && !aiTagError) (e.currentTarget as HTMLElement).style.backgroundColor = 'var(--surface-hover)'
+              }}
+              onMouseLeave={(e) => {
+                if (!aiTagError) (e.currentTarget as HTMLElement).style.backgroundColor = 'var(--surface)'
+              }}
+              aria-label="AI Tag this video"
+              title={aiTagError ?? (aiTagging ? 'Tagging…' : 'AI Tag')}
+            >
+              {aiTagging ? (
+                <span className="animate-spin" style={{ display: 'inline-block' }}>⟳</span>
+              ) : '✨'}
+            </button>
+          )}
          <button
            onClick={onClose}
            className="w-8 h-8 rounded-full flex items-center justify-center text-sm flex-shrink-0 transition-colors"
@@ -134,7 +175,7 @@ export default function VideoPlayerModal({ url, name, onClose, onPrev, onNext, i
            <p className="text-xs font-semibold uppercase tracking-wider mb-3" style={{ color: 'var(--text-secondary)' }}>
              Tags
            </p>
-            <TagSelector itemKey={itemKey!} onTagsChanged={onTagsChanged} />
+            <TagSelector itemKey={itemKey!} onTagsChanged={onTagsChanged} refreshKey={tagRefreshKey} />
          </div>
        </div>
      ) : (
--- a/src/components/tags/TagSelector.tsx
+++ b/src/components/tags/TagSelector.tsx
@@ -7,6 +7,7 @@ import TagBadge from './TagBadge'
 interface Props {
  itemKey: string
  onTagsChanged?: () => void
+  refreshKey?: number
 }

 interface AllTags {
@@ -14,7 +15,7 @@ interface AllTags {
  tags: Tag[]
 }

-export default function TagSelector({ itemKey, onTagsChanged }: Props) {
+export default function TagSelector({ itemKey, onTagsChanged, refreshKey }: Props) {
  const [assigned, setAssigned] = useState<{ tags: Tag[]; categories: TagCategory[] }>({
    tags: [],
    categories: [],
@@ -58,6 +59,12 @@ export default function TagSelector({ itemKey, onTagsChanged }: Props) {
    Promise.all([fetchAssigned(), fetchAll()]).finally(() => setLoading(false))
  }, [fetchAssigned, fetchAll])

+  useEffect(() => {
+    if (refreshKey !== undefined && refreshKey > 0) {
+      fetchAssigned()
+    }
+  }, [refreshKey, fetchAssigned])
+
  const isAssigned = (tagId: string) => assigned.tags.some((t) => t.id === tagId)

  const toggleTag = async (tag: Tag) => {
--- a/src/lib/ai-tagger.ts
+++ b/src/lib/ai-tagger.ts
@@ -4,7 +4,7 @@ import type { Library, Tag, TagCategory } from '@/types'
 import { getDb } from './db'
 import { getAiConfig } from './app-settings'
 import { getTags, getCategories, addTagToItem, getActiveCategoryIdsForLibrary, getResolvedTagsForItem } from './tags'
-import { getThumbnailPath } from './thumbnails'
+import { getThumbnailPath, getVideoFramePaths } from './thumbnails'
 import { findFile } from './media-utils'
 import { getLibrary, resolveLibraryRoot } from './libraries'

@@ -13,6 +13,14 @@ const REQUEST_TIMEOUT_MS = 30_000
 const MAX_CONSECUTIVE_FAILURES = 3

 const IMAGE_EXTENSIONS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.tif'])
+const VIDEO_EXTENSIONS = new Set(['.mp4', '.mkv', '.avi', '.mov', '.wmv', '.m4v', '.webm', '.flv', '.ts', '.mpg', '.mpeg']) // lower-cased extensions that make a mixed_file item count as a video
+
+const VIDEO_FRAME_PERCENTAGES = [0.10, 0.25, 0.50, 0.75, 0.90] // fractions of the video duration at which frames are sampled for tagging
+
+interface ResolvedMedia { // what resolveItemImage returns when it finds usable media for an item
+  path: string // path of the resolved file, built by joining the library root with the item's relative path
+  mediaType: 'image' | 'video' // 'video' results are turned into frames via getVideoFramePaths before the vision call
+}

 interface MediaItemRow {
  item_key: string
@@ -22,10 +30,10 @@ interface MediaItemRow {
 }

 /**
- * Resolve the absolute path to the best image for a media item.
- * Returns null if no suitable image is found.
+ * Resolve the absolute path to the best image (or video) for a media item.
+ * Returns null if no suitable media is found.
 */
-function resolveItemImage(libraryRoot: string, item: MediaItemRow): string | null {
+function resolveItemImage(libraryRoot: string, item: MediaItemRow): ResolvedMedia | null {
  switch (item.item_type) {
    case 'movie':
    case 'tv_series': {
@@ -40,7 +48,7 @@ function resolveItemImage(libraryRoot: string, item: MediaItemRow): string | nul
        )
        if (!relPath) return null
        const absPath = path.join(libraryRoot, relPath)
-        if (fs.existsSync(absPath)) return absPath
+        if (fs.existsSync(absPath)) return { path: absPath, mediaType: 'image' }
      } catch {
        return null
      }
@@ -58,7 +66,7 @@ function resolveItemImage(libraryRoot: string, item: MediaItemRow): string | nul
        )
        if (!relPath) return null
        const absPath = path.join(libraryRoot, relPath)
-        if (fs.existsSync(absPath)) return absPath
+        if (fs.existsSync(absPath)) return { path: absPath, mediaType: 'image' }
      } catch {
        return null
      }
@@ -70,16 +78,16 @@ function resolveItemImage(libraryRoot: string, item: MediaItemRow): string | nul
      if (!item.file_path) return null
      const seasonDir = path.join(libraryRoot, item.file_path)
      const posterFile = findFile(seasonDir, /^(poster|cover|folder)$/i)
-      if (posterFile) return path.join(seasonDir, posterFile)
+      if (posterFile) return { path: path.join(seasonDir, posterFile), mediaType: 'image' }
      return null
    }

    case 'mixed_file': {
-      // For mixed files, tag only actual images (not videos or other files)
      if (!item.file_path) return null
      const ext = path.extname(item.file_path).toLowerCase()
-      if (!IMAGE_EXTENSIONS.has(ext)) return null
-      return path.join(libraryRoot, item.file_path)
+      if (IMAGE_EXTENSIONS.has(ext)) return { path: path.join(libraryRoot, item.file_path), mediaType: 'image' }
+      if (VIDEO_EXTENSIONS.has(ext)) return { path: path.join(libraryRoot, item.file_path), mediaType: 'video' }
+      return null
    }

    default:
@@ -90,9 +98,9 @@ function resolveItemImage(libraryRoot: string, item: MediaItemRow): string | nul
 /**
 * Build the system prompt that instructs the LLM to select matching tags.
 * If currentTags are provided they are included as context to help the model
- * understand the image before selecting additional tags.
+ * understand the content before selecting additional tags.
 */
-function buildTagPrompt(tags: Tag[], categories: TagCategory[], currentTags?: Tag[]): string {
+function buildTagPrompt(tags: Tag[], categories: TagCategory[], currentTags?: Tag[], mediaContext: 'image' | 'video' = 'image'): string {
  const categoryMap = new Map(categories.map((c) => [c.id, c.name]))

  const grouped: Record<string, { id: string; name: string }[]> = {}
@@ -107,17 +115,20 @@ function buildTagPrompt(tags: Tag[], categories: TagCategory[], currentTags?: Ta
    lines.push(`[${catName}] ${tagList}`)
  }

+  const isVideo = mediaContext === 'video'
+  const contentWord = isVideo ? 'video frames' : 'image'
+
  const parts: string[] = [
-    'You are an image tagger. Given the image, select which of the following tags apply.',
-    'Return ONLY a JSON array of tag IDs that match the image (e.g., ["tag-apple", "tag-orange"]). Do not invent new tags. Do not return any text other than what is inside the JSON array.',
+    `You are a media tagger. Given the ${contentWord}, select which of the following tags apply.`,
+    'Return ONLY a JSON array of tag IDs that match (e.g., ["tag-apple", "tag-orange"]). Do not invent new tags. Do not return any text other than what is inside the JSON array.',
    'If no tags match, return an empty array (e.i., [])',
  ]

  if (currentTags && currentTags.length > 0) {
    const currentTagNames = currentTags.map((t) => t.name).join(', ')
    parts.push('')
-    parts.push(`This image already has the following tags applied: ${currentTagNames}`)
-    parts.push('Use these as context to better understand the image when selecting tags.')
+    parts.push(`This content already has the following tags applied: ${currentTagNames}`)
+    parts.push('Use these as context to better understand the content when selecting tags.')
  }

  parts.push('')
@@ -128,12 +139,12 @@ function buildTagPrompt(tags: Tag[], categories: TagCategory[], currentTags?: Ta
 }

 /**
- * Call the OpenAI-compatible vision API to get tag suggestions for an image.
+ * Call the OpenAI-compatible vision API to get tag suggestions for one or more images.
 */
 async function callVisionApi(
  endpoint: string,
  model: string,
-  base64Image: string,
+  base64Images: string[],
  systemPrompt: string
 ): Promise<string[]> {
  const url = endpoint.replace(/\/+$/, '') + '/chat/completions'
@@ -152,12 +163,10 @@ async function callVisionApi(
          { role: 'system', content: systemPrompt },
          {
            role: 'user',
-            content: [
-              {
-                type: 'image_url',
-                image_url: { url: `data:image/jpeg;base64,${base64Image}` },
-              },
-            ],
+            content: base64Images.map((b64) => ({
+              type: 'image_url',
+              image_url: { url: `data:image/jpeg;base64,${b64}` },
+            })),
          },
        ],
        max_tokens: 8192,
@@ -230,22 +239,27 @@ export async function runAiTagging(library: Library, libraryRoot: string): Promi
      break
    }

-    const imagePath = resolveItemImage(libraryRoot, item)
-    if (!imagePath) {
-      // No image available — mark as tagged so we don't retry every scan
+    const resolvedMedia = resolveItemImage(libraryRoot, item)
+    if (!resolvedMedia) {
+      // No image or video available — mark as tagged so we don't retry every scan
      markTagged.run(Date.now(), item.item_key)
      continue
    }

    try {
-      // Use the thumbnail cache for a smaller image
-      const thumbnailPath = await getThumbnailPath(imagePath, library.id, 'image')
-      const base64 = fs.readFileSync(thumbnailPath, 'base64')
+      let base64Images: string[]
+      if (resolvedMedia.mediaType === 'video') {
+        const framePaths = await getVideoFramePaths(resolvedMedia.path, library.id, VIDEO_FRAME_PERCENTAGES)
+        base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64'))
+      } else {
+        const thumbnailPath = await getThumbnailPath(resolvedMedia.path, library.id, 'image')
+        base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
+      }

      const { tags: currentItemTags } = getResolvedTagsForItem(item.item_key)
-      const systemPrompt = buildTagPrompt(tags, categories, currentItemTags)
+      const systemPrompt = buildTagPrompt(tags, categories, currentItemTags, resolvedMedia.mediaType)

-      const suggestedIds = await callVisionApi(config.endpoint, config.model, base64, systemPrompt)
+      const suggestedIds = await callVisionApi(config.endpoint, config.model, base64Images, systemPrompt)

      // Filter to valid tags only
      const validIds = suggestedIds.filter((id) => validTagIds.has(id))
@@ -314,13 +328,19 @@ export async function tagSingleItem(itemKey: string): Promise<string[]> {
    throw Object.assign(new Error('No image available for this item'), { code: 'NO_IMAGE' })
  }

-  const thumbnailPath = await getThumbnailPath(imagePath, libraryId, 'image')
-  const base64 = fs.readFileSync(thumbnailPath, 'base64')
+  let base64Images: string[]
+  if (imagePath.mediaType === 'video') {
+    const framePaths = await getVideoFramePaths(imagePath.path, libraryId, VIDEO_FRAME_PERCENTAGES)
+    base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64'))
+  } else {
+    const thumbnailPath = await getThumbnailPath(imagePath.path, libraryId, 'image')
+    base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
+  }

  const { tags: currentItemTags } = getResolvedTagsForItem(itemKey)
-  const systemPromptWithContext = buildTagPrompt(tags, categories, currentItemTags)
+  const systemPromptWithContext = buildTagPrompt(tags, categories, currentItemTags, imagePath.mediaType)

-  const suggestedIds = await callVisionApi(config.endpoint, config.model, base64, systemPromptWithContext)
+  const suggestedIds = await callVisionApi(config.endpoint, config.model, base64Images, systemPromptWithContext)
  const validIds = suggestedIds.filter((id) => validTagIds.has(id))

  for (const tagId of validIds) {
--- a/src/lib/thumbnails.ts
+++ b/src/lib/thumbnails.ts
@@ -87,22 +87,13 @@ async function getVideoDuration(src: string): Promise<number> {
  })
 }

-/** Generate a thumbnail from a video using ffmpeg. */
-async function generateVideoThumbnail(src: string, dest: string): Promise<void> {
+/** Extract a single frame from a video at the given offset (seconds) and write to dest. */
+async function generateVideoFrameAtOffset(src: string, dest: string, offsetSeconds: number): Promise<void> {
  const tmp = dest + '.tmp'

-  // Seek to 10% of the video duration for a representative frame
-  let offset = 0
-  try {
-    const duration = await getVideoDuration(src)
-    offset = Math.max(0, duration * 0.1)
-  } catch {
-    // If ffprobe fails, fall back to seeking to 0
-  }
-
  const args = [
    '-y',                        // overwrite output
-    '-ss', String(offset),       // seek before input (fast)
+    '-ss', String(offsetSeconds), // seek before input (fast)
    '-i', src,
    '-frames:v', '1',
    '-q:v', '5',
@@ -115,6 +106,58 @@ async function generateVideoThumbnail(src: string, dest: string): Promise<void>
  fs.renameSync(tmp, dest)
 }

+/** Create a video thumbnail via ffmpeg, using the frame at 10% of the duration. */
+async function generateVideoThumbnail(src: string, dest: string): Promise<void> {
+  const pickSeekSeconds = async (): Promise<number> => {
+    try {
+      return Math.max(0, (await getVideoDuration(src)) * 0.1)
+    } catch {
+      return 0 // ffprobe failed: fall back to seeking to 0
+    }
+  }
+  await generateVideoFrameAtOffset(src, dest, await pickSeekSeconds())
+}
+
+/**
+ * Extract one frame per entry of `percentages` (fractions of the video duration, e.g. 0.25)
+ * and return the cached JPEG paths in the same order. Each frame has its own cache key.
+ * If the duration cannot be probed, every entry maps to one shared frame taken at 0s; it is
+ * cached under a separate key so it is never reused as a real percentage frame later.
+ */
+export async function getVideoFramePaths(
+  absoluteFilePath: string,
+  libraryId: string,
+  percentages: number[]
+): Promise<string[]> {
+  ensureCacheDir()
+  if (percentages.length === 0) return []
+
+  // Unknown duration (probe failed, or a non-finite / non-positive value) is stored as 0.
+  const probed = await getVideoDuration(absoluteFilePath).catch(() => 0)
+  const duration = Number.isFinite(probed) && probed > 0 ? probed : 0
+
+  const framePaths: string[] = []
+  const ensured = new Set<string>() // cache files already checked/generated in this call
+
+  for (const pct of percentages) {
+    const offset = Math.max(0, duration * pct)
+    const key = crypto
+      .createHash('sha1')
+      .update(libraryId + ':' + absoluteFilePath + ':' + (duration > 0 ? pct : 'start'))
+      .digest('hex')
+    const cacheFile = path.join(CACHE_DIR, key + '.jpg')
+
+    // Generate only on a cache miss, and at most once per distinct cache file.
+    if (!ensured.has(cacheFile) && !getCachedPath(cacheFile, absoluteFilePath)) {
+      await generateVideoFrameAtOffset(absoluteFilePath, cacheFile, offset)
+    }
+    ensured.add(cacheFile)
+    framePaths.push(cacheFile)
+  }
+
+  return framePaths
+}
+
 /**
 * Returns the absolute path to a cached thumbnail JPEG for the given file.
 * Generates it on first call (or when the source has been modified).