Compare commits

...

2 Commits

Author SHA1 Message Date
Garret Patti
cd9a83ea90 send higher resolution images to AI vision endpoints
Add getAiImagePath() to thumbnails.ts (1920px wide, quality 90, no
upscaling) cached separately from display thumbnails via an _ai suffix.
Swap all four image-to-AI code paths in ai-tagger.ts (extract text,
describe, batch tagging x2) to use the new high-res image instead of
the 400px display thumbnail, improving OCR accuracy on dense text.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 09:08:43 -04:00
Garret Patti
5ba73b2e56 doom scroll and viewer improvements
- move play/pause to clicking the video directly; remove dedicated button
- replace emoji mute icons with flat minimal SVGs
- add view-in-library button in doom scroll that navigates to the file's
  directory and opens it in the regular viewer
- add display text overlay button in doom scroll and image lightbox;
  shows extracted text (translated by default when available) in a
  semi-transparent box at the bottom; toggle between translated/original
- hide tag panel by default in image lightbox and video player modal

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 08:16:34 -04:00
6 changed files with 249 additions and 25 deletions

View File

@@ -14,6 +14,7 @@ interface Props {
items: DoomScrollItem[]
videoContext?: 'mixed' | 'movies' | 'tv'
onClose: () => void
onViewInLibrary?: (item: DoomScrollItem) => void
}
const HISTORY_CAP = 100
@@ -26,7 +27,7 @@ function pickRandom(items: DoomScrollItem[], excludeRecent: DoomScrollItem[]): D
return pool[Math.floor(Math.random() * pool.length)]
}
export default function DoomScrollView({ items, videoContext = 'mixed', onClose }: Props) {
export default function DoomScrollView({ items, videoContext = 'mixed', onClose, onViewInLibrary }: Props) {
const settings = useUserSettings()
const settingsMuted = videoContext === 'mixed' ? settings.mixedMuted : videoContext === 'movies' ? settings.moviesMuted : settings.tvMuted
@@ -40,6 +41,12 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose
const [autoPlayEnabled, setAutoPlayEnabled] = useState(false)
const [autoPlaySeconds, setAutoPlaySeconds] = useState(5)
// Text overlay state
const [extractedText, setExtractedText] = useState<string | null>(null)
const [translatedText, setTranslatedText] = useState<string | null>(null)
const [showTextOverlay, setShowTextOverlay] = useState(false)
const [showOriginal, setShowOriginal] = useState(false)
const videoRef = useRef<HTMLVideoElement>(null)
const cooldownRef = useRef(false)
const touchStartY = useRef<number | null>(null)
@@ -48,6 +55,9 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose
const isVideo = current?.mediaType === 'video'
const backCount = history.length - 1 - historyIndex
// Derived: what text to display in the overlay
const displayText = (translatedText && !showOriginal) ? translatedText : extractedText
const goNext = useCallback(() => {
if (items.length === 0) return
setHistoryIndex((idx) => {
@@ -114,11 +124,30 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose
return () => clearTimeout(id)
}, [autoPlayEnabled, isPaused, autoPlaySeconds, current?.url, goNext])
// Fetch extracted text for current item
useEffect(() => {
setExtractedText(null)
setTranslatedText(null)
setShowTextOverlay(false)
setShowOriginal(false)
if (!current?.itemKey) return
fetch(`/api/ai-tagging/fields?itemKey=${encodeURIComponent(current.itemKey)}`)
.then((r) => r.json())
.then((data: { extractedText: string | null; extractedTextTranslated: string | null }) => {
setExtractedText(data.extractedText)
setTranslatedText(data.extractedTextTranslated)
})
.catch(() => {})
}, [current?.itemKey])
useEffect(() => {
const handleKey = (e: KeyboardEvent) => {
if (e.key === 'Escape') { onClose(); return }
if (e.key === 'ArrowDown' || e.key === ' ' || e.key === 'PageDown') { e.preventDefault(); navigate('next') }
if (e.key === 'ArrowUp' || e.key === 'PageUp') { e.preventDefault(); navigate('prev') }
if (e.key === 't' || e.key === 'T') {
if (extractedText) setShowTextOverlay((v) => !v)
}
}
const handleWheel = (e: WheelEvent) => {
e.preventDefault()
@@ -147,7 +176,7 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose
document.removeEventListener('touchend', handleTouchEnd)
document.body.style.overflow = ''
}
}, [navigate, onClose])
}, [navigate, onClose, extractedText])
return (
<div className="fixed inset-0 z-50 flex flex-col" style={{ backgroundColor: '#000' }}>
@@ -219,8 +248,9 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose
loop={!autoPlayEnabled}
muted={localMuted}
playsInline
className="max-w-full max-h-full object-contain"
className="max-w-full max-h-full object-contain cursor-pointer"
style={{ backgroundColor: '#000' }}
onClick={() => setIsPaused((v) => !v)}
/>
) : current?.mediaType === 'image' ? (
// eslint-disable-next-line @next/next/no-img-element
@@ -233,32 +263,88 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose
) : null}
</div>
{/* Bottom bar: mute | filename | play-pause */}
{/* Text overlay */}
{showTextOverlay && displayText && (
<div
className="absolute bottom-16 left-4 right-4 z-20 rounded-xl p-4"
style={{ backgroundColor: 'rgba(0,0,0,0.75)' }}
onClick={(e) => e.stopPropagation()}
>
{extractedText && translatedText && (
<div className="flex justify-end mb-2">
<button
onClick={() => setShowOriginal((v) => !v)}
className="text-xs px-2 py-0.5 rounded-full"
style={{ backgroundColor: 'rgba(255,255,255,0.15)', color: 'rgba(255,255,255,0.7)' }}
>
{showOriginal ? 'Show Translation' : 'Show Original'}
</button>
</div>
)}
<p className="text-sm whitespace-pre-wrap" style={{ color: 'rgba(255,255,255,0.9)' }}>
{displayText}
</p>
</div>
)}
{/* Bottom bar: mute | filename | action buttons */}
<div className="absolute bottom-0 left-0 right-0 flex items-center gap-3 px-4 pb-3 pt-2 z-10">
<div className="w-9 flex-shrink-0">
{isVideo && (
<button
onClick={() => setLocalMuted((v) => !v)}
className="w-9 h-9 rounded-full flex items-center justify-center text-base transition-opacity hover:opacity-100 opacity-70"
className="w-9 h-9 rounded-full flex items-center justify-center transition-opacity hover:opacity-100 opacity-70"
style={{ backgroundColor: 'rgba(0,0,0,0.5)', color: '#fff' }}
aria-label={localMuted ? 'Unmute' : 'Mute'}
>
{localMuted ? '🔇' : '🔊'}
{localMuted ? (
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round">
<polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5"/>
<line x1="23" y1="9" x2="17" y2="15"/>
<line x1="17" y1="9" x2="23" y2="15"/>
</svg>
) : (
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round">
<polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5"/>
<path d="M19.07 4.93a10 10 0 0 1 0 14.14"/>
<path d="M15.54 8.46a5 5 0 0 1 0 7.07"/>
</svg>
)}
</button>
)}
</div>
<span className="flex-1 text-xs truncate text-center" style={{ color: 'rgba(255,255,255,0.4)' }}>
{current?.name}
</span>
<div className="w-9 flex-shrink-0 flex justify-end">
{isVideo && (
<div className="flex-shrink-0 flex items-center gap-1">
{extractedText && (
<button
onClick={() => setIsPaused((v) => !v)}
className="w-9 h-9 rounded-full flex items-center justify-center text-sm transition-opacity hover:opacity-100 opacity-70"
style={{ backgroundColor: 'rgba(0,0,0,0.5)', color: '#fff' }}
aria-label={isPaused ? 'Play' : 'Pause'}
onClick={() => setShowTextOverlay((v) => !v)}
className="w-9 h-9 rounded-full flex items-center justify-center transition-opacity hover:opacity-100 opacity-70"
style={{
backgroundColor: showTextOverlay ? 'rgba(255,255,255,0.2)' : 'rgba(0,0,0,0.5)',
color: '#fff',
}}
aria-label={showTextOverlay ? 'Hide text' : 'Show text'}
>
{isPaused ? '▶' : '⏸'}
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round">
<line x1="3" y1="6" x2="21" y2="6"/>
<line x1="3" y1="12" x2="15" y2="12"/>
<line x1="3" y1="18" x2="18" y2="18"/>
</svg>
</button>
)}
{onViewInLibrary && current?.itemKey && (
<button
onClick={(e) => { e.stopPropagation(); onViewInLibrary(current) }}
className="w-9 h-9 rounded-full flex items-center justify-center transition-opacity hover:opacity-100 opacity-70"
style={{ backgroundColor: 'rgba(0,0,0,0.5)', color: '#fff' }}
aria-label="View in library"
>
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round">
<path d="M3 9l9-7 9 7v11a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2z"/>
<polyline points="9 22 9 12 15 12 15 22"/>
</svg>
</button>
)}
</div>

View File

@@ -16,9 +16,7 @@ interface Props {
export default function ImageLightbox({ url, name, onClose, onPrev, onNext, itemKey, onTagsChanged, onAiTag }: Props) {
const overlayRef = useRef<HTMLDivElement>(null)
const [showTags, setShowTags] = useState(
() => !!itemKey && typeof window !== 'undefined' && window.innerWidth >= 1280
)
const [showTags, setShowTags] = useState(false)
const [aiTagging, setAiTagging] = useState(false)
const [aiTagError, setAiTagError] = useState<string | null>(null)
const [tagRefreshKey, setTagRefreshKey] = useState(0)
@@ -30,9 +28,16 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item
const [extractError, setExtractError] = useState<string | null>(null)
const [retranslating, setRetranslating] = useState(false)
// Text overlay state
const [showTextOverlay, setShowTextOverlay] = useState(false)
const [showOriginal, setShowOriginal] = useState(false)
// Determine if this is an image file (for text extraction controls)
const isImage = /\.(jpe?g|png|gif|webp|bmp|tiff?)$/i.test(name)
// Derived: what text to display in the overlay
const displayText = (translatedText && !showOriginal) ? translatedText : extractedText
// Fetch existing AI fields on mount / item change
useEffect(() => {
if (!itemKey) return
@@ -76,6 +81,31 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item
{name}
</span>
<div className="flex items-center gap-2 flex-shrink-0">
{/* Text overlay button — only shown when extracted text exists */}
{extractedText && (
<button
onClick={(e) => { e.stopPropagation(); setShowTextOverlay((v) => !v) }}
className="w-12 h-12 rounded-full flex items-center justify-center transition-colors"
style={{
backgroundColor: showTextOverlay ? 'var(--accent)' : 'var(--surface)',
color: showTextOverlay ? '#fff' : 'var(--text-primary)',
}}
onMouseEnter={(e) => {
if (!showTextOverlay) (e.currentTarget as HTMLElement).style.backgroundColor = 'var(--surface-hover)'
}}
onMouseLeave={(e) => {
if (!showTextOverlay) (e.currentTarget as HTMLElement).style.backgroundColor = 'var(--surface)'
}}
aria-label={showTextOverlay ? 'Hide text' : 'Show text'}
title="Display text"
>
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round">
<line x1="3" y1="6" x2="21" y2="6"/>
<line x1="3" y1="12" x2="15" y2="12"/>
<line x1="3" y1="18" x2="18" y2="18"/>
</svg>
</button>
)}
{itemKey && (
<button
onClick={(e) => { e.stopPropagation(); setShowTags((v) => !v) }}
@@ -179,6 +209,29 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item
</button>
)}
{/* Text overlay */}
{showTextOverlay && displayText && (
<div
className="absolute bottom-4 left-4 right-4 z-10 rounded-xl p-4"
style={{ backgroundColor: 'rgba(0,0,0,0.75)' }}
onClick={(e) => e.stopPropagation()}
>
{extractedText && translatedText && (
<div className="flex justify-end mb-2">
<button
onClick={() => setShowOriginal((v) => !v)}
className="text-xs px-2 py-0.5 rounded-full"
style={{ backgroundColor: 'rgba(255,255,255,0.15)', color: 'rgba(255,255,255,0.7)' }}
>
{showOriginal ? 'Show Translation' : 'Show Original'}
</button>
</div>
)}
<p className="text-sm whitespace-pre-wrap" style={{ color: 'rgba(255,255,255,0.9)' }}>
{displayText}
</p>
</div>
)}
</div>
{/* Tag panel */}
<div
@@ -343,6 +396,29 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item
</button>
)}
{/* Text overlay */}
{showTextOverlay && displayText && (
<div
className="absolute bottom-4 left-4 right-4 z-10 rounded-xl p-4"
style={{ backgroundColor: 'rgba(0,0,0,0.75)' }}
onClick={(e) => e.stopPropagation()}
>
{extractedText && translatedText && (
<div className="flex justify-end mb-2">
<button
onClick={() => setShowOriginal((v) => !v)}
className="text-xs px-2 py-0.5 rounded-full"
style={{ backgroundColor: 'rgba(255,255,255,0.15)', color: 'rgba(255,255,255,0.7)' }}
>
{showOriginal ? 'Show Translation' : 'Show Original'}
</button>
</div>
)}
<p className="text-sm whitespace-pre-wrap" style={{ color: 'rgba(255,255,255,0.9)' }}>
{displayText}
</p>
</div>
)}
</div>
)}
</div>

View File

@@ -41,6 +41,7 @@ export default function MixedView({ libraryId, initialPath }: Props) {
const [doomScrollEntries, setDoomScrollEntries] = useState<FileEntry[]>([])
const [doomScrollEntriesLoading, setDoomScrollEntriesLoading] = useState(false)
const [doomScrollEntriesLoaded, setDoomScrollEntriesLoaded] = useState(false)
const [pendingOpen, setPendingOpen] = useState<string | null>(null)
const toggleTag = (tagId: string) =>
setSelectedTagIds((prev) => {
@@ -234,9 +235,39 @@ export default function MixedView({ libraryId, initialPath }: Props) {
// When filters are active, doom scroll uses filteredEntries (already filtered by search/tags).
// When no filters, doom scroll uses files recursively under the current directory.
// In both cases entries come from recursive API calls so entry.name is the full relative path.
const doomScrollItems: DoomScrollItem[] = (filtersActive ? filteredEntries : doomScrollEntries)
.filter((e) => e.type === 'file' && (e.mediaType === 'video' || e.mediaType === 'image') && e.url && isBrowserPlayable(e.name))
.map((e) => ({ url: e.url!, name: e.name, mediaType: e.mediaType as 'video' | 'image' }))
.map((e) => ({
url: e.url!,
name: e.name,
mediaType: e.mediaType as 'video' | 'image',
itemKey: `${libraryId}:mixed_file:${encodeURIComponent(e.name)}`,
}))
const handleViewInLibrary = useCallback((item: DoomScrollItem) => {
if (!item.itemKey) return
const rel = decodeURIComponent(item.itemKey.split(':mixed_file:')[1])
const parts = rel.split('/')
parts.pop()
const dir = parts.join('/')
setDoomScrollActive(false)
setPendingOpen(rel)
loadPath(dir)
}, [loadPath])
// Auto-open a file after navigating to its directory (from "view in library")
useEffect(() => {
if (!pendingOpen || !listing) return
const filename = pendingOpen.split('/').pop()!
const entry = listing.entries.find((e) => e.name === filename && e.type === 'file')
if (!entry) return
setPendingOpen(null)
const idx = mediaEntries.indexOf(entry)
openMediaEntry(entry, idx >= 0 ? idx : 0)
// openMediaEntry is defined inline and depends on stable state; listing/pendingOpen are the real triggers
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [listing, pendingOpen])
return (
<>
@@ -245,6 +276,7 @@ export default function MixedView({ libraryId, initialPath }: Props) {
items={doomScrollItems}
videoContext="mixed"
onClose={() => setDoomScrollActive(false)}
onViewInLibrary={handleViewInLibrary}
/>
)}

View File

@@ -22,9 +22,7 @@ export default function VideoPlayerModal({ url, name, onClose, onPrev, onNext, i
const loop = context === 'mixed' ? settings.mixedLoop : context === 'movies' ? settings.moviesLoop : settings.tvLoop
const muted = context === 'mixed' ? settings.mixedMuted : context === 'movies' ? settings.moviesMuted : settings.tvMuted
const overlayRef = useRef<HTMLDivElement>(null)
const [showTags, setShowTags] = useState(
() => !!itemKey && typeof window !== 'undefined' && window.innerWidth >= 1280
)
const [showTags, setShowTags] = useState(false)
const [aiTagging, setAiTagging] = useState(false)
const [aiTagError, setAiTagError] = useState<string | null>(null)
const [tagRefreshKey, setTagRefreshKey] = useState(0)

View File

@@ -4,7 +4,7 @@ import type { Library, Tag, TagCategory } from '@/types'
import { getDb } from './db'
import { getAiConfig, getEffectiveAiConfig, getPreferredLanguage } from './app-settings'
import { getTags, getCategories, addTagToItem, getActiveCategoryIdsForLibrary, getResolvedTagsForItem } from './tags'
import { getThumbnailPath, getVideoFramePaths } from './thumbnails'
import { getAiImagePath, getVideoFramePaths } from './thumbnails'
import { findFile } from './media-utils'
import { getLibrary, resolveLibraryRoot } from './libraries'
@@ -279,7 +279,7 @@ export async function runAiTagging(library: Library, libraryRoot: string): Promi
const framePaths = await getVideoFramePaths(resolvedMedia.path, library.id, VIDEO_FRAME_PERCENTAGES)
base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64'))
} else {
const thumbnailPath = await getThumbnailPath(resolvedMedia.path, library.id, 'image')
const thumbnailPath = await getAiImagePath(resolvedMedia.path, library.id)
base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
}
@@ -367,7 +367,7 @@ export async function tagSingleItem(itemKey: string): Promise<string[]> {
const framePaths = await getVideoFramePaths(imagePath.path, libraryId, VIDEO_FRAME_PERCENTAGES)
base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64'))
} else {
const thumbnailPath = await getThumbnailPath(imagePath.path, libraryId, 'image')
const thumbnailPath = await getAiImagePath(imagePath.path, libraryId)
base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
}
@@ -529,7 +529,7 @@ export async function generateItemDescription(itemKey: string): Promise<string>
const framePaths = await getVideoFramePaths(resolvedMedia.path, libraryId, VIDEO_FRAME_PERCENTAGES)
base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64'))
} else {
const thumbnailPath = await getThumbnailPath(resolvedMedia.path, libraryId, 'image')
const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId)
base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
}
@@ -587,7 +587,7 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText:
throw Object.assign(new Error('Text extraction is only available for images'), { code: 'NO_IMAGE' })
}
const thumbnailPath = await getThumbnailPath(resolvedMedia.path, libraryId, 'image')
const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId)
const base64Images = [fs.readFileSync(thumbnailPath, 'base64')]
const systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${config.promptExtract ? ' ' + config.promptExtract : ''} If there is no text in the image, respond with exactly: [NO TEXT]`

View File

@@ -7,6 +7,8 @@ import sharp from 'sharp'
const CACHE_DIR = path.resolve(process.cwd(), '.thumbnails')
const THUMBNAIL_WIDTH = 400
const JPEG_QUALITY = 75
const AI_IMAGE_WIDTH = 1920
const AI_JPEG_QUALITY = 90
/** Ensure the cache directory exists. */
function ensureCacheDir(): void {
@@ -47,6 +49,17 @@ async function generateImageThumbnail(src: string, dest: string): Promise<void>
fs.renameSync(tmp, dest)
}
/** Generate a high-resolution JPEG for AI vision use. Images smaller than
* AI_IMAGE_WIDTH are not upscaled — they are converted at their native size. */
async function generateAiImage(src: string, dest: string): Promise<void> {
const tmp = dest + '.tmp'
await sharp(src)
.resize(AI_IMAGE_WIDTH, undefined, { withoutEnlargement: true })
.jpeg({ quality: AI_JPEG_QUALITY })
.toFile(tmp)
fs.renameSync(tmp, dest)
}
/** Run a child process and collect stderr. Resolves on exit code 0, rejects otherwise. */
function run(bin: string, args: string[]): Promise<void> {
return new Promise((resolve, reject) => {
@@ -158,6 +171,25 @@ export async function getVideoFramePaths(
return framePaths
}
/**
* Returns the absolute path to a high-resolution JPEG suitable for AI vision
* APIs (1920px wide max, quality 90). Cached alongside display thumbnails with
* an `_ai` suffix so display performance is unaffected.
* Generates on first call or when the source file has been modified.
*/
export async function getAiImagePath(
absoluteFilePath: string,
libraryId: string
): Promise<string> {
ensureCacheDir()
const key = cacheKey(libraryId, absoluteFilePath)
const cacheFile = path.join(CACHE_DIR, key + '_ai.jpg')
const cached = getCachedPath(cacheFile, absoluteFilePath)
if (cached) return cached
await generateAiImage(absoluteFilePath, cacheFile)
return cacheFile
}
/**
* Returns the absolute path to a cached thumbnail JPEG for the given file.
* Generates it on first call (or when the source has been modified).