From 5ba73b2e560ebcd9605ebf2ba4e466d9d27e4300 Mon Sep 17 00:00:00 2001 From: Garret Patti <42485635+garretpatti@users.noreply.github.com> Date: Mon, 13 Apr 2026 08:16:34 -0400 Subject: [PATCH 1/3] doom scroll and viewer improvements - move play/pause to clicking the video directly; remove dedicated button - replace emoji mute icons with flat minimal SVGs - add view-in-library button in doom scroll that navigates to the file's directory and opens it in the regular viewer - add display text overlay button in doom scroll and image lightbox; shows extracted text (translated by default when available) in a semi-transparent box at the bottom; toggle between translated/original - hide tag panel by default in image lightbox and video player modal Co-Authored-By: Claude Sonnet 4.6 --- src/components/DoomScrollView.tsx | 112 +++++++++++++++++++--- src/components/mixed/ImageLightbox.tsx | 82 +++++++++++++++- src/components/mixed/MixedView.tsx | 34 ++++++- src/components/mixed/VideoPlayerModal.tsx | 4 +- 4 files changed, 212 insertions(+), 20 deletions(-) diff --git a/src/components/DoomScrollView.tsx b/src/components/DoomScrollView.tsx index 4de737d..f6d6734 100644 --- a/src/components/DoomScrollView.tsx +++ b/src/components/DoomScrollView.tsx @@ -14,6 +14,7 @@ interface Props { items: DoomScrollItem[] videoContext?: 'mixed' | 'movies' | 'tv' onClose: () => void + onViewInLibrary?: (item: DoomScrollItem) => void } const HISTORY_CAP = 100 @@ -26,7 +27,7 @@ function pickRandom(items: DoomScrollItem[], excludeRecent: DoomScrollItem[]): D return pool[Math.floor(Math.random() * pool.length)] } -export default function DoomScrollView({ items, videoContext = 'mixed', onClose }: Props) { +export default function DoomScrollView({ items, videoContext = 'mixed', onClose, onViewInLibrary }: Props) { const settings = useUserSettings() const settingsMuted = videoContext === 'mixed' ? settings.mixedMuted : videoContext === 'movies' ? settings.moviesMuted : settings.tvMuted @@ -40,6 +41,12 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose const [autoPlayEnabled, setAutoPlayEnabled] = useState(false) const [autoPlaySeconds, setAutoPlaySeconds] = useState(5) + // Text overlay state + const [extractedText, setExtractedText] = useState(null) + const [translatedText, setTranslatedText] = useState(null) + const [showTextOverlay, setShowTextOverlay] = useState(false) + const [showOriginal, setShowOriginal] = useState(false) + const videoRef = useRef(null) const cooldownRef = useRef(false) const touchStartY = useRef(null) @@ -48,6 +55,9 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose const isVideo = current?.mediaType === 'video' const backCount = history.length - 1 - historyIndex + // Derived: what text to display in the overlay + const displayText = (translatedText && !showOriginal) ? translatedText : extractedText + const goNext = useCallback(() => { if (items.length === 0) return setHistoryIndex((idx) => { @@ -114,11 +124,30 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose return () => clearTimeout(id) }, [autoPlayEnabled, isPaused, autoPlaySeconds, current?.url, goNext]) + // Fetch extracted text for current item + useEffect(() => { + setExtractedText(null) + setTranslatedText(null) + setShowTextOverlay(false) + setShowOriginal(false) + if (!current?.itemKey) return + fetch(`/api/ai-tagging/fields?itemKey=${encodeURIComponent(current.itemKey)}`) + .then((r) => r.json()) + .then((data: { extractedText: string | null; extractedTextTranslated: string | null }) => { + setExtractedText(data.extractedText) + setTranslatedText(data.extractedTextTranslated) + }) + .catch(() => {}) + }, [current?.itemKey]) + useEffect(() => { const handleKey = (e: KeyboardEvent) => { if (e.key === 'Escape') { onClose(); return } if (e.key === 'ArrowDown' || e.key === ' ' || e.key === 'PageDown') { e.preventDefault(); navigate('next') } if (e.key === 'ArrowUp' || e.key === 'PageUp') { e.preventDefault(); navigate('prev') } + if (e.key === 't' || e.key === 'T') { + if (extractedText) setShowTextOverlay((v) => !v) + } } const handleWheel = (e: WheelEvent) => { e.preventDefault() @@ -147,7 +176,7 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose document.removeEventListener('touchend', handleTouchEnd) document.body.style.overflow = '' } - }, [navigate, onClose]) + }, [navigate, onClose, extractedText]) return (
@@ -219,8 +248,9 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose loop={!autoPlayEnabled} muted={localMuted} playsInline - className="max-w-full max-h-full object-contain" + className="max-w-full max-h-full object-contain cursor-pointer" style={{ backgroundColor: '#000' }} + onClick={() => setIsPaused((v) => !v)} /> ) : current?.mediaType === 'image' ? ( // eslint-disable-next-line @next/next/no-img-element @@ -233,32 +263,88 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose ) : null}
- {/* Bottom bar: mute | filename | play-pause */} + {/* Text overlay */} + {showTextOverlay && displayText && ( +
e.stopPropagation()} + > + {extractedText && translatedText && ( +
+ +
+ )} +

+ {displayText} +

+
+ )} + + {/* Bottom bar: mute | filename | action buttons */}
{isVideo && ( )}
{current?.name} -
- {isVideo && ( +
+ {extractedText && ( + )} + {onViewInLibrary && current?.itemKey && ( + )}
diff --git a/src/components/mixed/ImageLightbox.tsx b/src/components/mixed/ImageLightbox.tsx index f3c413e..c49c52e 100644 --- a/src/components/mixed/ImageLightbox.tsx +++ b/src/components/mixed/ImageLightbox.tsx @@ -16,9 +16,7 @@ interface Props { export default function ImageLightbox({ url, name, onClose, onPrev, onNext, itemKey, onTagsChanged, onAiTag }: Props) { const overlayRef = useRef(null) - const [showTags, setShowTags] = useState( - () => !!itemKey && typeof window !== 'undefined' && window.innerWidth >= 1280 - ) + const [showTags, setShowTags] = useState(false) const [aiTagging, setAiTagging] = useState(false) const [aiTagError, setAiTagError] = useState(null) const [tagRefreshKey, setTagRefreshKey] = useState(0) @@ -30,9 +28,16 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item const [extractError, setExtractError] = useState(null) const [retranslating, setRetranslating] = useState(false) + // Text overlay state + const [showTextOverlay, setShowTextOverlay] = useState(false) + const [showOriginal, setShowOriginal] = useState(false) + // Determine if this is an image file (for text extraction controls) const isImage = /\.(jpe?g|png|gif|webp|bmp|tiff?)$/i.test(name) + // Derived: what text to display in the overlay + const displayText = (translatedText && !showOriginal) ? translatedText : extractedText + // Fetch existing AI fields on mount / item change useEffect(() => { if (!itemKey) return @@ -76,6 +81,31 @@ export default function ImageLightbox({ url, name, onClose, onPrev, onNext, item {name}
+ {/* Text overlay button — only shown when extracted text exists */} + {extractedText && ( + + )} {itemKey && ( )} + {/* Text overlay */} + {showTextOverlay && displayText && ( +
e.stopPropagation()} + > + {extractedText && translatedText && ( +
+ +
+ )} +

+ {displayText} +

+
+ )}
{/* Tag panel */}
)} + {/* Text overlay */} + {showTextOverlay && displayText && ( +
e.stopPropagation()} + > + {extractedText && translatedText && ( +
+ +
+ )} +

+ {displayText} +

+
+ )}
)}
diff --git a/src/components/mixed/MixedView.tsx b/src/components/mixed/MixedView.tsx index bfae948..de652f9 100644 --- a/src/components/mixed/MixedView.tsx +++ b/src/components/mixed/MixedView.tsx @@ -41,6 +41,7 @@ export default function MixedView({ libraryId, initialPath }: Props) { const [doomScrollEntries, setDoomScrollEntries] = useState([]) const [doomScrollEntriesLoading, setDoomScrollEntriesLoading] = useState(false) const [doomScrollEntriesLoaded, setDoomScrollEntriesLoaded] = useState(false) + const [pendingOpen, setPendingOpen] = useState(null) const toggleTag = (tagId: string) => setSelectedTagIds((prev) => { @@ -234,9 +235,39 @@ export default function MixedView({ libraryId, initialPath }: Props) { // When filters are active, doom scroll uses filteredEntries (already filtered by search/tags). // When no filters, doom scroll uses files recursively under the current directory. + // In both cases entries come from recursive API calls so entry.name is the full relative path. const doomScrollItems: DoomScrollItem[] = (filtersActive ? filteredEntries : doomScrollEntries) .filter((e) => e.type === 'file' && (e.mediaType === 'video' || e.mediaType === 'image') && e.url && isBrowserPlayable(e.name)) - .map((e) => ({ url: e.url!, name: e.name, mediaType: e.mediaType as 'video' | 'image' })) + .map((e) => ({ + url: e.url!, + name: e.name, + mediaType: e.mediaType as 'video' | 'image', + itemKey: `${libraryId}:mixed_file:${encodeURIComponent(e.name)}`, + })) + + const handleViewInLibrary = useCallback((item: DoomScrollItem) => { + if (!item.itemKey) return + const rel = decodeURIComponent(item.itemKey.split(':mixed_file:')[1]) + const parts = rel.split('/') + parts.pop() + const dir = parts.join('/') + setDoomScrollActive(false) + setPendingOpen(rel) + loadPath(dir) + }, [loadPath]) + + // Auto-open a file after navigating to its directory (from "view in library") + useEffect(() => { + if (!pendingOpen || !listing) return + const filename = pendingOpen.split('/').pop()! + const entry = listing.entries.find((e) => e.name === filename && e.type === 'file') + if (!entry) return + setPendingOpen(null) + const idx = mediaEntries.indexOf(entry) + openMediaEntry(entry, idx >= 0 ? idx : 0) + // openMediaEntry is defined inline and depends on stable state; listing/pendingOpen are the real triggers + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [listing, pendingOpen]) return ( <> @@ -245,6 +276,7 @@ export default function MixedView({ libraryId, initialPath }: Props) { items={doomScrollItems} videoContext="mixed" onClose={() => setDoomScrollActive(false)} + onViewInLibrary={handleViewInLibrary} /> )} diff --git a/src/components/mixed/VideoPlayerModal.tsx b/src/components/mixed/VideoPlayerModal.tsx index b1a1031..fc01156 100644 --- a/src/components/mixed/VideoPlayerModal.tsx +++ b/src/components/mixed/VideoPlayerModal.tsx @@ -22,9 +22,7 @@ export default function VideoPlayerModal({ url, name, onClose, onPrev, onNext, i const loop = context === 'mixed' ? settings.mixedLoop : context === 'movies' ? settings.moviesLoop : settings.tvLoop const muted = context === 'mixed' ? settings.mixedMuted : context === 'movies' ? settings.moviesMuted : settings.tvMuted const overlayRef = useRef(null) - const [showTags, setShowTags] = useState( - () => !!itemKey && typeof window !== 'undefined' && window.innerWidth >= 1280 - ) + const [showTags, setShowTags] = useState(false) const [aiTagging, setAiTagging] = useState(false) const [aiTagError, setAiTagError] = useState(null) const [tagRefreshKey, setTagRefreshKey] = useState(0) -- 2.49.1 From cd9a83ea90f18bdd2de1f3b3a7e7ec8b0da89b43 Mon Sep 17 00:00:00 2001 From: Garret Patti <42485635+garretpatti@users.noreply.github.com> Date: Mon, 13 Apr 2026 09:08:43 -0400 Subject: [PATCH 2/3] send higher resolution images to AI vision endpoints Add getAiImagePath() to thumbnails.ts (1920px wide, quality 90, no upscaling) cached separately from display thumbnails via an _ai suffix. Swap all four image-to-AI code paths in ai-tagger.ts (extract text, describe, batch tagging x2) to use the new high-res image instead of the 400px display thumbnail, improving OCR accuracy on dense text. Co-Authored-By: Claude Sonnet 4.6 --- src/lib/ai-tagger.ts | 10 +++++----- src/lib/thumbnails.ts | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/lib/ai-tagger.ts b/src/lib/ai-tagger.ts index fc04423..da2b951 100644 --- a/src/lib/ai-tagger.ts +++ b/src/lib/ai-tagger.ts @@ -4,7 +4,7 @@ import type { Library, Tag, TagCategory } from '@/types' import { getDb } from './db' import { getAiConfig, getEffectiveAiConfig, getPreferredLanguage } from './app-settings' import { getTags, getCategories, addTagToItem, getActiveCategoryIdsForLibrary, getResolvedTagsForItem } from './tags' -import { getThumbnailPath, getVideoFramePaths } from './thumbnails' +import { getAiImagePath, getVideoFramePaths } from './thumbnails' import { findFile } from './media-utils' import { getLibrary, resolveLibraryRoot } from './libraries' @@ -279,7 +279,7 @@ export async function runAiTagging(library: Library, libraryRoot: string): Promi const framePaths = await getVideoFramePaths(resolvedMedia.path, library.id, VIDEO_FRAME_PERCENTAGES) base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64')) } else { - const thumbnailPath = await getThumbnailPath(resolvedMedia.path, library.id, 'image') + const thumbnailPath = await getAiImagePath(resolvedMedia.path, library.id) base64Images = [fs.readFileSync(thumbnailPath, 'base64')] } @@ -367,7 +367,7 @@ export async function tagSingleItem(itemKey: string): Promise { const framePaths = await getVideoFramePaths(imagePath.path, libraryId, VIDEO_FRAME_PERCENTAGES) base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64')) } else { - const thumbnailPath = await getThumbnailPath(imagePath.path, libraryId, 'image') + const thumbnailPath = await getAiImagePath(imagePath.path, libraryId) base64Images = [fs.readFileSync(thumbnailPath, 'base64')] } @@ -529,7 +529,7 @@ export async function generateItemDescription(itemKey: string): Promise const framePaths = await getVideoFramePaths(resolvedMedia.path, libraryId, VIDEO_FRAME_PERCENTAGES) base64Images = framePaths.map((p) => fs.readFileSync(p, 'base64')) } else { - const thumbnailPath = await getThumbnailPath(resolvedMedia.path, libraryId, 'image') + const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId) base64Images = [fs.readFileSync(thumbnailPath, 'base64')] } @@ -587,7 +587,7 @@ export async function extractItemText(itemKey: string): Promise<{ extractedText: throw Object.assign(new Error('Text extraction is only available for images'), { code: 'NO_IMAGE' }) } - const thumbnailPath = await getThumbnailPath(resolvedMedia.path, libraryId, 'image') + const thumbnailPath = await getAiImagePath(resolvedMedia.path, libraryId) const base64Images = [fs.readFileSync(thumbnailPath, 'base64')] const systemPrompt = `You are an OCR assistant. Extract ALL text visible in the image exactly as it appears. Preserve line breaks and formatting.${config.promptExtract ? ' ' + config.promptExtract : ''} If there is no text in the image, respond with exactly: [NO TEXT]` diff --git a/src/lib/thumbnails.ts b/src/lib/thumbnails.ts index 04f80e1..144a446 100644 --- a/src/lib/thumbnails.ts +++ b/src/lib/thumbnails.ts @@ -7,6 +7,8 @@ import sharp from 'sharp' const CACHE_DIR = path.resolve(process.cwd(), '.thumbnails') const THUMBNAIL_WIDTH = 400 const JPEG_QUALITY = 75 +const AI_IMAGE_WIDTH = 1920 +const AI_JPEG_QUALITY = 90 /** Ensure the cache directory exists. */ function ensureCacheDir(): void { @@ -47,6 +49,17 @@ async function generateImageThumbnail(src: string, dest: string): Promise fs.renameSync(tmp, dest) } +/** Generate a high-resolution JPEG for AI vision use. Images smaller than + * AI_IMAGE_WIDTH are not upscaled — they are converted at their native size. */ +async function generateAiImage(src: string, dest: string): Promise { + const tmp = dest + '.tmp' + await sharp(src) + .resize(AI_IMAGE_WIDTH, undefined, { withoutEnlargement: true }) + .jpeg({ quality: AI_JPEG_QUALITY }) + .toFile(tmp) + fs.renameSync(tmp, dest) +} + /** Run a child process and collect stderr. Resolves on exit code 0, rejects otherwise. */ function run(bin: string, args: string[]): Promise { return new Promise((resolve, reject) => { @@ -158,6 +171,25 @@ export async function getVideoFramePaths( return framePaths } +/** + * Returns the absolute path to a high-resolution JPEG suitable for AI vision + * APIs (1920px wide max, quality 90). Cached alongside display thumbnails with + * an `_ai` suffix so display performance is unaffected. + * Generates on first call or when the source file has been modified. + */ +export async function getAiImagePath( + absoluteFilePath: string, + libraryId: string +): Promise { + ensureCacheDir() + const key = cacheKey(libraryId, absoluteFilePath) + const cacheFile = path.join(CACHE_DIR, key + '_ai.jpg') + const cached = getCachedPath(cacheFile, absoluteFilePath) + if (cached) return cached + await generateAiImage(absoluteFilePath, cacheFile) + return cacheFile +} + /** * Returns the absolute path to a cached thumbnail JPEG for the given file. * Generates it on first call (or when the source has been modified). -- 2.49.1 From b0fc275a52658cacf15d1c704ad89af5592ee772 Mon Sep 17 00:00:00 2001 From: Garret Patti <42485635+garretpatti@users.noreply.github.com> Date: Mon, 13 Apr 2026 09:19:32 -0400 Subject: [PATCH 3/3] add extract text button to doom scroll mode Show an extract-text button (document icon) in the bottom bar when the current image has no extracted text yet. Clicking it calls the extract-text API, shows a spinner while in progress, and on success replaces itself with the text-lines display button and auto-opens the overlay. Error state briefly turns the button red. Resets on every item navigation alongside the other text state. Hidden for videos and items without an itemKey. Co-Authored-By: Claude Sonnet 4.6 --- src/components/DoomScrollView.tsx | 57 +++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/src/components/DoomScrollView.tsx b/src/components/DoomScrollView.tsx index f6d6734..04972d9 100644 --- a/src/components/DoomScrollView.tsx +++ b/src/components/DoomScrollView.tsx @@ -46,6 +46,8 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose, const [translatedText, setTranslatedText] = useState(null) const [showTextOverlay, setShowTextOverlay] = useState(false) const [showOriginal, setShowOriginal] = useState(false) + const [extracting, setExtracting] = useState(false) + const [extractError, setExtractError] = useState(null) const videoRef = useRef(null) const cooldownRef = useRef(false) @@ -130,6 +132,8 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose, setTranslatedText(null) setShowTextOverlay(false) setShowOriginal(false) + setExtracting(false) + setExtractError(null) if (!current?.itemKey) return fetch(`/api/ai-tagging/fields?itemKey=${encodeURIComponent(current.itemKey)}`) .then((r) => r.json()) @@ -178,6 +182,32 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose, } }, [navigate, onClose, extractedText]) + const handleExtractText = async () => { + if (!current?.itemKey) return + setExtracting(true) + setExtractError(null) + try { + const res = await fetch('/api/ai-tagging/extract-text', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ itemKey: current.itemKey }), + }) + if (!res.ok) { + const data = await res.json().catch(() => ({})) + throw new Error((data as { error?: string }).error ?? 'Extraction failed') + } + const result = await res.json() + setExtractedText(result.extractedText || null) + setTranslatedText(result.translatedText || null) + if (result.extractedText) setShowTextOverlay(true) + } catch (err) { + setExtractError(err instanceof Error ? err.message : 'Extraction failed') + setTimeout(() => setExtractError(null), 4000) + } finally { + setExtracting(false) + } + } + return (
{/* Keyframe for auto-play progress bar */} @@ -317,7 +347,7 @@ export default function DoomScrollView({ items, videoContext = 'mixed', onClose, {current?.name}
- {extractedText && ( + {extractedText ? ( - )} + ) : current?.itemKey && current?.mediaType === 'image' ? ( + + ) : null} {onViewInLibrary && current?.itemKey && (