From 38a68868631cff74591cb3f7882e357b354b5158 Mon Sep 17 00:00:00 2001 From: Garret Patti <42485635+garretpatti@users.noreply.github.com> Date: Mon, 6 Apr 2026 18:35:02 -0400 Subject: [PATCH] Add file fingerprinting for move-resilient media item identity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Computes a SHA-256 partial-content fingerprint (file size + first 64 KB) for movies, TV episodes, and mixed files during scans. When a file is moved or renamed within a library, the scan detects the fingerprint match, renames the media_items row in-place, and updates media_tags.media_key to match — so tags and NFO metadata survive the move transparently. - src/lib/fingerprint.ts: new computeFingerprint() using sync FS reads - src/lib/db.ts: fingerprint TEXT column + index migration - src/lib/tags.ts: reKeyMediaItem() to update media_tags on rename - src/lib/scanner.ts: replace clear+upsert with detectMoves/reconcileAndPrune for movies, TV episodes, and mixed files; games retain clear+upsert (v1) - TV scan restructured to a single filesystem pass (no double-scanning) Co-Authored-By: Claude Sonnet 4.6 --- src/lib/db.ts | 15 ++ src/lib/fingerprint.ts | 36 +++++ src/lib/scanner.ts | 329 +++++++++++++++++++++++++++++------------ src/lib/tags.ts | 6 + 4 files changed, 290 insertions(+), 96 deletions(-) create mode 100644 src/lib/fingerprint.ts diff --git a/src/lib/db.ts b/src/lib/db.ts index ef42ed9..54fd02b 100644 --- a/src/lib/db.ts +++ b/src/lib/db.ts @@ -89,15 +89,18 @@ function initDb(db: Database.Database): void { genres TEXT, metadata TEXT, file_path TEXT, + fingerprint TEXT, scanned_at INTEGER NOT NULL ); CREATE INDEX IF NOT EXISTS media_items_library_id ON media_items(library_id); CREATE INDEX IF NOT EXISTS media_items_parent_key ON media_items(parent_key); + CREATE INDEX IF NOT EXISTS media_items_fingerprint ON media_items(fingerprint); `) migrateLibrariesType(db) migrateMediaItemsSchema(db) + 
/**
 * Adds the `fingerprint` column (and its index) to `media_items` on databases
 * that were created before the column existed.
 *
 * The column check reads the table's actual column list rather than searching
 * the stored `CREATE TABLE` text, so an unrelated identifier that merely
 * contains the word "fingerprint" cannot cause the migration to be skipped.
 * The index statement is idempotent and runs whenever the table exists, which
 * also repairs a database that has the column but is missing the index.
 *
 * @param db - Open database handle; it is migrated in place.
 */
function migrateMediaItemsFingerprint(db: Database.Database): void {
  const columns = db.pragma('table_info(media_items)') as Array<{ name: string }>

  // table_info yields no rows when the table does not exist: nothing to migrate.
  if (columns.length === 0) return

  // SQLite column names are case-insensitive, so compare in lower case.
  const hasFingerprint = columns.some((column) => column.name.toLowerCase() === 'fingerprint')
  if (!hasFingerprint) {
    db.exec('ALTER TABLE media_items ADD COLUMN fingerprint TEXT;')
  }

  db.exec('CREATE INDEX IF NOT EXISTS media_items_fingerprint ON media_items(fingerprint);')
}
/**
 * Computes a stable partial-content fingerprint for a file.
 *
 * The digest is SHA-256 over the text `"<size>:"` followed by the leading
 * bytes of the file (at most `chunkSize`), so the cost is one small read no
 * matter how large the file is.
 *
 * @param absolutePath - Path of the file to fingerprint.
 * @param chunkSize - Maximum number of leading bytes to hash. Defaults to
 *   64 KB. Using a different value yields different fingerprints, so values
 *   computed with one size will not match values computed with another.
 * @returns The hex-encoded digest, or `null` if the file cannot be opened or
 *   read (missing, permission error, etc.).
 */
export function computeFingerprint(
  absolutePath: string,
  chunkSize: number = 64 * 1024
): string | null {
  let fd: number | undefined
  try {
    fd = fs.openSync(absolutePath, 'r')

    // Take the size from the open descriptor so the size and the bytes that
    // are hashed always describe the same file.
    const { size } = fs.fstatSync(fd)
    const wanted = Math.min(chunkSize, size)
    const buf = Buffer.alloc(wanted)

    // readSync may return fewer bytes than requested; keep reading until the
    // buffer is full or end-of-file is reached.
    let filled = 0
    while (filled < wanted) {
      const bytesRead = fs.readSync(fd, buf, filled, wanted - filled, filled)
      if (bytesRead === 0) break
      filled += bytesRead
    }

    return crypto
      .createHash('sha256')
      .update(`${size}:`)
      // Hash only what was actually read, never zero padding.
      .update(buf.subarray(0, filled))
      .digest('hex')
  } catch {
    return null
  } finally {
    if (fd !== undefined) {
      try {
        fs.closeSync(fd)
      } catch {
        // Closing failed; the result (or null) has already been decided.
      }
    }
  }
}
`${library.id}:movie:${movie.id}` + const fingerprint = movie.videoPath + ? computeFingerprint(path.join(libraryRoot, movie.videoPath)) + : null + newItems.set(itemKey, { fingerprint, movie }) + } + + // Detect moves using fingerprints + const moves = detectMoves(db, library.id, newItems) + + // Apply renames + prune stale rows + reconcileAndPrune(db, library.id, new Set(newItems.keys()), moves) + + const upsert = db.prepare(` + INSERT INTO media_items (library_id, item_key, item_type, title, year, plot, genres, metadata, file_path, fingerprint, scanned_at) + VALUES (@library_id, @item_key, @item_type, @title, @year, @plot, @genres, @metadata, @file_path, @fingerprint, @scanned_at) + ON CONFLICT(item_key) DO UPDATE SET + title = excluded.title, + year = excluded.year, + plot = excluded.plot, + genres = excluded.genres, + metadata = excluded.metadata, + file_path = excluded.file_path, + fingerprint = excluded.fingerprint, + scanned_at = excluded.scanned_at + `) + + for (const [itemKey, { fingerprint, movie }] of newItems) { upsert.run({ library_id: library.id, item_key: itemKey, @@ -100,10 +117,10 @@ async function scanMovies(library: Library, libraryRoot: string): Promise backdropUrl: movie.backdropUrl, }), file_path: movie.videoPath, + fingerprint, scanned_at: now, }) - // Pre-generate poster thumbnail if (movie.posterUrl) { await prewarmThumbnailFromUrl(movie.posterUrl, library.id, libraryRoot, 'image') } @@ -117,43 +134,80 @@ async function scanMovies(library: Library, libraryRoot: string): Promise // --------------------------------------------------------------------------- async function scanTv(library: Library, libraryRoot: string): Promise { - const series = scanTvLibrary(libraryRoot, library.id) const db = getDb() const now = Date.now() - clearLibraryItems(db, library.id) + // Single filesystem pass — collect everything before touching the DB + type SeasonRow = { season: TvSeason; seasonKey: string; episodes: EpisodeRow[] } + type EpisodeRow = { episode: 
TvEpisode; episodeKey: string; fingerprint: string | null } + type SeriesRow = { show: TvSeries; seriesKey: string; seasons: SeasonRow[] } + + const allSeries: SeriesRow[] = [] + const newKeys = new Set() + const newEpisodes = new Map() + + for (const show of scanTvLibrary(libraryRoot, library.id)) { + const seriesKey = `${library.id}:tv_series:${show.id}` + newKeys.add(seriesKey) + + const seasonRows: SeasonRow[] = [] + for (const season of scanTvSeasons(libraryRoot, library.id, show.id)) { + const seasonKey = `${library.id}:tv_season:${show.id}:${season.id}` + newKeys.add(seasonKey) + + const episodeRows: EpisodeRow[] = [] + for (const episode of scanTvEpisodes(libraryRoot, library.id, show.id, season.id)) { + const episodeKey = `${library.id}:tv_episode:${show.id}:${season.id}:${episode.id}` + newKeys.add(episodeKey) + const fingerprint = episode.videoPath + ? computeFingerprint(path.join(libraryRoot, episode.videoPath)) + : null + episodeRows.push({ episode, episodeKey, fingerprint }) + newEpisodes.set(episodeKey, { fingerprint }) + } + seasonRows.push({ season, seasonKey, episodes: episodeRows }) + } + allSeries.push({ show, seriesKey, seasons: seasonRows }) + } + + // Detect moves among episodes (only episodes have fingerprints) + const moves = detectMoves(db, library.id, newEpisodes) + + // Apply renames + prune stale rows (series, seasons, and episodes) + reconcileAndPrune(db, library.id, newKeys, moves) const upsertSeries = db.prepare(` - INSERT INTO media_items (library_id, item_key, item_type, title, year, plot, genres, metadata, file_path, scanned_at) - VALUES (@library_id, @item_key, @item_type, @title, @year, @plot, @genres, @metadata, @file_path, @scanned_at) + INSERT INTO media_items (library_id, item_key, item_type, title, year, plot, genres, metadata, file_path, fingerprint, scanned_at) + VALUES (@library_id, @item_key, @item_type, @title, @year, @plot, @genres, @metadata, @file_path, @fingerprint, @scanned_at) ON CONFLICT(item_key) DO UPDATE SET 
- title = excluded.title, - year = excluded.year, - plot = excluded.plot, - genres = excluded.genres, - metadata = excluded.metadata, - file_path = excluded.file_path, - scanned_at = excluded.scanned_at + title = excluded.title, + year = excluded.year, + plot = excluded.plot, + genres = excluded.genres, + metadata = excluded.metadata, + file_path = excluded.file_path, + fingerprint = excluded.fingerprint, + scanned_at = excluded.scanned_at `) const upsertChild = db.prepare(` - INSERT INTO media_items (library_id, item_key, item_type, parent_key, title, year, plot, genres, metadata, file_path, scanned_at) - VALUES (@library_id, @item_key, @item_type, @parent_key, @title, @year, @plot, @genres, @metadata, @file_path, @scanned_at) + INSERT INTO media_items (library_id, item_key, item_type, parent_key, title, year, plot, genres, metadata, file_path, fingerprint, scanned_at) + VALUES (@library_id, @item_key, @item_type, @parent_key, @title, @year, @plot, @genres, @metadata, @file_path, @fingerprint, @scanned_at) ON CONFLICT(item_key) DO UPDATE SET - parent_key = excluded.parent_key, - title = excluded.title, - year = excluded.year, - plot = excluded.plot, - genres = excluded.genres, - metadata = excluded.metadata, - file_path = excluded.file_path, - scanned_at = excluded.scanned_at + parent_key = excluded.parent_key, + title = excluded.title, + year = excluded.year, + plot = excluded.plot, + genres = excluded.genres, + metadata = excluded.metadata, + file_path = excluded.file_path, + fingerprint = excluded.fingerprint, + scanned_at = excluded.scanned_at `) let episodeCount = 0 - for (const show of series) { - const seriesKey = `${library.id}:tv_series:${show.id}` + for (const { show, seriesKey, seasons } of allSeries) { upsertSeries.run({ library_id: library.id, item_key: seriesKey, @@ -169,6 +223,7 @@ async function scanTv(library: Library, libraryRoot: string): Promise { backdropUrl: show.backdropUrl, }), file_path: null, + fingerprint: null, scanned_at: now, }) @@ 
-176,9 +231,7 @@ async function scanTv(library: Library, libraryRoot: string): Promise { await prewarmThumbnailFromUrl(show.posterUrl, library.id, libraryRoot, 'image') } - const seasons = scanTvSeasons(libraryRoot, library.id, show.id) - for (const season of seasons) { - const seasonKey = `${library.id}:tv_season:${show.id}:${season.id}` + for (const { season, seasonKey, episodes } of seasons) { upsertChild.run({ library_id: library.id, item_key: seasonKey, @@ -194,6 +247,7 @@ async function scanTv(library: Library, libraryRoot: string): Promise { posterUrl: season.posterUrl, }), file_path: null, + fingerprint: null, scanned_at: now, }) @@ -201,9 +255,7 @@ async function scanTv(library: Library, libraryRoot: string): Promise { await prewarmThumbnailFromUrl(season.posterUrl, library.id, libraryRoot, 'image') } - const episodes = scanTvEpisodes(libraryRoot, library.id, show.id, season.id) - for (const episode of episodes) { - const episodeKey = `${library.id}:tv_episode:${show.id}:${season.id}:${episode.id}` + for (const { episode, episodeKey, fingerprint } of episodes) { upsertChild.run({ library_id: library.id, item_key: episodeKey, @@ -221,10 +273,10 @@ async function scanTv(library: Library, libraryRoot: string): Promise { thumbnailUrl: episode.thumbnailUrl, }), file_path: episode.videoPath, + fingerprint, scanned_at: now, }) - // Pre-generate video thumbnail (seek-based frame extraction) const videoAbsPath = path.join(libraryRoot, episode.videoPath) try { await getThumbnailPath(videoAbsPath, library.id, 'video') @@ -236,11 +288,11 @@ async function scanTv(library: Library, libraryRoot: string): Promise { } } - console.log(`[scanner] tv: indexed ${series.length} series, ${episodeCount} episodes`) + console.log(`[scanner] tv: indexed ${allSeries.length} series, ${episodeCount} episodes`) } // --------------------------------------------------------------------------- -// Games +// Games (v1: no fingerprinting — clear+upsert pattern retained) // 
--------------------------------------------------------------------------- async function scanGames(library: Library, libraryRoot: string): Promise { @@ -251,31 +303,32 @@ async function scanGames(library: Library, libraryRoot: string): Promise { clearLibraryItems(db, library.id) const upsertGame = db.prepare(` - INSERT INTO media_items (library_id, item_key, item_type, title, metadata, file_path, scanned_at) - VALUES (@library_id, @item_key, @item_type, @title, @metadata, @file_path, @scanned_at) + INSERT INTO media_items (library_id, item_key, item_type, title, metadata, file_path, fingerprint, scanned_at) + VALUES (@library_id, @item_key, @item_type, @title, @metadata, @file_path, @fingerprint, @scanned_at) ON CONFLICT(item_key) DO UPDATE SET - title = excluded.title, - metadata = excluded.metadata, - file_path = excluded.file_path, - scanned_at = excluded.scanned_at + title = excluded.title, + metadata = excluded.metadata, + file_path = excluded.file_path, + fingerprint = excluded.fingerprint, + scanned_at = excluded.scanned_at `) const upsertChildGame = db.prepare(` - INSERT INTO media_items (library_id, item_key, item_type, parent_key, title, metadata, file_path, scanned_at) - VALUES (@library_id, @item_key, @item_type, @parent_key, @title, @metadata, @file_path, @scanned_at) + INSERT INTO media_items (library_id, item_key, item_type, parent_key, title, metadata, file_path, fingerprint, scanned_at) + VALUES (@library_id, @item_key, @item_type, @parent_key, @title, @metadata, @file_path, @fingerprint, @scanned_at) ON CONFLICT(item_key) DO UPDATE SET - parent_key = excluded.parent_key, - title = excluded.title, - metadata = excluded.metadata, - file_path = excluded.file_path, - scanned_at = excluded.scanned_at + parent_key = excluded.parent_key, + title = excluded.title, + metadata = excluded.metadata, + file_path = excluded.file_path, + fingerprint = excluded.fingerprint, + scanned_at = excluded.scanned_at `) let gameCount = 0 for (const item of items) { if 
('games' in item) { - // GameSeries const series = item as GameSeries const seriesKey = `${library.id}:game_series:${series.id}` upsertGame.run({ @@ -289,6 +342,7 @@ async function scanGames(library: Library, libraryRoot: string): Promise { wideCoverUrl: series.wideCoverUrl, }), file_path: null, + fingerprint: null, scanned_at: now, }) @@ -310,6 +364,7 @@ async function scanGames(library: Library, libraryRoot: string): Promise { wideCoverUrl: game.wideCoverUrl, }), file_path: null, + fingerprint: null, scanned_at: now, }) @@ -319,7 +374,6 @@ async function scanGames(library: Library, libraryRoot: string): Promise { gameCount++ } } else { - // Standalone Game const game = item as Game const gameKey = `${library.id}:game:${game.id}` upsertGame.run({ @@ -333,6 +387,7 @@ async function scanGames(library: Library, libraryRoot: string): Promise { wideCoverUrl: game.wideCoverUrl, }), file_path: null, + fingerprint: null, scanned_at: now, }) @@ -355,18 +410,9 @@ async function scanMixed(library: Library, libraryRoot: string): Promise { const db = getDb() const now = Date.now() - clearLibraryItems(db, library.id) - - const upsert = db.prepare(` - INSERT INTO media_items (library_id, item_key, item_type, title, file_path, scanned_at) - VALUES (@library_id, @item_key, @item_type, @title, @file_path, @scanned_at) - ON CONFLICT(item_key) DO UPDATE SET - title = excluded.title, - file_path = excluded.file_path, - scanned_at = excluded.scanned_at - `) - - let fileCount = 0 + // Collect all new items with fingerprints + type MixedEntry = { fingerprint: string | null; relPath: string; title: string } + const newItems = new Map() function walk(absDir: string, relDir: string): void { let dirents: import('fs').Dirent[] @@ -382,34 +428,58 @@ async function scanMixed(library: Library, libraryRoot: string): Promise { if (d.isDirectory()) { walk(path.join(absDir, name), relPath) } else { - const title = path.basename(name, path.extname(name)) - upsert.run({ - library_id: library.id, - 
item_key: `${library.id}:mixed_file:${encodeURIComponent(relPath)}`, - item_type: 'mixed_file', - title, - file_path: relPath, - scanned_at: now, + const itemKey = `${library.id}:mixed_file:${encodeURIComponent(relPath)}` + const absPath = path.join(absDir, name) + const fingerprint = computeFingerprint(absPath) + newItems.set(itemKey, { + fingerprint, + relPath, + title: path.basename(name, path.extname(name)), }) - fileCount++ - - const ext = path.extname(name).toLowerCase() - let mediaType: 'image' | 'video' | null = null - if (IMAGE_EXTENSIONS.has(ext)) mediaType = 'image' - else if (VIDEO_EXTENSIONS.has(ext)) mediaType = 'video' - if (mediaType) { - const absPath = path.join(absDir, name) - getThumbnailPath(absPath, library.id, mediaType).catch((err) => { - console.warn(`[scanner] Could not generate thumbnail for ${relPath}:`, err instanceof Error ? err.message : err) - }) - } } } } walk(libraryRoot, '') - console.log(`[scanner] mixed: indexed ${fileCount} files, pre-generating thumbnails`) + // Detect moves + reconcile + const moves = detectMoves(db, library.id, newItems) + reconcileAndPrune(db, library.id, new Set(newItems.keys()), moves) + + const upsert = db.prepare(` + INSERT INTO media_items (library_id, item_key, item_type, title, file_path, fingerprint, scanned_at) + VALUES (@library_id, @item_key, @item_type, @title, @file_path, @fingerprint, @scanned_at) + ON CONFLICT(item_key) DO UPDATE SET + title = excluded.title, + file_path = excluded.file_path, + fingerprint = excluded.fingerprint, + scanned_at = excluded.scanned_at + `) + + for (const [itemKey, { fingerprint, relPath, title }] of newItems) { + upsert.run({ + library_id: library.id, + item_key: itemKey, + item_type: 'mixed_file', + title, + file_path: relPath, + fingerprint, + scanned_at: now, + }) + + const ext = path.extname(relPath).toLowerCase() + let mediaType: 'image' | 'video' | null = null + if (IMAGE_EXTENSIONS.has(ext)) mediaType = 'image' + else if (VIDEO_EXTENSIONS.has(ext)) 
/**
 * Finds items that were moved or renamed since the previous scan.
 *
 * A move is a stored row whose key is absent from the new scan ("vanished")
 * paired with a scanned item whose key has no stored row yet ("appeared") and
 * whose fingerprint is identical. Each vanished row is paired at most once, so
 * duplicate files with the same fingerprint never produce two renames of the
 * same row.
 *
 * Not treated as moves:
 * - items without a fingerprint;
 * - a fingerprint whose old key is still present in the new scan (the file
 *   was copied, not moved);
 * - a new key that already has a stored row (the file at that key was
 *   replaced; the regular upsert overwrites it). Renaming onto such a key
 *   would collide with the existing row.
 *
 * @param db - Database handle.
 * @param libraryId - Library whose stored rows are compared.
 * @param newItems - Items found by the current scan, keyed by item_key.
 * @returns `{ oldKey, newKey }` pairs, in the iteration order of `newItems`.
 */
function detectMoves(
  db: Database.Database,
  libraryId: string,
  newItems: Map<string, { fingerprint: string | null }>
): Array<{ oldKey: string; newKey: string }> {
  const existing = db
    .prepare('SELECT item_key, fingerprint FROM media_items WHERE library_id = ?')
    .all(libraryId) as Array<{ item_key: string; fingerprint: string | null }>

  const existingKeys = new Set<string>()
  const vanishedByFingerprint = new Map<string, string[]>()
  for (const { item_key, fingerprint } of existing) {
    existingKeys.add(item_key)
    if (!fingerprint || newItems.has(item_key)) continue
    const bucket = vanishedByFingerprint.get(fingerprint)
    if (bucket) {
      bucket.push(item_key)
    } else {
      vanishedByFingerprint.set(fingerprint, [item_key])
    }
  }

  const moves: Array<{ oldKey: string; newKey: string }> = []
  for (const [newKey, { fingerprint }] of newItems) {
    if (!fingerprint || existingKeys.has(newKey)) continue
    const candidates = vanishedByFingerprint.get(fingerprint)
    if (!candidates || candidates.length === 0) continue
    // Consume the candidate so it cannot be matched to a second new key.
    const oldKey = candidates.shift() as string
    moves.push({ oldKey, newKey })
  }
  return moves
}

/**
 * Applies detected moves to the DB (renames item_key and updates media_tags),
 * then deletes any rows for this library whose item_key is not in newKeys.
 * Tags on deleted items are intentionally left as orphans — harmless and
 * recoverable if the file reappears.
 *
 * A move whose destination key already has a row is skipped with a warning
 * instead of being applied: the scan's upserts use `ON CONFLICT(item_key)`,
 * so the key is unique and the rename would fail and abort the scan.
 *
 * NOTE(review): the rename and the tag re-key are separate statements. If
 * getDb() returns one shared connection — TODO confirm — wrapping this
 * function in a transaction would make each move atomic.
 *
 * @param db - Database handle.
 * @param libraryId - Library being reconciled.
 * @param newKeys - Every item_key produced by the current scan.
 * @param moves - Renames to apply before pruning.
 */
function reconcileAndPrune(
  db: Database.Database,
  libraryId: string,
  newKeys: Set<string>,
  moves: Array<{ oldKey: string; newKey: string }>
): void {
  const findByKey = db.prepare('SELECT 1 FROM media_items WHERE item_key = ?')
  const renameItem = db.prepare('UPDATE media_items SET item_key = ? WHERE item_key = ?')

  for (const { oldKey, newKey } of moves) {
    if (oldKey === newKey) continue
    if (findByKey.get(newKey) !== undefined) {
      console.warn(`[scanner] fingerprint match skipped: "${newKey}" already exists (old key "${oldKey}")`)
      continue
    }
    renameItem.run(newKey, oldKey)
    reKeyMediaItem(oldKey, newKey)
    console.log(`[scanner] fingerprint match: renamed "${oldKey}" → "${newKey}"`)
  }

  const existing = db
    .prepare('SELECT item_key FROM media_items WHERE library_id = ?')
    .all(libraryId) as Array<{ item_key: string }>

  const deleteItem = db.prepare('DELETE FROM media_items WHERE item_key = ?')
  for (const { item_key } of existing) {
    if (!newKeys.has(item_key)) {
      deleteItem.run(item_key)
    }
  }
}
/**
 * Re-points every tag assignment stored under `oldKey` so that it is stored
 * under `newKey` instead.
 *
 * @param oldKey - The media key the assignments currently use.
 * @param newKey - The media key the assignments should use afterwards.
 */
export function reKeyMediaItem(oldKey: string, newKey: string): void {
  const db = getDb()
  const retag = db.prepare('UPDATE media_tags SET media_key = ? WHERE media_key = ?')
  retag.run(newKey, oldKey)
}