"""Filesystem scanner that reconciles a library directory with MediaItem rows."""

import asyncio
import hashlib
import logging
import os
from datetime import datetime, timezone
from pathlib import Path

from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select

from app.models import MediaItem
from app.database import SessionLocal

log = logging.getLogger(__name__)

# Ids of libraries that currently have a background scan in flight.
_scanning: set[int] = set()


def is_scanning(library_id: int) -> bool:
    """Return True while a background scan is running for *library_id*."""
    return library_id in _scanning


IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".avif", ".heic"}
VIDEO_EXTENSIONS = {".mp4", ".mkv", ".mov", ".avi", ".webm", ".m4v", ".flv", ".wmv", ".ts"}


def classify(path: Path) -> str | None:
    """Return ``"image"`` or ``"video"`` based on the file suffix, else None.

    The suffix comparison is case-insensitive.
    """
    ext = path.suffix.lower()
    if ext in IMAGE_EXTENSIONS:
        return "image"
    if ext in VIDEO_EXTENSIONS:
        return "video"
    return None


def hash_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 64 KiB chunks so large media files are never loaded
    into memory at once.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            h.update(chunk)
    return h.hexdigest()


def _utcnow() -> datetime:
    """Return the current UTC time as a naive datetime.

    Produces the same kind of value as the deprecated ``datetime.utcnow()``
    so timestamps written by this module stay naive.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def _hash_and_size(path: Path) -> tuple[str, int]:
    """Return ``(sha256_hex, size_in_bytes)`` for *path* (blocking I/O)."""
    return hash_file(path), path.stat().st_size


def _list_media_files(library_path: str) -> tuple[list[tuple[str, Path, str]], int]:
    """Walk *library_path* and collect every media file (blocking I/O).

    Hidden directories and hidden files (names starting with ".") are
    skipped, and entries are visited in sorted order.

    Returns:
        ``(files, total_dirs)`` where each file is
        ``(path relative to the library root, full path, media type)`` and
        ``total_dirs`` is the number of directories visited.
    """
    root = Path(library_path)
    files: list[tuple[str, Path, str]] = []
    total_dirs = 0
    for dirpath, dirnames, filenames in os.walk(library_path):
        # Prune hidden directories in place so os.walk does not descend into them.
        dirnames[:] = sorted(d for d in dirnames if not d.startswith("."))
        current_dir = Path(dirpath)
        rel_dir = str(current_dir.relative_to(root)) if current_dir != root else "."
        found_in_dir = 0
        for filename in sorted(f for f in filenames if not f.startswith(".")):
            file_path = current_dir / filename
            media_type = classify(file_path)
            if media_type is None:
                continue
            files.append((str(file_path.relative_to(root)), file_path, media_type))
            found_in_dir += 1
        log.info("Scanned directory %s — %d media file(s) found", rel_dir, found_in_dir)
        total_dirs += 1
    return files, total_dirs


async def scan_library_background(library_id: int, library_path: str) -> None:
    """Run a full library scan in a fresh session.

    Safe to call as a background task: any exception is logged rather than
    propagated, and the in-progress flag reported by ``is_scanning`` is
    always cleared when the scan ends. If a scan for the same library is
    already running, the call is skipped so two scans cannot insert the same
    new files or clear each other's in-progress flag.
    """
    if library_id in _scanning:
        log.warning("Scan already in progress for library %d; skipping", library_id)
        return
    _scanning.add(library_id)
    try:
        async with SessionLocal() as db:
            await _do_scan(library_id, library_path, db)
    except Exception:
        log.exception("Scan failed for library %d at %s", library_id, library_path)
    finally:
        _scanning.discard(library_id)


async def scan_library(library_id: int, library_path: str, db: AsyncSession) -> None:
    """Scan the library using the caller's session; exceptions propagate."""
    await _do_scan(library_id, library_path, db)


async def _do_scan(library_id: int, library_path: str, db: AsyncSession) -> None:
    """Reconcile the MediaItem rows of one library with the files on disk.

    Steps:
      1. Walk the library directory in a worker thread.
      2. Un-flag known rows whose file is present again.
      3. Flag rows whose file is gone as missing.
      4. For each unknown file, hash it and either re-link a missing row with
         the same hash (the file was moved/renamed) or insert a new row.
      5. Commit.

    Files that cannot be read are skipped with a warning instead of aborting
    the whole scan.
    """
    log.info("Starting scan for library %d at %s", library_id, library_path)

    existing = await db.execute(
        select(MediaItem).where(MediaItem.library_id == library_id)
    )
    db_items = {item.rel_path: item for item in existing.scalars().all()}

    loop = asyncio.get_running_loop()
    # The directory walk is blocking I/O; keep it off the event loop.
    media_files, total_dirs = await loop.run_in_executor(
        None, _list_media_files, library_path
    )

    seen_paths: set[str] = set()
    new_files: list[tuple[str, Path, str]] = []
    for rel, file_path, media_type in media_files:
        seen_paths.add(rel)
        item = db_items.get(rel)
        if item is None:
            new_files.append((rel, file_path, media_type))
        elif item.missing:
            item.missing = False
            item.updated_at = _utcnow()

    # Flag vanished files BEFORE matching new files by hash, so a file that
    # was moved since the previous scan is re-linked to its existing row
    # instead of producing a duplicate. Doing this first also guarantees a
    # re-linked row is not flagged as missing again under its old path.
    for rel_path, item in db_items.items():
        if rel_path not in seen_paths and not item.missing:
            item.missing = True
            item.updated_at = _utcnow()
    # Make the flags visible to _find_by_hash even if the session has
    # autoflush disabled.
    await db.flush()

    for rel, file_path, media_type in new_files:
        try:
            file_hash, size_bytes = await loop.run_in_executor(
                None, _hash_and_size, file_path
            )
        except OSError:
            log.warning("Skipping unreadable file %s", file_path, exc_info=True)
            seen_paths.discard(rel)  # not indexed, so keep it out of the summary
            continue

        moved = await _find_by_hash(library_id, file_hash, db)
        if moved:
            moved.rel_path = rel
            moved.filename = file_path.name
            moved.missing = False
            moved.updated_at = _utcnow()
            # Persist immediately so a second file with the same hash cannot
            # match this row again.
            await db.flush()
        else:
            db.add(MediaItem(
                library_id=library_id,
                rel_path=rel,
                filename=file_path.name,
                file_hash=file_hash,
                media_type=media_type,
                size_bytes=size_bytes,
                missing=False,
            ))

    await db.commit()
    log.info(
        "Scan complete for library %d — %d director%s, %d media file(s) indexed",
        library_id,
        total_dirs,
        "y" if total_dirs == 1 else "ies",
        len(seen_paths),
    )


async def _find_by_hash(library_id: int, file_hash: str, db: AsyncSession) -> MediaItem | None:
    """Return a row of this library flagged missing with the given hash, or None.

    Only rows currently marked missing are considered, so a file whose
    content duplicates a file that is still present never takes over that
    file's row.
    """
    result = await db.execute(
        select(MediaItem).where(
            MediaItem.library_id == library_id,
            MediaItem.file_hash == file_hash,
            MediaItem.missing == True,  # noqa: E712
        )
    )
    return result.scalars().first()