Files
MediaLore-Web-App/backend/app/services/scanner.py
2026-05-09 14:51:25 -04:00

135 lines
4.4 KiB
Python

import asyncio
import hashlib
import logging
import os
from pathlib import Path
from datetime import datetime
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
from app.models import MediaItem
from app.database import SessionLocal
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# IDs of libraries with a background scan in progress. An ID is added when
# scan_library_background starts and discarded when it finishes (success or
# failure); is_scanning reads this set.
_scanning: set[int] = set()
def is_scanning(library_id: int) -> bool:
    """Report whether ``library_id`` is currently registered in ``_scanning``."""
    active_ids = _scanning
    return library_id in active_ids
# Lower-case suffixes (leading dot included) that classify() reports as "image".
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".webp",
    ".bmp",
    ".tiff",
    ".avif",
    ".heic",
}
# Lower-case suffixes (leading dot included) that classify() reports as "video".
VIDEO_EXTENSIONS = {
    ".mp4",
    ".mkv",
    ".mov",
    ".avi",
    ".webm",
    ".m4v",
    ".flv",
    ".wmv",
    ".ts",
}
def classify(path: Path) -> str | None:
    """Map a path to its media kind based on the file suffix.

    The suffix is compared case-insensitively. Returns ``"image"`` or
    ``"video"`` for a recognised suffix and ``None`` for anything else.
    """
    suffix = path.suffix.lower()
    kinds = (("image", IMAGE_EXTENSIONS), ("video", VIDEO_EXTENSIONS))
    for kind, extensions in kinds:
        if suffix in extensions:
            return kind
    return None
def hash_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and consumed in 64 KiB reads, so the
    whole content is never held in memory at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while block := stream.read(65536):
            digest.update(block)
    return digest.hexdigest()
async def scan_library_background(library_id: int, library_path: str) -> None:
    """Scan a library inside its own session; intended for background tasks.

    The library id stays in ``_scanning`` for the duration of the scan and is
    removed afterwards whether or not the scan succeeded. Any ``Exception``
    raised while opening the session or scanning is logged with its traceback
    and not propagated to the caller.
    """
    _scanning.add(library_id)
    try:
        session_factory = SessionLocal
        async with session_factory() as session:
            await _do_scan(library_id, library_path, session)
    except Exception:
        log.exception("Scan failed for library %d at %s", library_id, library_path)
    finally:
        _scanning.discard(library_id)
async def scan_library(library_id: int, library_path: str, db: AsyncSession) -> None:
    """Scan a library using the session supplied by the caller.

    This is a direct pass-through to ``_do_scan``: exceptions propagate to the
    caller and the ``_scanning`` registry is not touched.
    """
    await _do_scan(library_id=library_id, library_path=library_path, db=db)
async def _do_scan(library_id: int, library_path: str, db: AsyncSession) -> None:
    """Synchronise the ``MediaItem`` rows of one library with the files on disk.

    Walks ``library_path`` (skipping dot-prefixed directories and files) and,
    for every file that ``classify`` recognises:

    * if its relative path is already indexed, clears the ``missing`` flag;
    * otherwise hashes the file and, when ``_find_by_hash`` returns a row
      flagged missing with the same hash, treats the file as moved and
      repoints that row at the new path;
    * otherwise adds a new ``MediaItem``.

    Rows whose current path was not seen during the walk are then flagged
    missing, and everything is committed once at the end.

    Args:
        library_id: ID of the library whose rows are reconciled.
        library_path: Root directory of the library on disk.
        db: Session used for all queries and for the final commit.

    A new file that cannot be hashed or stat-ed (``OSError``, e.g. it vanished
    mid-scan or is unreadable) is logged and skipped instead of aborting the
    whole scan.
    """
    root = Path(library_path)
    log.info("Starting scan for library %d at %s", library_id, library_path)

    existing = await db.execute(
        select(MediaItem).where(MediaItem.library_id == library_id)
    )
    # Keyed by the path each row had when it was loaded; move detection below
    # may change a row's rel_path without re-keying this dict.
    db_items = {item.rel_path: item for item in existing.scalars().all()}
    seen_paths: set[str] = set()
    loop = asyncio.get_running_loop()
    total_dirs = 0

    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12. It is
    # kept here because whether updated_at stores naive or aware datetimes is
    # not visible from this module.
    for dirpath, dirnames, filenames in os.walk(library_path):
        # In-place assignment prunes hidden directories from the walk itself.
        dirnames[:] = sorted(d for d in dirnames if not d.startswith("."))
        directory = Path(dirpath)
        rel_dir = str(directory.relative_to(root)) if directory != root else "."
        found_in_dir = 0

        for filename in sorted(f for f in filenames if not f.startswith(".")):
            file_path = directory / filename
            media_type = classify(file_path)
            if not media_type:
                continue
            rel = str(file_path.relative_to(root))

            item = db_items.get(rel)
            if item is not None:
                # Known path: only un-flag it if a previous scan marked it missing.
                if item.missing:
                    item.missing = False
                    item.updated_at = datetime.utcnow()
            else:
                try:
                    # Hashing reads the whole file, so keep it off the event loop.
                    file_hash = await loop.run_in_executor(None, hash_file, file_path)
                    size_bytes = file_path.stat().st_size
                except OSError:
                    log.warning("Skipping unreadable file %s", file_path, exc_info=True)
                    continue

                moved = await _find_by_hash(library_id, file_hash, db)
                if moved:
                    # Same content as a missing row: repoint it rather than
                    # creating a duplicate entry.
                    moved.rel_path = rel
                    moved.filename = file_path.name
                    moved.missing = False
                    moved.updated_at = datetime.utcnow()
                else:
                    db.add(MediaItem(
                        library_id=library_id,
                        rel_path=rel,
                        filename=file_path.name,
                        file_hash=file_hash,
                        media_type=media_type,
                        size_bytes=size_bytes,
                        missing=False,
                    ))

            seen_paths.add(rel)
            found_in_dir += 1

        log.info("Scanned directory %s: %d media file(s) found", rel_dir, found_in_dir)
        total_dirs += 1

    # Compare each row's *current* rel_path, not the key it was loaded under.
    # A row repointed by move detection still sits under its old key, and
    # checking that stale key would flag the row missing again right after it
    # was recovered.
    for item in db_items.values():
        if item.rel_path not in seen_paths and not item.missing:
            item.missing = True
            item.updated_at = datetime.utcnow()

    await db.commit()
    log.info(
        "Scan complete for library %d: %d director%s, %d media file(s) indexed",
        library_id, total_dirs, "y" if total_dirs == 1 else "ies", len(seen_paths),
    )
async def _find_by_hash(library_id: int, file_hash: str, db: AsyncSession) -> MediaItem | None:
    """Look up a missing item of a library by its content hash.

    Returns the first ``MediaItem`` in ``library_id`` whose ``file_hash``
    equals the given hash and whose ``missing`` flag is true, or ``None`` when
    the query yields no row.
    """
    criteria = (
        MediaItem.library_id == library_id,
        MediaItem.file_hash == file_hash,
        MediaItem.missing == True,  # noqa: E712
    )
    query = select(MediaItem).where(*criteria)
    rows = await db.execute(query)
    return rows.scalars().first()