Merge pull request 'fix blocking during scans' (#34 ) from large-library-fix into main

Reviewed-on: http://gitea.lan/gpatti/MediaLore/pulls/34
fix blocking during scans
2026-04-20 13:12:56 +00:00 · 2026-04-20 09:11:14 -04:00
5 changed files with 346 additions and 109 deletions
--- a/src/lib/comic-info.ts
+++ b/src/lib/comic-info.ts
@@ -1,6 +1,7 @@
 import AdmZip from 'adm-zip'
 import { XMLParser } from 'fast-xml-parser'
 import type { ComicInfoData } from '@/types'
+import { findZipEntry, extractZipEntry } from './zip-utils'

 const parser = new XMLParser()

@@ -70,3 +71,50 @@ export function parseComicInfo(absoluteCbzPath: string): ComicInfoData | null {
    web: toString(info.Web),
  }
 }
+
+/**
+ * Non-blocking counterpart of parseComicInfo.
+ *
+ * Locates the archive entry matching 'comicinfo.xml' via findZipEntry, pulls
+ * out just that entry with extractZipEntry, and parses its UTF-8 text.
+ *
+ * @param absoluteCbzPath Path of the CBZ archive to inspect.
+ * @returns The parsed metadata, or null when the entry is not found, cannot be
+ *          extracted, does not parse, or any step throws.
+ */
+export async function parseComicInfoAsync(absoluteCbzPath: string): Promise<ComicInfoData | null> {
+  try {
+    const xmlEntry = await findZipEntry(absoluteCbzPath, 'comicinfo.xml')
+    if (xmlEntry) {
+      const xmlBytes = await extractZipEntry(absoluteCbzPath, xmlEntry)
+      if (xmlBytes) {
+        return parseXml(xmlBytes.toString('utf-8'))
+      }
+    }
+    return null
+  } catch {
+    return null
+  }
+}
+
+/**
+ * Turn the text of a ComicInfo.xml document into a ComicInfoData record.
+ *
+ * @param xml Raw XML text.
+ * @returns The extracted fields, or null when the parser throws or none of the
+ *          ComicInfo / ComicInfoXml / comicinfo root elements is present.
+ */
+function parseXml(xml: string): ComicInfoData | null {
+  let parsed: Record<string, unknown>
+  try {
+    parsed = parser.parse(xml) as Record<string, unknown>
+  } catch {
+    return null
+  }
+
+  // Accept any of the root-element spellings checked below.
+  const root = (parsed.ComicInfo ?? parsed.ComicInfoXml ?? parsed.comicinfo) as Record<string, unknown> | undefined
+  if (!root) return null
+
+  // Tags arrive as a single comma-separated string: split, trim, drop empties.
+  const tagList: string[] = []
+  const tagText = toString(root.Tags)
+  if (tagText) {
+    for (const piece of tagText.split(',')) {
+      const trimmed = piece.trim()
+      if (trimmed) tagList.push(trimmed)
+    }
+  }
+
+  return {
+    title: toString(root.Title),
+    year: toNumber(root.Year),
+    month: toNumber(root.Month),
+    day: toNumber(root.Day),
+    writer: toString(root.Writer),
+    translator: toString(root.Translator),
+    publisher: toString(root.Publisher),
+    genre: toString(root.Genre),
+    tags: tagList,
+    web: toString(root.Web),
+  }
+}
--- a/src/lib/comic-metadata.ts
+++ b/src/lib/comic-metadata.ts
@@ -3,7 +3,8 @@ import crypto from 'crypto'
 import type { Library, ImportedTag, TagMapping } from '@/types'
 import { getDb } from './db'
 import { resolveLibraryRoot } from './libraries'
-import { parseComicInfo } from './comic-info'
+import { parseComicInfoAsync } from './comic-info'
+import { mapConcurrent } from './zip-utils'

 // ─── Metadata Import ──────────────────────────────────────────────────────────

@@ -13,7 +14,7 @@ import { parseComicInfo } from './comic-info'
 * - For each tag: if a mapping exists, assigns the real tag; otherwise creates
 *   an imported tag entry.
 */
-export function importComicMetadata(library: Library): void {
+export async function importComicMetadata(library: Library): Promise<void> {
  const db = getDb()
  const libraryRoot = resolveLibraryRoot(library)

@@ -56,13 +57,25 @@ export function importComicMetadata(library: Library): void {

  let importedCount = 0

+  // Process in batches: async file reads (10 concurrent) followed by batch DB writes,
+  // with an event-loop yield between batches to keep the app responsive.
+  const BATCH_SIZE = 50
+  for (let i = 0; i < issues.length; i += BATCH_SIZE) {
+    const batch = issues.slice(i, i + BATCH_SIZE)
+
+    // Async: read ComicInfo.xml from each archive concurrently (10 at a time).
+    // Uses async ZIP central-directory reader — no full-file reads.
+    const infos = await mapConcurrent(batch, 10, (issue) =>
+      parseComicInfoAsync(path.join(libraryRoot, issue.file_path))
+    )
+
+    // Sync: write this batch to the DB in one transaction.
    db.transaction(() => {
-    for (const issue of issues) {
-      const absPath = path.join(libraryRoot, issue.file_path)
-      const info = parseComicInfo(absPath)
+      for (let j = 0; j < batch.length; j++) {
+        const issue = batch[j]
+        const info = infos[j]
        if (!info) continue

-      // Merge with existing metadata JSON (preserve pageCount, coverUrl, etc.)
        const existingMeta = issue.metadata ? JSON.parse(issue.metadata) : {}
        const mergedMeta = {
          ...existingMeta,
@@ -82,14 +95,11 @@ export function importComicMetadata(library: Library): void {
          metadata: JSON.stringify(mergedMeta),
        })

-      // Process tags
        for (const tagName of info.tags) {
          const mappedTagId = mappings.get(tagName)
          if (mappedTagId) {
-          // Mapping exists — assign the real tag
            addMediaTag.run(issue.item_key, mappedTagId)
          } else {
-          // No mapping — create imported tag
            const importedTagId = crypto.randomUUID()
            const row = upsertImportedTag.get({
              id: importedTagId,
@@ -104,6 +114,9 @@ export function importComicMetadata(library: Library): void {
      }
    })()

+    await new Promise<void>((r) => setImmediate(r))
+  }
+
  console.log(`[comic-metadata] Imported metadata for ${importedCount}/${issues.length} issues in "${library.name}"`)
 }

--- a/src/lib/comics.ts
+++ b/src/lib/comics.ts
@@ -4,6 +4,7 @@ import AdmZip from 'adm-zip'
 import type { ComicIssue, ComicSeries } from '@/types'
 import { getDb } from './db'
 import { HIDDEN_FILES, thumbnailApiUrl } from './media-utils'
+import { countZipImages, mapConcurrent } from './zip-utils'

 const CBZ_EXTENSIONS = new Set(['.cbz'])
 const CBZ_IMAGE_EXTENSIONS = new Set(['.jpg', '.jpeg', '.png', '.webp', '.gif'])
@@ -23,52 +24,22 @@ function parseIssueNumber(filename: string): number | null {
  return parseInt(matches[matches.length - 1], 10)
 }

-function getPageCount(absoluteCbzPath: string): number {
-  try {
-    const zip = new AdmZip(absoluteCbzPath)
-    return zip
-      .getEntries()
-      .filter(
-        (e) =>
-          !e.isDirectory &&
-          CBZ_IMAGE_EXTENSIONS.has(path.extname(e.entryName).toLowerCase())
-      ).length
-  } catch {
-    return 0
-  }
-}
-
-function buildIssue(
-  absFilePath: string,
-  filename: string,
-  filePath: string,
-  libraryId: string,
-  isStandalone: boolean
-): ComicIssue {
-  const title = path.basename(filename, path.extname(filename))
-  const issueNumber = parseIssueNumber(filename)
-  const pageCount = getPageCount(absFilePath)
-  const coverUrl = thumbnailApiUrl(libraryId, filePath)
-
-  return {
-    id: encodeURIComponent(filePath),
-    title,
-    issueNumber,
-    pageCount,
-    coverUrl,
-    filePath,
-    isStandalone,
-  }
-}
-
 export interface ScannedComicSeries extends ComicSeries {
  issues: ComicIssue[]
 }

-export function scanComicsLibrary(
+/** One CBZ file found during the directory-listing phase of a library scan. */
+interface CollectedCbz {
+  /** Path of the archive on disk; this is what gets passed to countZipImages. */
+  absPath: string
+  /** Name of the file itself; the issue title and issue number are derived from it. */
+  filename: string
+  /** Path used for the issue id, its filePath, and its cover thumbnail URL. */
+  relPath: string
+  /** True for CBZ files found directly among the library root's entries. */
+  isStandalone: boolean
+  /** Name of the containing directory for series issues; null for standalone files. */
+  seriesDirName: string | null
+}
+
+export async function scanComicsLibrary(
  libraryRoot: string,
  libraryId: string
-): (ComicIssue | ScannedComicSeries)[] {
+): Promise<(ComicIssue | ScannedComicSeries)[]> {
  let topEntries: fs.Dirent[]
  try {
    topEntries = fs.readdirSync(libraryRoot, { withFileTypes: true })
@@ -76,15 +47,20 @@ export function scanComicsLibrary(
    return []
  }

-  const results: (ComicIssue | ScannedComicSeries)[] = []
+  // Phase 1: Collect all CBZ paths via fast directory listing (no archive opens).
+  const collected: CollectedCbz[] = []

  for (const entry of topEntries) {
    if (HIDDEN_FILES.test(entry.name)) continue

    if (entry.isFile() && isCbzFile(entry.name)) {
-      // Standalone one-shot comic
-      const absPath = path.join(libraryRoot, entry.name)
-      results.push(buildIssue(absPath, entry.name, entry.name, libraryId, true))
+      collected.push({
+        absPath: path.join(libraryRoot, entry.name),
+        filename: entry.name,
+        relPath: entry.name,
+        isStandalone: true,
+        seriesDirName: null,
+      })
      continue
    }

@@ -97,32 +73,70 @@ export function scanComicsLibrary(
        continue
      }

-      const cbzFiles = subEntries.filter(
-        (e) => e.isFile() && isCbzFile(e.name) && !HIDDEN_FILES.test(e.name)
-      )
+      const cbzFiles = subEntries
+        .filter((e) => e.isFile() && isCbzFile(e.name) && !HIDDEN_FILES.test(e.name))
+        .sort((a, b) => naturalCompare(a.name, b.name))

      if (cbzFiles.length === 0) continue

-      // It's a series
-      const issues: ComicIssue[] = cbzFiles
-        .sort((a, b) => naturalCompare(a.name, b.name))
-        .map((f) => {
-          const relPath = path.join(entry.name, f.name)
-          return buildIssue(path.join(dirAbsPath, f.name), f.name, relPath, libraryId, false)
-        })
-
-      const seriesCoverUrl = issues[0]?.coverUrl ?? null
-
-      results.push({
-        id: encodeURIComponent(entry.name),
-        title: entry.name,
-        coverUrl: seriesCoverUrl,
-        issueCount: issues.length,
-        issues,
+      for (const f of cbzFiles) {
+        collected.push({
+          absPath: path.join(dirAbsPath, f.name),
+          filename: f.name,
+          relPath: path.join(entry.name, f.name),
+          isStandalone: false,
+          seriesDirName: entry.name,
        })
      }
    }
+  }

+  // Phase 2: Count pages for all CBZ files concurrently (10 at a time) by reading
+  // only each archive's central directory — no full-file reads.
+  const pageCounts = await mapConcurrent(collected, 10, (c) =>
+    countZipImages(c.absPath, CBZ_IMAGE_EXTENSIONS)
+  )
+
+  // Phase 3: Build the result array from collected metadata + page counts.
+  const seriesMap = new Map<string, ScannedComicSeries>()
+  const standaloneIssues: ComicIssue[] = []
+
+  for (let i = 0; i < collected.length; i++) {
+    const c = collected[i]
+    const coverUrl = thumbnailApiUrl(libraryId, c.relPath)
+    const issue: ComicIssue = {
+      id: encodeURIComponent(c.relPath),
+      title: path.basename(c.filename, path.extname(c.filename)),
+      issueNumber: parseIssueNumber(c.filename),
+      pageCount: pageCounts[i],
+      coverUrl,
+      filePath: c.relPath,
+      isStandalone: c.isStandalone,
+    }
+
+    if (c.isStandalone) {
+      standaloneIssues.push(issue)
+    } else {
+      const key = c.seriesDirName!
+      if (!seriesMap.has(key)) {
+        seriesMap.set(key, {
+          id: encodeURIComponent(key),
+          title: key,
+          coverUrl,  // first issue (sorted) becomes the series cover
+          issueCount: 0,
+          issues: [],
+        })
+      }
+      const series = seriesMap.get(key)!
+      series.issues.push(issue)
+      series.issueCount++
+    }
+  }
+
+  const results: (ComicIssue | ScannedComicSeries)[] = [
+    ...Array.from(seriesMap.values()),
+    ...standaloneIssues,
+  ]
  return results.sort((a, b) => naturalCompare(a.title, b.title))
 }

--- a/src/lib/scanner.ts
+++ b/src/lib/scanner.ts
@@ -546,7 +546,7 @@ async function scanMixed(library: Library, libraryRoot: string): Promise<void> {
 // ---------------------------------------------------------------------------

 async function scanComics(library: Library, libraryRoot: string): Promise<void> {
-  const items = scanComicsLibrary(libraryRoot, library.id)
+  const items = await scanComicsLibrary(libraryRoot, library.id)
  const db = getDb()
  const now = Date.now()

@@ -678,7 +678,7 @@ async function scanComics(library: Library, libraryRoot: string): Promise<void>

  // Import ComicInfo.xml metadata (title, year, genres, tags)
  try {
-    importComicMetadata(library)
+    await importComicMetadata(library)
  } catch (err) {
    console.error(`[scanner]   Error importing comic metadata for "${library.name}":`, err)
  }
--- a/src/lib/zip-utils.ts
+++ b/src/lib/zip-utils.ts
@@ -0,0 +1,162 @@
+import { open } from 'fs/promises'
+import type { FileHandle } from 'fs/promises'
+import zlib from 'zlib'
+import { promisify } from 'util'
+
+// Promise-returning wrapper around zlib.inflateRaw, used for deflate (method 8) entries.
+const inflateRaw = promisify(zlib.inflateRaw)
+
+// 32-bit record signatures, compared against readUInt32LE results.
+// EOCD_SIG: searched for in the file tail to locate the end-of-central-directory record.
+const EOCD_SIG = 0x06054b50
+// CD_SIG: expected at the start of every central-directory record.
+const CD_SIG   = 0x02014b50
+// LFH_SIG: expected at the start of an entry's local file header.
+const LFH_SIG  = 0x04034b50
+
+/** Fields captured from one central-directory record of a ZIP archive. */
+export interface CdEntry {
+  /** Entry name, decoded from the record as UTF-8. */
+  name: string
+  /** Compression method code; extractZipEntry handles 0 (stored) and 8 (deflate). */
+  compressionMethod: number
+  /** Number of bytes of entry data to read from the archive. */
+  compressedSize: number
+  /** Size recorded for the entry's data after decompression. */
+  uncompressedSize: number
+  /** Byte offset of the entry's local file header within the archive file. */
+  localHeaderOffset: number
+}
+
+/**
+ * Read a ZIP file's central directory without loading the entire archive.
+ *
+ * Two reads are issued: the file tail (at most 65558 bytes) to locate the
+ * end-of-central-directory (EOCD) record, then the central directory itself at
+ * the offset and size that record reports.
+ *
+ * @param fd       Open handle on the archive.
+ * @param fileSize Size of the archive in bytes.
+ * @returns One CdEntry per central-directory record parsed. Returns [] when the
+ *          file is smaller than an EOCD record, no EOCD signature is found, the
+ *          directory is empty or lies outside the file, or a read comes back
+ *          short. Parsing stops at the first record with an unexpected signature
+ *          or whose filename would run past the end of the directory.
+ */
+async function readCentralDirectory(fd: FileHandle, fileSize: number): Promise<CdEntry[]> {
+  if (fileSize < 22) return []
+
+  // The EOCD record is within the last 65558 bytes (22-byte record + 65535-byte max comment).
+  const tailLen = Math.min(65558, fileSize)
+  const tailBuf = Buffer.allocUnsafe(tailLen)
+  const tailRead = await fd.read(tailBuf, 0, tailLen, fileSize - tailLen)
+  // allocUnsafe memory is uninitialised: never scan bytes the read did not fill.
+  if (tailRead.bytesRead < tailLen) return []
+
+  // Scan backwards for the EOCD signature.
+  let eocdOff = -1
+  for (let i = tailLen - 22; i >= 0; i--) {
+    if (tailBuf.readUInt32LE(i) === EOCD_SIG) { eocdOff = i; break }
+  }
+  if (eocdOff === -1) return []
+
+  const entryCount = tailBuf.readUInt16LE(eocdOff + 10)
+  const cdSize     = tailBuf.readUInt32LE(eocdOff + 12)
+  const cdOffset   = tailBuf.readUInt32LE(eocdOff + 16)
+  if (cdOffset + cdSize > fileSize || cdSize === 0) return []
+
+  const cdBuf = Buffer.allocUnsafe(cdSize)
+  const cdRead = await fd.read(cdBuf, 0, cdSize, cdOffset)
+  // Same reasoning as above: a short read would leave garbage to be parsed as records.
+  if (cdRead.bytesRead < cdSize) return []
+
+  const entries: CdEntry[] = []
+  let pos = 0
+  // Each record has a 46-byte fixed part followed by filename, extra field and comment.
+  for (let i = 0; i < entryCount && pos + 46 <= cdBuf.length; i++) {
+    if (cdBuf.readUInt32LE(pos) !== CD_SIG) break
+    const compressionMethod  = cdBuf.readUInt16LE(pos + 10)
+    const compressedSize     = cdBuf.readUInt32LE(pos + 20)
+    const uncompressedSize   = cdBuf.readUInt32LE(pos + 24)
+    const filenameLen        = cdBuf.readUInt16LE(pos + 28)
+    const extraLen           = cdBuf.readUInt16LE(pos + 30)
+    const commentLen         = cdBuf.readUInt16LE(pos + 32)
+    const localHeaderOffset  = cdBuf.readUInt32LE(pos + 42)
+    // Buffer#toString clamps out-of-range ends, which would yield a silently
+    // truncated name; treat a record that overruns the directory as the end.
+    if (pos + 46 + filenameLen > cdBuf.length) break
+    const name = cdBuf.toString('utf8', pos + 46, pos + 46 + filenameLen)
+    entries.push({ name, compressionMethod, compressedSize, uncompressedSize, localHeaderOffset })
+    pos += 46 + filenameLen + extraLen + commentLen
+  }
+  return entries
+}
+
+/**
+ * Count the image entries of a ZIP/CBZ archive using only its central
+ * directory, so the archive body is never read.
+ *
+ * @param absolutePath    Path of the archive.
+ * @param imageExtensions Lower-case extensions (leading dot included) that count as images.
+ * @returns Number of non-directory entries whose extension is in the set;
+ *          0 when the archive cannot be opened or read.
+ */
+export async function countZipImages(
+  absolutePath: string,
+  imageExtensions: Set<string>
+): Promise<number> {
+  let handle: FileHandle | null = null
+  try {
+    handle = await open(absolutePath, 'r')
+    const stats = await handle.stat()
+    const records = await readCentralDirectory(handle, stats.size)
+
+    let imageCount = 0
+    for (const record of records) {
+      const entryName = record.name
+      // Names ending in '/' are directory entries.
+      if (entryName.endsWith('/')) continue
+      const dotIndex = entryName.lastIndexOf('.')
+      if (dotIndex === -1) continue
+      if (imageExtensions.has(entryName.slice(dotIndex).toLowerCase())) imageCount++
+    }
+    return imageCount
+  } catch {
+    return 0
+  } finally {
+    await handle?.close()
+  }
+}
+
+/**
+ * Extract the raw bytes of a specific entry from a ZIP archive.
+ * Reads only the local file header + compressed data for that entry.
+ * Supports stored (method 0) and deflate (method 8).
+ *
+ * @param absolutePath Path of the archive.
+ * @param entry        Central-directory record describing the entry to extract.
+ * @returns The entry's uncompressed bytes, or null when the compression method
+ *          is unsupported, the local header signature does not match, the
+ *          archive is too short to hold the header or data, or any I/O or
+ *          inflate step throws.
+ */
+export async function extractZipEntry(absolutePath: string, entry: CdEntry): Promise<Buffer | null> {
+  // Reject unsupported methods up front rather than after reading their data.
+  if (entry.compressionMethod !== 0 && entry.compressionMethod !== 8) return null
+
+  let fd: FileHandle | null = null
+  try {
+    fd = await open(absolutePath, 'r')
+
+    // Read local file header (30 bytes) to get exact data offset.
+    const lfhBuf = Buffer.allocUnsafe(30)
+    const headerRead = await fd.read(lfhBuf, 0, 30, entry.localHeaderOffset)
+    // The buffer is uninitialised; a short read must not be interpreted as a header.
+    if (headerRead.bytesRead < 30) return null
+    if (lfhBuf.readUInt32LE(0) !== LFH_SIG) return null
+    // The local header's own filename/extra lengths decide where the data starts.
+    const localFilenameLen = lfhBuf.readUInt16LE(26)
+    const localExtraLen    = lfhBuf.readUInt16LE(28)
+    const dataOffset = entry.localHeaderOffset + 30 + localFilenameLen + localExtraLen
+
+    const compressedBuf = Buffer.allocUnsafe(entry.compressedSize)
+    const dataRead = await fd.read(compressedBuf, 0, entry.compressedSize, dataOffset)
+    // A truncated archive would otherwise hand back uninitialised memory as entry data.
+    if (dataRead.bytesRead < entry.compressedSize) return null
+
+    if (entry.compressionMethod === 0) return compressedBuf
+    return await inflateRaw(compressedBuf) as Buffer
+  } catch {
+    return null
+  } finally {
+    await fd?.close()
+  }
+}
+
+/**
+ * Look up an entry by name in a ZIP archive's central directory, ignoring case.
+ * An entry matches when its whole name equals entryName or when its name ends
+ * with '/' followed by entryName (i.e. the file sits inside a folder).
+ *
+ * @param absolutePath Path of the archive.
+ * @param entryName    Name to search for.
+ * @returns The first matching record, or null if none matches or an error occurs.
+ */
+export async function findZipEntry(absolutePath: string, entryName: string): Promise<CdEntry | null> {
+  let handle: FileHandle | null = null
+  try {
+    handle = await open(absolutePath, 'r')
+    const stats = await handle.stat()
+    const records = await readCentralDirectory(handle, stats.size)
+
+    const wanted = entryName.toLowerCase()
+    const nestedSuffix = '/' + wanted
+    for (const record of records) {
+      const candidate = record.name.toLowerCase()
+      if (candidate === wanted || candidate.endsWith(nestedSuffix)) return record
+    }
+    return null
+  } catch {
+    return null
+  } finally {
+    await handle?.close()
+  }
+}
+
+/**
+ * Process an array of items concurrently with a concurrency limit.
+ * Preserves index order in results.
+ *
+ * @param items Inputs to process.
+ * @param limit Maximum number of fn calls in flight at once. Values below 1
+ *              (including NaN) are treated as 1 so every item is still processed.
+ * @param fn    Async mapper invoked once per item.
+ * @returns Results in the same order as items. Rejects if any fn call rejects.
+ */
+export async function mapConcurrent<T, U>(
+  items: T[],
+  limit: number,
+  fn: (item: T) => Promise<U>
+): Promise<U[]> {
+  const results: U[] = new Array(items.length)
+  // A zero, negative or NaN limit would start no workers and resolve with an
+  // array of empty slots; fall back to a single worker instead.
+  const workerCount = Math.min(items.length, limit >= 1 ? Math.floor(limit) : 1)
+  let next = 0
+  // Each worker repeatedly claims the next unprocessed index until none remain.
+  async function worker(): Promise<void> {
+    while (next < items.length) {
+      const i = next++
+      results[i] = await fn(items[i])
+    }
+  }
+  await Promise.all(Array.from({ length: workerCount }, worker))
+  return results
+}
Author	SHA1	Message	Date
gpatti	7d2ae7e95c	Merge pull request 'fix blocking during scans' (#34 ) from large-library-fix into main All checks were successful Build and Push Docker Image / build (push) Successful in 1m4s Details Reviewed-on: http://gitea.lan/gpatti/MediaLore/pulls/34	2026-04-20 13:12:56 +00:00
Garret Patti	cedc012733	fix blocking during scans	2026-04-20 09:11:14 -04:00