build ocr

2026-03-28 01:59:13 -05:00
parent e1145b9448
commit 0e03cec842
14 changed files with 379 additions and 22 deletions
@@ -13,6 +13,7 @@
    "@fastify/static": "^7.0.4",
    "better-sqlite3": "^9.4.3",
    "fastify": "^4.27.0",
+    "node-tesseract-ocr": "^2.2.1",
    "sharp": "^0.33.4",
    "uuid": "^9.0.1"
  },
@@ -34,6 +34,7 @@ db.exec(`
    height        INTEGER NOT NULL,
    parent_id     TEXT REFERENCES memes(id) ON DELETE CASCADE,
    collection_id INTEGER REFERENCES collections(id) ON DELETE SET NULL,
+    ocr_text      TEXT,
    created_at    TEXT NOT NULL DEFAULT (datetime('now'))
  );

@@ -54,15 +55,22 @@ db.exec(`
  CREATE INDEX IF NOT EXISTS idx_meme_tags_tag_id ON meme_tags(tag_id);
 `);

-// Migration: add collection_id column if upgrading from earlier schema
-// Must run BEFORE creating the index on that column
+// Migrations — run after CREATE TABLE IF NOT EXISTS; each ALTER is skipped when the column already exists, so they only change DBs created with an older schema
 const memesCols = db.prepare('PRAGMA table_info(memes)').all() as { name: string }[];
+
 if (!memesCols.find((c) => c.name === 'collection_id')) {
  db.exec('ALTER TABLE memes ADD COLUMN collection_id INTEGER REFERENCES collections(id) ON DELETE SET NULL');
 }

-// Create index after the column is guaranteed to exist (handles both fresh and migrated DBs)
-db.exec('CREATE INDEX IF NOT EXISTS idx_memes_collection_id ON memes(collection_id)');
+// Databases created before OCR support lack the ocr_text column; add it in place.
+const hasOcrTextColumn = memesCols.some((col) => col.name === 'ocr_text');
+if (!hasOcrTextColumn) {
+  db.exec('ALTER TABLE memes ADD COLUMN ocr_text TEXT');
+}
+
+// These indexes reference migrated columns, so they are created only after the ALTERs above.
+// NOTE(review): text search matches ocr_text with LIKE '%…%' patterns — confirm the
+// query planner can actually use idx_memes_ocr for that query.
+db.exec(`
+  CREATE INDEX IF NOT EXISTS idx_memes_collection_id ON memes(collection_id);
+  CREATE INDEX IF NOT EXISTS idx_memes_ocr ON memes(ocr_text) WHERE ocr_text IS NOT NULL;
+`);

 // Seed the default UNSORTED collection
 const defaultCollection = db
@@ -8,6 +8,7 @@ import { memesRoutes } from './routes/memes.js';
 import { tagsRoutes } from './routes/tags.js';
 import { authRoutes } from './routes/auth.js';
 import { collectionsRoutes } from './routes/collections.js';
+import { adminRoutes } from './routes/admin.js';

 // Ensure data dirs exist
 ensureImagesDir();
@@ -41,6 +42,7 @@ await app.register(authRoutes);
 await app.register(collectionsRoutes);
 await app.register(memesRoutes);
 await app.register(tagsRoutes);
+await app.register(adminRoutes);

 // SPA fallback — serve index.html for all non-API, non-image routes
 app.setNotFoundHandler(async (req, reply) => {
@@ -0,0 +1,52 @@
+import type { FastifyInstance } from 'fastify';
+import db from '../db.js';
+import { requireAuth } from '../auth.js';
+import { extractText } from '../services/ocr.js';
+import type { Meme } from '../types.js';
+
+/**
+ * Registers the OCR maintenance endpoints. Both routes run `requireAuth`
+ * as a preHandler.
+ */
+export async function adminRoutes(app: FastifyInstance) {
+  /**
+   * POST /api/admin/reindex
+   * Runs OCR on every meme whose ocr_text is still NULL.
+   * Memes are processed one at a time, so OCR runs never overlap within a request.
+   * Responds once the whole batch is finished with
+   * `{ total, indexed, no_text_found }`.
+   */
+  app.post('/api/admin/reindex', { preHandler: requireAuth }, async () => {
+    const pending = db
+      .prepare('SELECT id, file_path, mime_type FROM memes WHERE ocr_text IS NULL')
+      .all() as Pick<Meme, 'id' | 'file_path' | 'mime_type'>[];
+
+    // Prepared once and reused for every row instead of re-preparing per iteration.
+    const saveOcrText = db.prepare('UPDATE memes SET ocr_text = ? WHERE id = ?');
+
+    let indexed = 0;
+    let noTextFound = 0;
+
+    for (const meme of pending) {
+      const text = await extractText(meme.file_path, meme.mime_type);
+      if (text) {
+        saveOcrText.run(text, meme.id);
+        indexed++;
+      } else {
+        // Store an empty string (not NULL) so this meme is not picked up again
+        // by subsequent reindex runs.
+        saveOcrText.run('', meme.id);
+        noTextFound++;
+      }
+    }
+
+    // Returning a plain object lets Fastify serialise it as JSON and set the
+    // Content-Type header itself, so reply.raw does not need to be touched.
+    return { total: pending.length, indexed, no_text_found: noTextFound };
+  });
+
+  /**
+   * GET /api/admin/reindex/status
+   * Returns `{ pending, indexed }`: memes whose ocr_text is still NULL, and
+   * memes with non-empty ocr_text. Memes stored with '' are in neither count.
+   */
+  app.get('/api/admin/reindex/status', { preHandler: requireAuth }, async () => {
+    const { pending } = db
+      .prepare('SELECT COUNT(*) as pending FROM memes WHERE ocr_text IS NULL')
+      .get() as { pending: number };
+    const { indexed } = db
+      .prepare("SELECT COUNT(*) as indexed FROM memes WHERE ocr_text IS NOT NULL AND ocr_text != ''")
+      .get() as { indexed: number };
+    return { pending, indexed };
+  });
+}
@@ -4,6 +4,7 @@ import { v4 as uuidv4 } from 'uuid';
 import db, { UNSORTED_ID } from '../db.js';
 import { buildFilePath, deleteFile, getExtension } from '../services/storage.js';
 import { extractMeta, resizeImage, saveBuffer } from '../services/image.js';
+import { extractText } from '../services/ocr.js';
 import { requireAuth } from '../auth.js';
 import type { ListQuery, UpdateBody, RescaleBody, MoveBody, Meme } from '../types.js';

@@ -72,8 +73,8 @@ export async function memesRoutes(app: FastifyInstance) {
    }

    if (q) {
-      conditions.push('(m.title LIKE ? OR m.description LIKE ?)');
-      params.push(`%${q}%`, `%${q}%`);
+      conditions.push('(m.title LIKE ? OR m.description LIKE ? OR m.ocr_text LIKE ?)');
+      params.push(`%${q}%`, `%${q}%`, `%${q}%`);
    }

    if (conditions.length) {
@@ -98,7 +99,7 @@ export async function memesRoutes(app: FastifyInstance) {
      countParams.push(tag.toLowerCase());
    }
    if (collection_id !== undefined) countParams.push(Number(collection_id));
-    if (q) countParams.push(`%${q}%`, `%${q}%`);
+    if (q) countParams.push(`%${q}%`, `%${q}%`, `%${q}%`);

    if (countConditions.length) countSql += ' WHERE ' + countConditions.join(' AND ');

@@ -159,6 +160,11 @@ export async function memesRoutes(app: FastifyInstance) {

    if (tagsRaw) setMemeTags(id, tagsRaw.split(','));

+    // Fire OCR in the background — doesn't block the upload response.
+    // The chain ends in .catch so that a failing DB write cannot surface as an
+    // unhandled promise rejection; `void` marks the promise as intentionally not awaited.
+    void extractText(filePath, mimeType)
+      .then((text) => {
+        if (text) db.prepare('UPDATE memes SET ocr_text = ? WHERE id = ?').run(text, id);
+      })
+      .catch((err: unknown) => {
+        console.warn(`Background OCR failed for meme ${id}:`, err instanceof Error ? err.message : err);
+      });
+
    return reply.status(201).send(getMemeById(id));
  });

@@ -0,0 +1,47 @@
+import tesseract from 'node-tesseract-ocr';
+import sharp from 'sharp';
+import fs from 'fs';
+import path from 'path';
+import { absolutePath } from './storage.js';
+
+// Options passed to tesseract.recognize() for every OCR run in this module.
+const OCR_CONFIG = {
+  lang: 'eng', // Recognition language
+  oem: 1,  // LSTM neural net mode — best accuracy
+  psm: 3,  // Fully automatic page segmentation (good for varied meme layouts)
+};
+
+/**
+ * Runs Tesseract OCR on a stored image and returns the recognised text as a
+ * single line with normalised whitespace.
+ *
+ * @param relPath  Path of the image as stored; resolved with absolutePath().
+ * @param mimeType MIME type of the image. 'image/gif' is converted to a
+ *                 temporary PNG before OCR.
+ * @returns The cleaned text, or '' when nothing usable was recognised or when
+ *          OCR failed. Failures are logged with console.warn rather than thrown.
+ */
+export async function extractText(relPath: string, mimeType: string): Promise<string> {
+  let tempPath: string | null = null;
+
+  try {
+    const srcAbs = absolutePath(relPath);
+    let inputPath = srcAbs;
+
+    // GIFs: render a single (non-animated) frame to a PNG and OCR that instead.
+    if (mimeType === 'image/gif') {
+      // Unique suffix so concurrent OCR runs on the same file (e.g. the upload's
+      // background OCR and an admin reindex) never share or delete each other's temp file.
+      const unique = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`;
+      tempPath = `${srcAbs}.${unique}.ocr_tmp.png`;
+      await sharp(srcAbs, { animated: false }).png().toFile(tempPath);
+      inputPath = tempPath;
+    }
+
+    const raw = await tesseract.recognize(inputPath, OCR_CONFIG);
+
+    // Clean up: trim each line, drop lines shorter than 2 chars (noise),
+    // then join everything into one line with single spaces.
+    return raw
+      .split('\n')
+      .map((line) => line.trim())
+      .filter((line) => line.length >= 2)
+      .join(' ')
+      .replace(/\s{2,}/g, ' ')
+      .trim();
+  } catch (err: unknown) {
+    // OCR failure is non-fatal — image still gets saved, just won't be text-searchable
+    const reason = err instanceof Error ? err.message : String(err);
+    console.warn(`OCR failed for ${relPath}:`, reason);
+    return '';
+  } finally {
+    if (tempPath) {
+      // force: true ignores a missing file. The catch keeps a cleanup error from
+      // replacing the OCR result with an exception.
+      await fs.promises.rm(tempPath, { force: true }).catch((cleanupErr: unknown) => {
+        const reason = cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr);
+        console.warn(`Could not remove OCR temp file ${tempPath}:`, reason);
+      });
+    }
+  }
+}
@@ -10,6 +10,7 @@ export interface Meme {
  height: number;
  parent_id: string | null;
  collection_id: number | null;
+  ocr_text: string | null;
  created_at: string;
  tags: string[];
 }