build ocr

This commit is contained in:
2026-03-28 01:59:13 -05:00
parent e1145b9448
commit 0e03cec842
14 changed files with 379 additions and 22 deletions

View File

@@ -13,6 +13,7 @@
"@fastify/static": "^7.0.4",
"better-sqlite3": "^9.4.3",
"fastify": "^4.27.0",
"node-tesseract-ocr": "^2.2.1",
"sharp": "^0.33.4",
"uuid": "^9.0.1"
},

View File

@@ -34,6 +34,7 @@ db.exec(`
height INTEGER NOT NULL,
parent_id TEXT REFERENCES memes(id) ON DELETE CASCADE,
collection_id INTEGER REFERENCES collections(id) ON DELETE SET NULL,
ocr_text TEXT,
created_at TEXT NOT NULL DEFAULT (datetime('now'))
);
@@ -54,15 +55,22 @@ db.exec(`
CREATE INDEX IF NOT EXISTS idx_meme_tags_tag_id ON meme_tags(tag_id);
`);
// Migration: add collection_id column if upgrading from earlier schema
// Must run BEFORE creating the index on that column
// Migrations — run after CREATE TABLE IF NOT EXISTS so they only apply to existing DBs
const memesCols = db.prepare('PRAGMA table_info(memes)').all() as { name: string }[];
if (!memesCols.find((c) => c.name === 'collection_id')) {
db.exec('ALTER TABLE memes ADD COLUMN collection_id INTEGER REFERENCES collections(id) ON DELETE SET NULL');
}
// Create index after the column is guaranteed to exist (handles both fresh and migrated DBs)
db.exec('CREATE INDEX IF NOT EXISTS idx_memes_collection_id ON memes(collection_id)');
if (!memesCols.find((c) => c.name === 'ocr_text')) {
db.exec('ALTER TABLE memes ADD COLUMN ocr_text TEXT');
}
// Indexes that depend on migrated columns — created after columns are guaranteed to exist
db.exec(`
CREATE INDEX IF NOT EXISTS idx_memes_collection_id ON memes(collection_id);
CREATE INDEX IF NOT EXISTS idx_memes_ocr ON memes(ocr_text) WHERE ocr_text IS NOT NULL;
`);
// Seed the default UNSORTED collection
const defaultCollection = db

View File

@@ -8,6 +8,7 @@ import { memesRoutes } from './routes/memes.js';
import { tagsRoutes } from './routes/tags.js';
import { authRoutes } from './routes/auth.js';
import { collectionsRoutes } from './routes/collections.js';
import { adminRoutes } from './routes/admin.js';
// Ensure data dirs exist
ensureImagesDir();
@@ -41,6 +42,7 @@ await app.register(authRoutes);
await app.register(collectionsRoutes);
await app.register(memesRoutes);
await app.register(tagsRoutes);
await app.register(adminRoutes);
// SPA fallback — serve index.html for all non-API, non-image routes
app.setNotFoundHandler(async (req, reply) => {

View File

@@ -0,0 +1,52 @@
import type { FastifyInstance } from 'fastify';
import db from '../db.js';
import { requireAuth } from '../auth.js';
import { extractText } from '../services/ocr.js';
import type { Meme } from '../types.js';
/**
 * Registers the admin endpoints used to (re)build the OCR text index.
 *
 * Both routes run `requireAuth` as a preHandler.
 *
 * @param app Fastify instance the routes are attached to.
 */
export async function adminRoutes(app: FastifyInstance) {
  /**
   * POST /api/admin/reindex
   * Runs OCR on every meme whose ocr_text is still NULL.
   * Memes are processed one at a time to avoid hammering the CPU.
   * Responds with counts: total rows visited, rows that produced text,
   * and rows for which no text was obtained.
   */
  app.post('/api/admin/reindex', { preHandler: requireAuth }, async (_req, reply) => {
    const pending = db
      .prepare('SELECT id, file_path, mime_type FROM memes WHERE ocr_text IS NULL')
      .all() as Pick<Meme, 'id' | 'file_path' | 'mime_type'>[];

    reply.raw.setHeader('Content-Type', 'application/json');

    // Prepare the write statements once per request instead of once per meme.
    const storeText = db.prepare('UPDATE memes SET ocr_text = ? WHERE id = ?');
    const markEmpty = db.prepare("UPDATE memes SET ocr_text = '' WHERE id = ?");

    let indexed = 0;
    let noTextFound = 0;

    for (const meme of pending) {
      const text = await extractText(meme.file_path, meme.mime_type);
      if (text) {
        storeText.run(text, meme.id);
        indexed++;
      } else {
        // Store empty string so it won't be retried on subsequent runs.
        // NOTE(review): extractText also returns '' when OCR itself fails, so a
        // failed run is marked the same way as "image contains no text".
        markEmpty.run(meme.id);
        noTextFound++;
      }
    }

    return { total: pending.length, indexed, no_text_found: noTextFound };
  });

  /**
   * GET /api/admin/reindex/status
   * Reports how many memes still have NULL ocr_text (`pending`) and how many
   * have non-empty ocr_text (`indexed`). Rows stored as '' count toward neither.
   */
  app.get('/api/admin/reindex/status', { preHandler: requireAuth }, async () => {
    const { pending } = db
      .prepare('SELECT COUNT(*) as pending FROM memes WHERE ocr_text IS NULL')
      .get() as { pending: number };
    const { indexed } = db
      .prepare("SELECT COUNT(*) as indexed FROM memes WHERE ocr_text IS NOT NULL AND ocr_text != ''")
      .get() as { indexed: number };

    return { pending, indexed };
  });
}

View File

@@ -4,6 +4,7 @@ import { v4 as uuidv4 } from 'uuid';
import db, { UNSORTED_ID } from '../db.js';
import { buildFilePath, deleteFile, getExtension } from '../services/storage.js';
import { extractMeta, resizeImage, saveBuffer } from '../services/image.js';
import { extractText } from '../services/ocr.js';
import { requireAuth } from '../auth.js';
import type { ListQuery, UpdateBody, RescaleBody, MoveBody, Meme } from '../types.js';
@@ -72,8 +73,8 @@ export async function memesRoutes(app: FastifyInstance) {
}
if (q) {
conditions.push('(m.title LIKE ? OR m.description LIKE ?)');
params.push(`%${q}%`, `%${q}%`);
conditions.push('(m.title LIKE ? OR m.description LIKE ? OR m.ocr_text LIKE ?)');
params.push(`%${q}%`, `%${q}%`, `%${q}%`);
}
if (conditions.length) {
@@ -98,7 +99,7 @@ export async function memesRoutes(app: FastifyInstance) {
countParams.push(tag.toLowerCase());
}
if (collection_id !== undefined) countParams.push(Number(collection_id));
if (q) countParams.push(`%${q}%`, `%${q}%`);
if (q) countParams.push(`%${q}%`, `%${q}%`, `%${q}%`);
if (countConditions.length) countSql += ' WHERE ' + countConditions.join(' AND ');
@@ -159,6 +160,11 @@ export async function memesRoutes(app: FastifyInstance) {
if (tagsRaw) setMemeTags(id, tagsRaw.split(','));
// Fire OCR in the background — doesn't block the upload response
extractText(filePath, mimeType).then((text) => {
if (text) db.prepare('UPDATE memes SET ocr_text = ? WHERE id = ?').run(text, id);
});
return reply.status(201).send(getMemeById(id));
});

View File

@@ -0,0 +1,47 @@
import tesseract from 'node-tesseract-ocr';
import sharp from 'sharp';
import fs from 'fs';
import path from 'path';
import { absolutePath } from './storage.js';
/**
 * Options handed to `tesseract.recognize` for every OCR run in this module.
 */
const OCR_CONFIG = {
lang: 'eng', // Tesseract language code (English)
oem: 1, // LSTM neural net mode — best accuracy
psm: 3, // Fully automatic page segmentation (good for varied meme layouts)
};
/**
 * Runs Tesseract OCR on a stored image and returns the recognised text as a
 * single whitespace-normalised line.
 *
 * GIFs are first converted to a temporary PNG (first frame only) next to the
 * source file; the temporary file is removed before the function settles.
 *
 * @param relPath  Storage-relative path of the image; resolved via `absolutePath`.
 * @param mimeType MIME type of the image; only `'image/gif'` triggers conversion.
 * @returns The cleaned text, or `''` when nothing usable was recognised or when
 *          OCR/conversion failed. This function never rejects on OCR failure.
 */
export async function extractText(relPath: string, mimeType: string): Promise<string> {
  const srcAbs = absolutePath(relPath);
  let inputPath = srcAbs;
  let tempPath: string | null = null;

  try {
    // Animated GIFs: extract first frame as PNG for Tesseract (it can't read GIF directly)
    if (mimeType === 'image/gif') {
      // Unique suffix so two concurrent OCR runs on the same GIF (e.g. the
      // upload-time background run and an admin reindex) don't share a temp file.
      const unique = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`;
      tempPath = `${srcAbs}.${unique}.ocr_tmp.png`;
      await sharp(srcAbs, { animated: false }).png().toFile(tempPath);
      inputPath = tempPath;
    }

    const raw = await tesseract.recognize(inputPath, OCR_CONFIG);

    // Clean up: collapse whitespace, strip lines that are pure noise (< 2 chars)
    return raw
      .split('\n')
      .map((line) => line.trim())
      .filter((line) => line.length >= 2)
      .join(' ')
      .replace(/\s{2,}/g, ' ')
      .trim();
  } catch (err: unknown) {
    // OCR failure is non-fatal — image still gets saved, just won't be text-searchable
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`OCR failed for ${relPath}:`, reason);
    return '';
  } finally {
    if (tempPath) {
      // `force: true` ignores a missing file. Any other cleanup error is logged
      // rather than thrown, so it cannot replace the value returned above.
      await fs.promises.rm(tempPath, { force: true }).catch((cleanupErr: unknown) => {
        const reason = cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr);
        console.warn(`Could not remove OCR temp file ${tempPath}:`, reason);
      });
    }
  }
}

View File

@@ -10,6 +10,7 @@ export interface Meme {
height: number;
parent_id: string | null;
collection_id: number | null;
ocr_text: string | null;
created_at: string;
tags: string[];
}