feat(job_crawler): initialize job crawler service with kafka integration

- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from third-party API
- Add Kafka service integration with Docker Compose configuration for message queue (see the producer sketch after this list)
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add progress persistence layer using SQLite for tracking collection offsets
- Implement date filtering logic to keep only data published within the last 7 days (see the filtering sketch after this list)
- Create API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
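The Kafka producer code is not part of the file shown in this view; purely as an illustration, the sketch below shows how filtered job records could be published to the queue. The kafka-python client, the localhost:9092 broker address, and the job_listings topic name are assumptions made for the example, not details confirmed by this commit.

# Illustrative producer sketch; client library, broker address, and topic name are assumptions.
import json

from kafka import KafkaProducer  # assumption: kafka-python is the client listed in requirements.txt

producer = KafkaProducer(
    bootstrap_servers="localhost:9092",  # assumption: broker from the local Docker Compose stack
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode("utf-8"),
)

def produce_job(job: dict, topic: str = "job_listings") -> None:
    """Publish one job record to the message queue."""
    producer.send(topic, value=job)

# Call producer.flush() before shutdown so buffered messages are delivered.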
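Likewise, the 7-day filter lives elsewhere in this commit; below is a minimal sketch of the idea, assuming a published_at field holding a naive ISO-8601 timestamp (both the field name and the format are assumptions for illustration).

# Minimal sketch of the 7-day window check; field name and timestamp format are assumptions.
from datetime import datetime, timedelta

def is_recent(job: dict, days: int = 7) -> bool:
    """Return True if the job was published within the last `days` days."""
    raw = job.get("published_at")
    if not raw:
        return False
    try:
        published = datetime.fromisoformat(raw)
    except ValueError:
        return False
    return datetime.now() - published <= timedelta(days=days)

# Usage inside the crawl loop (illustrative):
# recent_jobs = [job for job in page_items if is_recent(job)]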
2026-01-15 17:09:43 +08:00
commit ae681575b9
26 changed files with 1898 additions and 0 deletions


@@ -0,0 +1,95 @@
"""采集进度存储"""
import sqlite3
import os
import logging
from datetime import datetime
from typing import Optional
from contextlib import contextmanager
from app.models import CrawlProgress
from app.core.config import settings
logger = logging.getLogger(__name__)
class ProgressStore:
"""采集进度存储SQLite"""
def __init__(self, db_path: str = None):
self.db_path = db_path or settings.database.path
os.makedirs(os.path.dirname(self.db_path) or ".", exist_ok=True)
self._init_db()
    def _init_db(self):
        """Create the progress table on first use."""
        with self._get_conn() as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS crawl_progress (
                    task_id TEXT PRIMARY KEY,
                    current_offset INTEGER DEFAULT 0,
                    total INTEGER DEFAULT 0,
                    last_update TEXT,
                    status TEXT DEFAULT 'idle',
                    filtered_count INTEGER DEFAULT 0,
                    produced_count INTEGER DEFAULT 0
                )
            """)
            conn.commit()

    @contextmanager
    def _get_conn(self):
        """Yield a SQLite connection that is always closed afterwards."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        try:
            yield conn
        finally:
            conn.close()
    def get_progress(self, task_id: str) -> Optional[CrawlProgress]:
        """Return the stored progress for a task, or None if it has never run."""
        with self._get_conn() as conn:
            cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
            row = cursor.fetchone()
            if row:
                return CrawlProgress(
                    task_id=row["task_id"],
                    current_offset=row["current_offset"],
                    total=row["total"],
                    last_update=row["last_update"] or "",
                    status=row["status"],
                )
            return None
    def save_progress(self, task_id: str, offset: int, total: int,
                      status: str = "running", filtered_count: int = 0, produced_count: int = 0):
        """Upsert the progress row for a task."""
        now = datetime.now().isoformat()
        with self._get_conn() as conn:
            # Insert on the first save, otherwise overwrite the existing row (SQLite upsert).
            conn.execute("""
                INSERT INTO crawl_progress
                (task_id, current_offset, total, last_update, status, filtered_count, produced_count)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(task_id) DO UPDATE SET
                    current_offset = excluded.current_offset, total = excluded.total,
                    last_update = excluded.last_update, status = excluded.status,
                    filtered_count = excluded.filtered_count, produced_count = excluded.produced_count
            """, (task_id, offset, total, now, status, filtered_count, produced_count))
            conn.commit()
    def get_stats(self, task_id: str) -> dict:
        """Return the raw progress row as a dict, or an empty dict for an unknown task."""
        with self._get_conn() as conn:
            cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
            row = cursor.fetchone()
            if row:
                return dict(row)
            return {}

    def reset_progress(self, task_id: str):
        """Delete the progress row so the task restarts from offset 0 on its next run."""
        with self._get_conn() as conn:
            conn.execute("DELETE FROM crawl_progress WHERE task_id = ?", (task_id,))
            conn.commit()


# Module-level singleton shared by the crawler and API layers.
progress_store = ProgressStore()
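
For reference, the module-level progress_store singleton above is what the REST layer would read. The sketch below shows one plausible shape of the /status endpoint built on it; the import path, the task_id query parameter, and the idle fallback payload are assumptions for illustration, and the actual router is defined in another file of this commit.

# Sketch only: a /status route reading from progress_store.
from fastapi import FastAPI

from app.services.progress_store import progress_store  # hypothetical import path

app = FastAPI()

@app.get("/status")
def get_status(task_id: str = "default") -> dict:
    """Return persisted crawl progress for a task, or an idle placeholder."""
    stats = progress_store.get_stats(task_id)
    return stats or {"task_id": task_id, "status": "idle", "current_offset": 0}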