feat(job_crawler): initialize job crawler service with kafka integration

- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from third-party API
- Add Kafka service integration with Docker Compose configuration for message queue (see the producer sketch after this list)
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add progress persistence layer using SQLite for tracking collection offsets
- Implement date filtering logic to keep only data published within the last 7 days (see the filtering sketch after this list)
- Create API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
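The Kafka producer code is not part of the file shown in this view; purely as an illustration, the sketch below shows how filtered job records could be published to the queue. The kafka-python client, the localhost:9092 broker address, and the job_listings topic name are assumptions made for the example, not details confirmed by this commit.

# Illustrative producer sketch; client library, broker address, and topic name are assumptions.
import json

from kafka import KafkaProducer  # assumption: kafka-python is the client listed in requirements.txt

producer = KafkaProducer(
    bootstrap_servers="localhost:9092",  # assumption: broker from the local Docker Compose stack
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode("utf-8"),
)

def produce_job(job: dict, topic: str = "job_listings") -> None:
    """Publish one job record to the message queue."""
    producer.send(topic, value=job)

# Call producer.flush() before shutdown so buffered messages are delivered.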
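Likewise, the 7-day filter lives elsewhere in this commit; below is a minimal sketch of the idea, assuming a published_at field holding a naive ISO-8601 timestamp (both the field name and the format are assumptions for illustration).

# Minimal sketch of the 7-day window check; field name and timestamp format are assumptions.
from datetime import datetime, timedelta

def is_recent(job: dict, days: int = 7) -> bool:
    """Return True if the job was published within the last `days` days."""
    raw = job.get("published_at")
    if not raw:
        return False
    try:
        published = datetime.fromisoformat(raw)
    except ValueError:
        return False
    return datetime.now() - published <= timedelta(days=days)

# Usage inside the crawl loop (illustrative):
# recent_jobs = [job for job in page_items if is_recent(job)]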
2026-01-15 17:09:43 +08:00
commit ae681575b9
26 changed files with 1898 additions and 0 deletions


@@ -0,0 +1,95 @@
"""采集进度存储"""
import sqlite3
import os
import logging
from datetime import datetime
from typing import Optional
from contextlib import contextmanager
from app.models import CrawlProgress
from app.core.config import settings
logger = logging.getLogger(__name__)
class ProgressStore:
"""采集进度存储SQLite"""
def __init__(self, db_path: str = None):
self.db_path = db_path or settings.database.path
os.makedirs(os.path.dirname(self.db_path) or ".", exist_ok=True)
self._init_db()
    def _init_db(self):
        """Create the progress table on first use."""
        with self._get_conn() as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS crawl_progress (
                    task_id TEXT PRIMARY KEY,
                    current_offset INTEGER DEFAULT 0,
                    total INTEGER DEFAULT 0,
                    last_update TEXT,
                    status TEXT DEFAULT 'idle',
                    filtered_count INTEGER DEFAULT 0,
                    produced_count INTEGER DEFAULT 0
                )
            """)
            conn.commit()

    @contextmanager
    def _get_conn(self):
        """Yield a SQLite connection that is always closed afterwards."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        try:
            yield conn
        finally:
            conn.close()
    def get_progress(self, task_id: str) -> Optional[CrawlProgress]:
        """Return the stored progress for a task, or None if it has never run."""
        with self._get_conn() as conn:
            cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
            row = cursor.fetchone()
            if row:
                return CrawlProgress(
                    task_id=row["task_id"],
                    current_offset=row["current_offset"],
                    total=row["total"],
                    last_update=row["last_update"] or "",
                    status=row["status"],
                )
            return None
    def save_progress(self, task_id: str, offset: int, total: int,
                      status: str = "running", filtered_count: int = 0, produced_count: int = 0):
        """Upsert the progress row for a task."""
        now = datetime.now().isoformat()
        with self._get_conn() as conn:
            # Insert on the first save, otherwise overwrite the existing row (SQLite upsert).
            conn.execute("""
                INSERT INTO crawl_progress
                (task_id, current_offset, total, last_update, status, filtered_count, produced_count)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(task_id) DO UPDATE SET
                    current_offset = excluded.current_offset, total = excluded.total,
                    last_update = excluded.last_update, status = excluded.status,
                    filtered_count = excluded.filtered_count, produced_count = excluded.produced_count
            """, (task_id, offset, total, now, status, filtered_count, produced_count))
            conn.commit()
    def get_stats(self, task_id: str) -> dict:
        """Return the raw progress row as a dict, or an empty dict for an unknown task."""
        with self._get_conn() as conn:
            cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
            row = cursor.fetchone()
            if row:
                return dict(row)
            return {}

    def reset_progress(self, task_id: str):
        """Delete the progress row so the task restarts from offset 0 on its next run."""
        with self._get_conn() as conn:
            conn.execute("DELETE FROM crawl_progress WHERE task_id = ?", (task_id,))
            conn.commit()


# Module-level singleton shared by the crawler and API layers.
progress_store = ProgressStore()
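
For reference, the module-level progress_store singleton above is what the REST layer would read. The sketch below shows one plausible shape of the /status endpoint built on it; the import path, the task_id query parameter, and the idle fallback payload are assumptions for illustration, and the actual router is defined in another file of this commit.

# Sketch only: a /status route reading from progress_store.
from fastapi import FastAPI

from app.services.progress_store import progress_store  # hypothetical import path

app = FastAPI()

@app.get("/status")
def get_status(task_id: str = "default") -> dict:
    """Return persisted crawl progress for a task, or an idle placeholder."""
    stats = progress_store.get_stats(task_id)
    return stats or {"task_id": task_id, "status": "idle", "current_offset": 0}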