- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from a third-party API
- Add Kafka service integration with Docker Compose configuration for the message queue
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add progress persistence layer using SQLite for tracking collection offsets
- Implement date filtering logic to extract data published within the last 7 days (see the sketch after this list)
- Create API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
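The 7-day date filter mentioned above lives outside this file; a minimal sketch of how such a predicate might look, assuming an ISO-8601 publish date string (the parse_publish_date helper and the field format are assumptions, not the project's actual utilities):

from datetime import datetime, timedelta
from typing import Optional

def parse_publish_date(raw: str) -> Optional[datetime]:
    """Parse an ISO-8601 publish date; return None when unparseable."""
    try:
        return datetime.fromisoformat(raw)
    except (TypeError, ValueError):
        return None

def is_within_days(raw_date: str, days: int = 7) -> bool:
    """Keep only records published within the last `days` days."""
    published = parse_publish_date(raw_date)
    if published is None:
        return False
    return datetime.now() - published <= timedelta(days=days)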
"""采集进度存储"""
|
||
import sqlite3
|
||
import os
|
||
import logging
|
||
from datetime import datetime
|
||
from typing import Optional
|
||
from contextlib import contextmanager
|
||
from app.models import CrawlProgress
|
||
from app.core.config import settings
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class ProgressStore:
|
||
"""采集进度存储(SQLite)"""
|
||
|
||
def __init__(self, db_path: str = None):
|
||
self.db_path = db_path or settings.database.path
|
||
os.makedirs(os.path.dirname(self.db_path) or ".", exist_ok=True)
|
||
self._init_db()
|
||
|
||
def _init_db(self):
|
||
"""初始化数据库"""
|
||
with self._get_conn() as conn:
|
||
conn.execute("""
|
||
CREATE TABLE IF NOT EXISTS crawl_progress (
|
||
task_id TEXT PRIMARY KEY,
|
||
current_offset INTEGER DEFAULT 0,
|
||
total INTEGER DEFAULT 0,
|
||
last_update TEXT,
|
||
status TEXT DEFAULT 'idle',
|
||
filtered_count INTEGER DEFAULT 0,
|
||
produced_count INTEGER DEFAULT 0
|
||
)
|
||
""")
|
||
conn.commit()
|
||
|
||
@contextmanager
|
||
def _get_conn(self):
|
||
"""获取数据库连接"""
|
||
conn = sqlite3.connect(self.db_path)
|
||
conn.row_factory = sqlite3.Row
|
||
try:
|
||
yield conn
|
||
finally:
|
||
conn.close()
|
||
|
||
def get_progress(self, task_id: str) -> Optional[CrawlProgress]:
|
||
"""获取采集进度"""
|
||
with self._get_conn() as conn:
|
||
cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
|
||
row = cursor.fetchone()
|
||
if row:
|
||
return CrawlProgress(
|
||
task_id=row["task_id"],
|
||
current_offset=row["current_offset"],
|
||
total=row["total"],
|
||
last_update=row["last_update"] or "",
|
||
status=row["status"]
|
||
)
|
||
return None
|
||
|
||
def save_progress(self, task_id: str, offset: int, total: int,
|
||
status: str = "running", filtered_count: int = 0, produced_count: int = 0):
|
||
"""保存采集进度"""
|
||
now = datetime.now().isoformat()
|
||
with self._get_conn() as conn:
|
||
conn.execute("""
|
||
INSERT INTO crawl_progress
|
||
(task_id, current_offset, total, last_update, status, filtered_count, produced_count)
|
||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||
ON CONFLICT(task_id) DO UPDATE SET
|
||
current_offset = excluded.current_offset, total = excluded.total,
|
||
last_update = excluded.last_update, status = excluded.status,
|
||
filtered_count = excluded.filtered_count, produced_count = excluded.produced_count
|
||
""", (task_id, offset, total, now, status, filtered_count, produced_count))
|
||
conn.commit()
|
||
|
||
def get_stats(self, task_id: str) -> dict:
|
||
"""获取统计信息"""
|
||
with self._get_conn() as conn:
|
||
cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
|
||
row = cursor.fetchone()
|
||
if row:
|
||
return dict(row)
|
||
return {}
|
||
|
||
def reset_progress(self, task_id: str):
|
||
"""重置采集进度"""
|
||
with self._get_conn() as conn:
|
||
conn.execute("DELETE FROM crawl_progress WHERE task_id = ?", (task_id,))
|
||
conn.commit()
|
||
|
||
|
||
progress_store = ProgressStore()
|
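A minimal usage sketch of the module-level singleton, as a crawler loop might call it; the task id "job_crawl" and the counter values are illustrative, not taken from the project:

# Resume from the last persisted offset, defaulting to 0 on first run.
progress = progress_store.get_progress("job_crawl")
start = progress.current_offset if progress else 0

# After processing a page, persist the new offset and counters (UPSERT).
progress_store.save_progress("job_crawl", offset=start + 100, total=5000,
                             status="running", filtered_count=80, produced_count=80)

print(progress_store.get_stats("job_crawl"))   # full row as a dict
progress_store.reset_progress("job_crawl")     # start over from offset 0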