feat(job_crawler): initialize job crawler service with kafka integration
- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from a third-party API
- Add Kafka service integration with Docker Compose configuration for the message queue
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add progress persistence layer using SQLite for tracking collection offsets
- Implement date filtering logic to extract data published within the last 7 days (see the sketch after this list)
- Create API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
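A minimal sketch of how the 7-day date filter named above might look. This is not code from this diff: the `publish_date` field name, the ISO-8601 date format, and both function names are assumptions for illustration.

from datetime import datetime, timedelta

def is_within_window(job: dict, days: int = 7) -> bool:
    """Return True if the job's publish date falls within the last `days` days.

    Hypothetical: assumes each job dict carries an ISO-8601 `publish_date`.
    """
    raw = job.get("publish_date")
    if not raw:
        return False
    try:
        published = datetime.fromisoformat(raw)
    except ValueError:
        # Unparseable dates are treated as out of window rather than crashing the crawl.
        return False
    return datetime.now() - published <= timedelta(days=days)

def filter_recent(jobs: list[dict], days: int = 7) -> list[dict]:
    """Keep only jobs published within the window."""
    return [job for job in jobs if is_within_window(job, days)]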
job_crawler/app/services/progress_store.py (new file, 95 lines)
@@ -0,0 +1,95 @@
"""Crawl progress store."""
import sqlite3
import os
import logging
from datetime import datetime
from typing import Optional
from contextlib import contextmanager

from app.models import CrawlProgress
from app.core.config import settings

logger = logging.getLogger(__name__)


class ProgressStore:
    """Crawl progress store backed by SQLite."""

    def __init__(self, db_path: Optional[str] = None):
        self.db_path = db_path or settings.database.path
        # Ensure the parent directory exists before SQLite opens the file.
        os.makedirs(os.path.dirname(self.db_path) or ".", exist_ok=True)
        self._init_db()

    def _init_db(self):
        """Create the progress table if it does not exist."""
        with self._get_conn() as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS crawl_progress (
                    task_id TEXT PRIMARY KEY,
                    current_offset INTEGER DEFAULT 0,
                    total INTEGER DEFAULT 0,
                    last_update TEXT,
                    status TEXT DEFAULT 'idle',
                    filtered_count INTEGER DEFAULT 0,
                    produced_count INTEGER DEFAULT 0
                )
            """)
            conn.commit()

    @contextmanager
    def _get_conn(self):
        """Yield a SQLite connection and always close it afterwards."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        try:
            yield conn
        finally:
            conn.close()

    def get_progress(self, task_id: str) -> Optional[CrawlProgress]:
        """Fetch the persisted progress for a task, or None if absent."""
        with self._get_conn() as conn:
            cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
            row = cursor.fetchone()
            if row:
                return CrawlProgress(
                    task_id=row["task_id"],
                    current_offset=row["current_offset"],
                    total=row["total"],
                    last_update=row["last_update"] or "",
                    status=row["status"]
                )
            return None

    def save_progress(self, task_id: str, offset: int, total: int,
                      status: str = "running", filtered_count: int = 0, produced_count: int = 0):
        """Save crawl progress, upserting the row keyed by task_id."""
        now = datetime.now().isoformat()
        with self._get_conn() as conn:
            conn.execute("""
                INSERT INTO crawl_progress
                    (task_id, current_offset, total, last_update, status, filtered_count, produced_count)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(task_id) DO UPDATE SET
                    current_offset = excluded.current_offset, total = excluded.total,
                    last_update = excluded.last_update, status = excluded.status,
                    filtered_count = excluded.filtered_count, produced_count = excluded.produced_count
            """, (task_id, offset, total, now, status, filtered_count, produced_count))
            conn.commit()

    def get_stats(self, task_id: str) -> dict:
        """Return the raw progress row as a dict, or {} if absent."""
        with self._get_conn() as conn:
            cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
            row = cursor.fetchone()
            if row:
                return dict(row)
            return {}

    def reset_progress(self, task_id: str):
        """Reset crawl progress by deleting the task's row."""
        with self._get_conn() as conn:
            conn.execute("DELETE FROM crawl_progress WHERE task_id = ?", (task_id,))
            conn.commit()


progress_store = ProgressStore()
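A minimal usage sketch of the store defined above, resuming an interrupted crawl from the persisted offset. The task id, database path, total, and page size are illustrative assumptions, not values from this diff.

store = ProgressStore(db_path="/tmp/progress.db")  # hypothetical path

progress = store.get_progress("jobs_daily")  # "jobs_daily" is a made-up task id
offset = progress.current_offset if progress else 0

while offset < 1000:  # hypothetical total reported by the upstream API
    # ... fetch one page from the third-party API at `offset` ...
    offset += 50  # hypothetical page size
    store.save_progress("jobs_daily", offset=offset, total=1000, status="running")

store.save_progress("jobs_daily", offset=offset, total=1000, status="completed")

Because save_progress is an upsert, the crawler can call it after every page without caring whether the row already exists; a crash simply leaves the last committed offset behind for the next run to pick up.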