feat(job_crawler): implement reverse-order incremental crawling with real-time Kafka publishing
- Add sequence diagrams documenting container startup, task initialization, and the incremental crawling flow
- Implement reverse-order crawling (from latest to oldest) so new data is processed first
- Publish Kafka messages in real time after each batch is filtered, instead of waiting for task completion
- Update progress tracking to store last_start_offset so incremental crawls resume accurately across sessions
- Enhance the crawler service with improved offset calculation and batch-processing logic
- Update configuration files to support the new crawling parameters and Kafka integration
- Extend the progress model to track crawling state and handle edge cases
- Improve main application initialization to properly handle lifespan events and task auto-start

This change enables efficient incremental collection: new data is prioritized and published immediately, reducing latency and improving system responsiveness.
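For context, the diff below only touches ProgressStore; the crawl loop itself is not part of this hunk. A minimal sketch of the reverse-order flow the message describes, assuming hypothetical fetch_total/fetch_page/keep/produce callables and a get_progress accessor returning the stored CrawlProgress (none of these appear in the diff):

    # Illustrative sketch only; names other than save_progress/last_start_offset
    # are stand-ins for the real crawler and Kafka producer code.
    def crawl_incremental(task_id, store, fetch_total, fetch_page, keep, produce,
                          page_size=50):
        total = fetch_total()                            # item count at the source now
        prev = store.get_progress(task_id)
        stop_at = prev.last_start_offset if prev else 0  # where the last session started
        offset = total                                   # walk from newest toward oldest
        filtered = produced = 0
        while offset > stop_at:
            start = max(offset - page_size, stop_at)
            batch = fetch_page(start, offset - start)    # (offset, limit) page fetch
            fresh = [item for item in batch if keep(item)]
            for item in fresh:
                produce(item)                            # publish per batch, not at task end
            filtered += len(batch) - len(fresh)
            produced += len(fresh)
            # Simplified: persisting total as last_start_offset each batch assumes the
            # session completes; a real implementation must handle mid-session crashes.
            store.save_progress(task_id, total, total, status="running",
                                filtered_count=filtered, produced_count=produced)
            offset = start
        store.save_progress(task_id, total, total, status="done",
                            filtered_count=filtered, produced_count=produced)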
@@ -25,7 +25,7 @@ class ProgressStore:
             conn.execute("""
                 CREATE TABLE IF NOT EXISTS crawl_progress (
                     task_id TEXT PRIMARY KEY,
-                    current_offset INTEGER DEFAULT 0,
+                    last_start_offset INTEGER,
                     total INTEGER DEFAULT 0,
                     last_update TEXT,
                     status TEXT DEFAULT 'idle',
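One edge case worth noting here: CREATE TABLE IF NOT EXISTS skips tables that already exist, so this schema change alone will not add the new column to a database file created by an earlier build. A guarded ALTER TABLE along these lines would backfill it, assuming conn is the same sqlite3 connection used above (this guard is an illustration, not part of the commit):

    # Illustrative migration guard, not in this diff: add the renamed column
    # to a crawl_progress table created before this change.
    cols = {row[1] for row in conn.execute("PRAGMA table_info(crawl_progress)")}
    if "last_start_offset" not in cols:
        conn.execute("ALTER TABLE crawl_progress ADD COLUMN last_start_offset INTEGER")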
@@ -53,27 +53,27 @@ class ProgressStore:
         if row:
             return CrawlProgress(
                 task_id=row["task_id"],
-                current_offset=row["current_offset"],
+                last_start_offset=row["last_start_offset"],
                 total=row["total"],
                 last_update=row["last_update"] or "",
                 status=row["status"]
             )
         return None

-    def save_progress(self, task_id: str, offset: int, total: int,
+    def save_progress(self, task_id: str, last_start_offset: int, total: int,
                       status: str = "running", filtered_count: int = 0, produced_count: int = 0):
         """Save crawl progress."""
         now = datetime.now().isoformat()
         with self._get_conn() as conn:
             conn.execute("""
                 INSERT INTO crawl_progress
-                (task_id, current_offset, total, last_update, status, filtered_count, produced_count)
+                (task_id, last_start_offset, total, last_update, status, filtered_count, produced_count)
                 VALUES (?, ?, ?, ?, ?, ?, ?)
                 ON CONFLICT(task_id) DO UPDATE SET
-                    current_offset = excluded.current_offset, total = excluded.total,
+                    last_start_offset = excluded.last_start_offset, total = excluded.total,
                     last_update = excluded.last_update, status = excluded.status,
                     filtered_count = excluded.filtered_count, produced_count = excluded.produced_count
-            """, (task_id, offset, total, now, status, filtered_count, produced_count))
+            """, (task_id, last_start_offset, total, now, status, filtered_count, produced_count))
             conn.commit()

     def get_stats(self, task_id: str) -> dict:
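A quick usage sketch of the updated store, assuming the accessor shown returning CrawlProgress is named get_progress and the constructor takes a database path (both are guesses; neither appears in this diff):

    store = ProgressStore("crawl_progress.db")   # hypothetical constructor signature
    store.save_progress("job_crawler", last_start_offset=1200, total=1200, status="done")
    progress = store.get_progress("job_crawler")
    if progress:
        # The next session crawls from the new total down to this offset.
        print(progress.last_start_offset, progress.status)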