feat(job_crawler): implement reverse-order incremental crawling with real-time Kafka publishing

- Add comprehensive sequence diagrams documenting container startup, task initialization, and incremental crawling flow
- Implement reverse-order crawling logic (from latest to oldest) to optimize performance by processing new data first
- Add real-time Kafka message publishing after each batch filtering instead of waiting for task completion
- Update progress tracking to store last_start_offset for accurate incremental crawling across sessions
- Enhance crawler service with improved offset calculation and batch processing logic
- Update configuration files to support new crawling parameters and Kafka integration
- Add progress model enhancements to track crawling state and handle edge cases
- Improve main application initialization to properly handle lifespan events and task auto-start

This change enables efficient incremental data collection: new data is prioritized and published immediately, reducing end-to-end latency and improving system responsiveness.
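
For context, the loop below is a minimal sketch (not the actual crawler code) of how these pieces can fit together. fetch_page, filter_batch, the jobs.filtered topic, the broker address, and the status values are illustrative assumptions; only the get_progress/save_progress calls mirror the ProgressStore change in this commit. It also assumes offsets count up from the oldest record, so new data sits at the highest offsets.

    import json

    from kafka import KafkaProducer  # kafka-python


    def fetch_page(start: int, limit: int) -> list[dict]:
        """Hypothetical upstream fetch; stand-in for the real crawler call."""
        return [{"offset": start + i, "title": f"job {start + i}"} for i in range(limit)]


    def filter_batch(batch: list[dict]) -> list[dict]:
        """Hypothetical filter; stand-in for the real filtering rules."""
        return [record for record in batch if record.get("title")]


    def crawl_incremental(store, task_id: str, total: int, batch_size: int = 50):
        producer = KafkaProducer(
            bootstrap_servers="localhost:9092",  # assumed broker address
            value_serializer=lambda v: json.dumps(v).encode("utf-8"),
        )
        # Stop once we reach the offset where the previous session started,
        # so only records added since then are processed.
        prev = store.get_progress(task_id)
        stop_at = prev.last_start_offset if prev and prev.last_start_offset is not None else 0

        filtered_count = produced_count = 0
        offset = total  # crawl in reverse: newest records first
        while offset > stop_at:
            start = max(offset - batch_size, stop_at)
            kept = filter_batch(fetch_page(start, offset - start))
            # Publish right after filtering each batch instead of buffering
            # everything until the task completes.
            for record in kept:
                producer.send("jobs.filtered", value=record)
            producer.flush()
            produced_count += len(kept)
            filtered_count += (offset - start) - len(kept)
            offset = start
            # Persist this session's starting offset plus running counters;
            # the next session will crawl only records above that point.
            # (A production version would also handle a run that is
            # interrupted partway down.)
            store.save_progress(task_id, last_start_offset=total, total=total,
                                status="running",
                                filtered_count=filtered_count,
                                produced_count=produced_count)
        store.save_progress(task_id, last_start_offset=total, total=total,
                            status="idle",
                            filtered_count=filtered_count,
                            produced_count=produced_count)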
2026-01-15 17:46:55 +08:00
parent 63cd432a0c
commit 3acc0a9221
8 changed files with 402 additions and 60 deletions

@@ -25,7 +25,7 @@ class ProgressStore:
             conn.execute("""
                 CREATE TABLE IF NOT EXISTS crawl_progress (
                     task_id TEXT PRIMARY KEY,
-                    current_offset INTEGER DEFAULT 0,
+                    last_start_offset INTEGER,
                     total INTEGER DEFAULT 0,
                     last_update TEXT,
                     status TEXT DEFAULT 'idle',
@@ -53,27 +53,27 @@ class ProgressStore:
         if row:
             return CrawlProgress(
                 task_id=row["task_id"],
-                current_offset=row["current_offset"],
+                last_start_offset=row["last_start_offset"],
                 total=row["total"],
                 last_update=row["last_update"] or "",
                 status=row["status"]
             )
         return None
 
-    def save_progress(self, task_id: str, offset: int, total: int,
+    def save_progress(self, task_id: str, last_start_offset: int, total: int,
                       status: str = "running", filtered_count: int = 0, produced_count: int = 0):
         """Save crawl progress."""
         now = datetime.now().isoformat()
         with self._get_conn() as conn:
             conn.execute("""
                 INSERT INTO crawl_progress
-                (task_id, current_offset, total, last_update, status, filtered_count, produced_count)
+                (task_id, last_start_offset, total, last_update, status, filtered_count, produced_count)
                 VALUES (?, ?, ?, ?, ?, ?, ?)
                 ON CONFLICT(task_id) DO UPDATE SET
-                    current_offset = excluded.current_offset, total = excluded.total,
+                    last_start_offset = excluded.last_start_offset, total = excluded.total,
                     last_update = excluded.last_update, status = excluded.status,
                     filtered_count = excluded.filtered_count, produced_count = excluded.produced_count
-            """, (task_id, offset, total, now, status, filtered_count, produced_count))
+            """, (task_id, last_start_offset, total, now, status, filtered_count, produced_count))
             conn.commit()
 
     def get_stats(self, task_id: str) -> dict:
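
Two details of this change are worth noting: the column is now nullable (current_offset INTEGER DEFAULT 0 becomes last_start_offset INTEGER), so "never crawled" can be stored as NULL rather than conflated with a real offset of 0; and ON CONFLICT(task_id) DO UPDATE makes save_progress an upsert, so per-batch saves update a single row per task instead of appending history. A small illustrative usage sketch (the constructor call and task id are assumptions, not shown in this diff):

    store = ProgressStore()  # construction/connection details not shown here

    # The first save inserts the row; the second hits ON CONFLICT and updates it.
    store.save_progress("demo-task", last_start_offset=0, total=100)
    store.save_progress("demo-task", last_start_offset=100, total=100,
                        status="idle", filtered_count=12, produced_count=88)

    progress = store.get_progress("demo-task")
    assert progress.last_start_offset == 100  # same row, updated in place
    assert progress.status == "idle"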