feat(job_crawler): initialize job crawler service with kafka integration

- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from third-party API
- Add Kafka service integration with Docker Compose configuration for message queue
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add progress persistence layer using SQLite for tracking collection offsets
- Implement date filtering logic to extract data published within 7 days
- Create API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
2026-01-15 17:09:43 +08:00
commit ae681575b9
26 changed files with 1898 additions and 0 deletions
--- a/job_crawler/app/services/crawler.py
+++ b/job_crawler/app/services/crawler.py
@@ -0,0 +1,209 @@
+"""多任务增量采集核心逻辑"""
+import asyncio
+import logging
+from typing import Dict, Optional
+from concurrent.futures import ThreadPoolExecutor
+from app.services.api_client import api_client
+from app.services.kafka_service import kafka_service
+from app.services.progress_store import progress_store
+from app.utils import is_within_days
+from app.models import JobData
+from app.core.config import settings, TaskConfig
+
+logger = logging.getLogger(__name__)  # module-level logger, named after this module
+
+
+class TaskCrawler:
+    """Incremental crawler for a single configured task.
+
+    Fetches records from the third-party API in batches of ``batch_size``,
+    keeps those accepted by ``is_within_days`` for ``filter_days``, sends them
+    to Kafka and persists the offset after every batch so that a later run can
+    resume where this one stopped.
+    """
+
+    def __init__(self, task_config: TaskConfig):
+        self.task_id = task_config.id
+        # Fall back to the id when the task has no display name.
+        self.task_name = task_config.name or task_config.id
+        self.batch_size = settings.api.batch_size
+        self.filter_days = settings.crawler.filter_days
+        self._running = False
+        # Per-run counters; start() resets both to zero.
+        self._total_filtered = 0
+        self._total_produced = 0
+
+    @property
+    def is_running(self) -> bool:
+        """Whether a crawl loop is currently active for this task."""
+        return self._running
+
+    async def start(self, reset: bool = False) -> None:
+        """Run the crawl loop until all data is read or stop() is called.
+
+        Args:
+            reset: Discard the stored progress and start again from offset 0.
+
+        A failing batch is logged and tried again after 5 seconds. Any other
+        error is logged with its traceback and persisted as status "error"
+        together with the last known offset, so stored progress is not lost.
+        """
+        if self._running:
+            logger.warning(f"[{self.task_name}] 任务已在运行中")
+            return
+
+        self._running = True
+        self._total_filtered = self._total_produced = 0
+        logger.info(f"[{self.task_name}] 开始采集任务")
+
+        # Bound before the try block so the error handler can persist the last
+        # known position instead of overwriting stored progress with zeros.
+        current_offset = total = 0
+        try:
+            if reset:
+                progress_store.reset_progress(self.task_id)
+            else:
+                progress = progress_store.get_progress(self.task_id)
+                current_offset = progress.current_offset if progress else 0
+
+            total = await api_client.get_total_count(self.task_id)
+            logger.info(f"[{self.task_name}] 数据总数: {total}, 当前偏移: {current_offset}")
+
+            if current_offset >= total:
+                logger.info(f"[{self.task_name}] 数据已全部采集完成")
+                self._save_progress(current_offset, total, "completed")
+                return  # the finally clause clears the running flag
+
+            while current_offset < total and self._running:
+                try:
+                    await self._crawl_batch(current_offset)
+                    # Clamp to total: an offset rounded up past total would make
+                    # a later run skip records appended in that gap.
+                    current_offset = min(current_offset + self.batch_size, total)
+                    self._save_progress(current_offset, total, "running")
+                    progress_pct = current_offset / total * 100
+                    logger.info(f"[{self.task_name}] 进度: {progress_pct:.2f}% ({current_offset}/{total})")
+                    await asyncio.sleep(0.5)
+                except Exception as e:
+                    # If the fetch itself failed the offset has not moved, so
+                    # the same batch is attempted again after the pause.
+                    logger.error(f"[{self.task_name}] 采集批次失败: {e}")
+                    await asyncio.sleep(5)
+
+            status = "completed" if current_offset >= total else "stopped"
+            self._save_progress(current_offset, total, status)
+            logger.info(f"[{self.task_name}] 采集任务 {status}")
+        except Exception as e:
+            logger.exception(f"[{self.task_name}] 采集任务异常: {e}")
+            # total is still 0 here when the total-count request itself failed.
+            self._save_progress(current_offset, total, "error")
+        finally:
+            self._running = False
+
+    def _save_progress(self, offset: int, total: int, status: str) -> None:
+        """Persist offset, total, status and the run counters for this task."""
+        progress_store.save_progress(self.task_id, offset, total, status,
+                                     self._total_filtered, self._total_produced)
+
+    async def _crawl_batch(self, offset: int) -> None:
+        """Fetch one batch at ``offset`` and send the accepted records to Kafka.
+
+        Updates ``_total_filtered`` and ``_total_produced``. Returns without
+        doing anything when the response carries no records.
+        """
+        result = await api_client.fetch_data(self.task_id, offset, self.batch_size)
+        data_list = result.get("data", {}).get("data", [])
+        if not data_list:
+            return
+
+        filtered_jobs = []
+        for raw in data_list:
+            # NOTE(review): presumably publish date and collection time of the
+            # record; confirm against is_within_days.
+            aae397 = raw.get("aae397", "")
+            collect_time = raw.get("Collect_time", "")
+            if is_within_days(aae397, collect_time, self.filter_days):
+                job = JobData.from_raw(raw)
+                job.task_id = self.task_id  # tag the record with its source task
+                filtered_jobs.append(job)
+
+        self._total_filtered += len(filtered_jobs)
+        if filtered_jobs:
+            self._total_produced += kafka_service.produce_batch(filtered_jobs)
+
+    def stop(self) -> None:
+        """Request the crawl loop to exit; it checks the flag before each batch."""
+        logger.info(f"[{self.task_name}] 正在停止采集任务...")
+        self._running = False
+
+    def get_status(self) -> dict:
+        """Return the persisted progress of this task as a plain dict.
+
+        When nothing is stored yet, every field takes its idle default
+        (zero counts, "0%" progress, status "idle", empty last_update).
+        """
+        stats = progress_store.get_stats(self.task_id) or {}
+        total = stats.get("total", 0)
+        current = stats.get("current_offset", 0)
+        progress = f"{min(100, current / total * 100):.2f}%" if total > 0 else "0%"
+        return {
+            "task_id": self.task_id, "task_name": self.task_name,
+            "total": total, "current_offset": current, "progress": progress,
+            "status": stats.get("status", "idle"), "last_update": stats.get("last_update", ""),
+            "filtered_count": stats.get("filtered_count", 0),
+            "produced_count": stats.get("produced_count", 0)
+        }
+
+
+class CrawlerManager:
+    """Registry and lifecycle controller for all enabled task crawlers."""
+
+    def __init__(self):
+        self._crawlers: Dict[str, TaskCrawler] = {}
+        # Not used by any method of this class.
+        self._executor = ThreadPoolExecutor(max_workers=settings.crawler.max_workers)
+        # Strong references to the fire-and-forget tasks made by start_task();
+        # the event loop only keeps weak references, so an unreferenced task
+        # could be garbage-collected before it finishes.
+        self._background_tasks: set = set()
+        self._init_crawlers()
+
+    def _init_crawlers(self):
+        """Create one TaskCrawler per enabled task from the settings."""
+        for task in settings.get_enabled_tasks():
+            self._crawlers[task.id] = TaskCrawler(task)
+            logger.info(f"初始化任务采集器: {task.name} ({task.id})")
+
+    def get_crawler(self, task_id: str) -> Optional[TaskCrawler]:
+        """Return the crawler registered for ``task_id``, or None."""
+        return self._crawlers.get(task_id)
+
+    def get_all_crawlers(self) -> Dict[str, TaskCrawler]:
+        """Return the internal task-id -> crawler mapping (not a copy)."""
+        return self._crawlers
+
+    async def start_task(self, task_id: str, reset: bool = False) -> bool:
+        """Schedule one task in the background without waiting for it.
+
+        Args:
+            task_id: Id of the task to start.
+            reset: Passed through to TaskCrawler.start().
+
+        Returns:
+            True when the crawl was scheduled, False when the task is unknown
+            or already running.
+        """
+        crawler = self._crawlers.get(task_id)
+        if not crawler:
+            logger.error(f"任务不存在: {task_id}")
+            return False
+        if crawler.is_running:
+            logger.warning(f"任务已在运行: {task_id}")
+            return False
+        task = asyncio.create_task(crawler.start(reset))
+        # Hold the task until it completes, then drop the reference.
+        self._background_tasks.add(task)
+        task.add_done_callback(self._background_tasks.discard)
+        return True
+
+    async def start_all(self, reset: bool = False):
+        """Run every crawler that is not already running and wait for all of them."""
+        tasks = [
+            crawler.start(reset)
+            for crawler in self._crawlers.values()
+            if not crawler.is_running
+        ]
+        if tasks:
+            await asyncio.gather(*tasks)
+
+    def stop_task(self, task_id: str) -> bool:
+        """Ask one task to stop; return False when the task id is unknown."""
+        crawler = self._crawlers.get(task_id)
+        if not crawler:
+            return False
+        crawler.stop()
+        return True
+
+    def stop_all(self):
+        """Ask every registered crawler to stop."""
+        for crawler in self._crawlers.values():
+            crawler.stop()
+
+    def get_status(self, task_id: Optional[str] = None) -> dict:
+        """Return the status of one task, or an overview of all tasks.
+
+        With ``task_id`` the result is that crawler's status dict, or an empty
+        dict when the id is unknown. Without it the result holds the keys
+        "tasks", "kafka_lag" and "running_count".
+        """
+        if task_id:
+            crawler = self._crawlers.get(task_id)
+            return crawler.get_status() if crawler else {}
+
+        # Overview across all registered tasks.
+        return {
+            "tasks": [c.get_status() for c in self._crawlers.values()],
+            "kafka_lag": kafka_service.get_lag(),
+            "running_count": sum(1 for c in self._crawlers.values() if c.is_running)
+        }
+
+    @property
+    def is_any_running(self) -> bool:
+        """Whether at least one crawler is currently running."""
+        return any(c.is_running for c in self._crawlers.values())
+
+
+# Global manager instance, created when this module is imported
+crawler_manager = CrawlerManager()