feat(job_crawler): initialize job crawler service with kafka integration
- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from the third-party API
- Add Kafka service integration with a Docker Compose configuration for the message queue
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add a progress persistence layer using SQLite to track collection offsets
- Implement date filtering logic to keep only data published within the last 7 days (a sketch follows the commit message)
- Create an API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with a Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
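As a rough illustration of the 7-day date filter mentioned above: a record is kept only if its publish date falls within the configured window. This is a minimal sketch, not the code from this commit; the field name publish_date and the helper name are hypothetical, and filter_days mirrors the CrawlerConfig default added below.

from datetime import datetime, timedelta

def is_within_window(publish_date: datetime, filter_days: int = 7) -> bool:
    """Keep a record only if it was published within the last `filter_days` days."""
    cutoff = datetime.now() - timedelta(days=filter_days)
    return publish_date >= cutoff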
job_crawler/app/core/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
"""Core module"""
from .config import settings
from .logging import setup_logging

__all__ = ["settings", "setup_logging"]
job_crawler/app/core/config.py (new file, 89 lines)
@@ -0,0 +1,89 @@
"""Configuration management"""
import os
import yaml
from typing import Optional, List
from pydantic import BaseModel
from functools import lru_cache


class AppConfig(BaseModel):
    name: str = "job-crawler"
    version: str = "1.0.0"
    debug: bool = False


class TaskConfig(BaseModel):
    """Configuration for a single task"""
    id: str
    name: str = ""
    enabled: bool = True


class ApiConfig(BaseModel):
    base_url: str = "https://openapi.bazhuayu.com"
    username: str = ""
    password: str = ""
    batch_size: int = 100
    tasks: List[TaskConfig] = []


class KafkaConfig(BaseModel):
    bootstrap_servers: str = "localhost:9092"
    topic: str = "job_data"
    consumer_group: str = "job_consumer_group"


class CrawlerConfig(BaseModel):
    interval: int = 300
    filter_days: int = 7
    max_workers: int = 5


class DatabaseConfig(BaseModel):
    path: str = "data/crawl_progress.db"


class Settings(BaseModel):
    """Application settings"""
    app: AppConfig = AppConfig()
    api: ApiConfig = ApiConfig()
    kafka: KafkaConfig = KafkaConfig()
    crawler: CrawlerConfig = CrawlerConfig()
    database: DatabaseConfig = DatabaseConfig()

    @classmethod
    def from_yaml(cls, config_path: str) -> "Settings":
        """Load settings from a YAML file"""
        if not os.path.exists(config_path):
            return cls()

        with open(config_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f) or {}

        # Parse the task list separately so each entry becomes a TaskConfig
        api_data = data.get('api', {})
        tasks_data = api_data.pop('tasks', [])
        tasks = [TaskConfig(**t) for t in tasks_data]
        api_config = ApiConfig(**api_data, tasks=tasks)

        return cls(
            app=AppConfig(**data.get('app', {})),
            api=api_config,
            kafka=KafkaConfig(**data.get('kafka', {})),
            crawler=CrawlerConfig(**data.get('crawler', {})),
            database=DatabaseConfig(**data.get('database', {}))
        )

    def get_enabled_tasks(self) -> List[TaskConfig]:
        """Return the list of enabled tasks"""
        return [t for t in self.api.tasks if t.enabled]


@lru_cache()
def get_settings() -> Settings:
    """Return the cached application settings"""
    config_path = os.environ.get("CONFIG_PATH", "config/config.yml")
    return Settings.from_yaml(config_path)


settings = get_settings()
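For context, a minimal config/config.yml that Settings.from_yaml above would accept might look like the following; every value is a placeholder chosen from the model defaults, not a file taken from the repository.

app:
  name: job-crawler
  debug: false
api:
  base_url: https://openapi.bazhuayu.com
  username: example_user          # placeholder credential
  password: example_password      # placeholder credential
  batch_size: 100
  tasks:
    - id: task-001                # placeholder task id
      name: example task
      enabled: true
kafka:
  bootstrap_servers: localhost:9092
  topic: job_data
  consumer_group: job_consumer_group
crawler:
  interval: 300
  filter_days: 7
  max_workers: 5
database:
  path: data/crawl_progress.db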
job_crawler/app/core/logging.py (new file, 22 lines)
@@ -0,0 +1,22 @@
"""Logging configuration"""
import logging
import sys
from .config import settings


def setup_logging():
    """Configure application logging"""
    level = logging.DEBUG if settings.app.debug else logging.INFO

    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )

    # Lower the log level of noisy third-party libraries
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("kafka").setLevel(logging.WARNING)
    logging.getLogger("uvicorn").setLevel(logging.INFO)
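A typical call site, sketched here as an assumption since the application entry point is not part of this excerpt, would run setup_logging() once at startup, before the FastAPI app object is created:

from fastapi import FastAPI

from app.core import settings, setup_logging

setup_logging()  # configure the root logger before any module starts logging
app = FastAPI(title=settings.app.name, version=settings.app.version, debug=settings.app.debug)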