feat(job_crawler): initialize job crawler service with kafka integration
- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from the third-party API
- Add Kafka service integration with a Docker Compose configuration for the message queue
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add a progress persistence layer using SQLite to track collection offsets
- Implement date filtering logic to keep only data published within the last 7 days (a sketch follows the commit message)
- Create an API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with a Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
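As a rough illustration of the 7-day date filter mentioned above: a record is kept only if its publish date falls within the configured window. This is a minimal sketch, not the code from this commit; the field name publish_date and the helper name are hypothetical, and filter_days mirrors the CrawlerConfig default added below.

from datetime import datetime, timedelta

def is_within_window(publish_date: datetime, filter_days: int = 7) -> bool:
    """Keep a record only if it was published within the last `filter_days` days."""
    cutoff = datetime.now() - timedelta(days=filter_days)
    return publish_date >= cutoff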
job_crawler/app/core/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
"""Core module"""
from .config import settings
from .logging import setup_logging

__all__ = ["settings", "setup_logging"]
job_crawler/app/core/config.py (new file, 89 lines)
@@ -0,0 +1,89 @@
"""Configuration management"""
import os
import yaml
from typing import Optional, List
from pydantic import BaseModel
from functools import lru_cache


class AppConfig(BaseModel):
    name: str = "job-crawler"
    version: str = "1.0.0"
    debug: bool = False


class TaskConfig(BaseModel):
    """Configuration for a single task"""
    id: str
    name: str = ""
    enabled: bool = True


class ApiConfig(BaseModel):
    base_url: str = "https://openapi.bazhuayu.com"
    username: str = ""
    password: str = ""
    batch_size: int = 100
    tasks: List[TaskConfig] = []


class KafkaConfig(BaseModel):
    bootstrap_servers: str = "localhost:9092"
    topic: str = "job_data"
    consumer_group: str = "job_consumer_group"


class CrawlerConfig(BaseModel):
    interval: int = 300
    filter_days: int = 7
    max_workers: int = 5


class DatabaseConfig(BaseModel):
    path: str = "data/crawl_progress.db"


class Settings(BaseModel):
    """Application settings"""
    app: AppConfig = AppConfig()
    api: ApiConfig = ApiConfig()
    kafka: KafkaConfig = KafkaConfig()
    crawler: CrawlerConfig = CrawlerConfig()
    database: DatabaseConfig = DatabaseConfig()

    @classmethod
    def from_yaml(cls, config_path: str) -> "Settings":
        """Load settings from a YAML file"""
        if not os.path.exists(config_path):
            return cls()

        with open(config_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f) or {}

        # Parse the task list separately so each entry becomes a TaskConfig
        api_data = data.get('api', {})
        tasks_data = api_data.pop('tasks', [])
        tasks = [TaskConfig(**t) for t in tasks_data]
        api_config = ApiConfig(**api_data, tasks=tasks)

        return cls(
            app=AppConfig(**data.get('app', {})),
            api=api_config,
            kafka=KafkaConfig(**data.get('kafka', {})),
            crawler=CrawlerConfig(**data.get('crawler', {})),
            database=DatabaseConfig(**data.get('database', {}))
        )

    def get_enabled_tasks(self) -> List[TaskConfig]:
        """Return the list of enabled tasks"""
        return [t for t in self.api.tasks if t.enabled]


@lru_cache()
def get_settings() -> Settings:
    """Return the cached application settings"""
    config_path = os.environ.get("CONFIG_PATH", "config/config.yml")
    return Settings.from_yaml(config_path)


settings = get_settings()
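For context, a minimal config/config.yml that Settings.from_yaml above would accept might look like the following; every value is a placeholder chosen from the model defaults, not a file taken from the repository.

app:
  name: job-crawler
  debug: false
api:
  base_url: https://openapi.bazhuayu.com
  username: example_user          # placeholder credential
  password: example_password      # placeholder credential
  batch_size: 100
  tasks:
    - id: task-001                # placeholder task id
      name: example task
      enabled: true
kafka:
  bootstrap_servers: localhost:9092
  topic: job_data
  consumer_group: job_consumer_group
crawler:
  interval: 300
  filter_days: 7
  max_workers: 5
database:
  path: data/crawl_progress.db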
job_crawler/app/core/logging.py (new file, 22 lines)
@@ -0,0 +1,22 @@
"""Logging configuration"""
import logging
import sys
from .config import settings


def setup_logging():
    """Configure application logging"""
    level = logging.DEBUG if settings.app.debug else logging.INFO

    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )

    # Lower the log level of noisy third-party libraries
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("kafka").setLevel(logging.WARNING)
    logging.getLogger("uvicorn").setLevel(logging.INFO)
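A typical call site, sketched here as an assumption since the application entry point is not part of this excerpt, would run setup_logging() once at startup, before the FastAPI app object is created:

from fastapi import FastAPI

from app.core import settings, setup_logging

setup_logging()  # configure the root logger before any module starts logging
app = FastAPI(title=settings.app.name, version=settings.app.version, debug=settings.app.debug)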