- Add comprehensive sequence diagrams documenting container startup, task initialization, and the incremental crawling flow
- Implement reverse-order crawling (from latest to oldest) so new data is processed first (see the sketch below)
- Publish Kafka messages in real time after each batch is filtered instead of waiting for task completion
- Update progress tracking to store last_start_offset for accurate incremental crawling across sessions
- Enhance the crawler service with improved offset calculation and batch-processing logic
- Update configuration files to support the new crawling parameters and Kafka integration
- Extend the progress model to track crawling state and handle edge cases
- Improve main application initialization to properly handle lifespan events and task auto-start

This change enables efficient incremental data collection where new data is prioritized and published immediately, reducing latency and improving system responsiveness.
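A rough sketch of the reverse-order flow follows. The fetch_batch, is_recent, publish_to_kafka, and save_progress helpers are placeholders invented for illustration and are not part of this change; only the control flow reflects the idea: walk batches from the newest offset backwards, publish each filtered batch immediately, and persist last_start_offset so the next session only needs to crawl newer data.

"""Illustrative sketch only: all helper functions below are stubs."""
from typing import Dict, List


def fetch_batch(task_id: str, offset: int, size: int) -> List[Dict]:
    return []  # placeholder: would call the crawl API


def is_recent(row: Dict) -> bool:
    return True  # placeholder: would compare the row's date against filter_days


def publish_to_kafka(rows: List[Dict]) -> None:
    pass  # placeholder: would produce to the configured Kafka topic


def save_progress(task_id: str, last_start_offset: int) -> None:
    pass  # placeholder: would write to the progress database


def crawl_task(task_id: str, total: int, batch_size: int, last_start_offset: int) -> None:
    # Newest data sits at the highest offsets, so start from the top batch.
    top_offset = max(total - batch_size, 0)
    offset = top_offset
    while offset > last_start_offset:
        batch = fetch_batch(task_id, offset, batch_size)
        fresh = [row for row in batch if is_recent(row)]
        if fresh:
            publish_to_kafka(fresh)  # publish after each batch filter, not at task end
        offset -= batch_size
    # Remember where this run started so the next session only crawls newer data.
    save_progress(task_id, last_start_offset=top_offset)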
92 lines · 2.4 KiB · Python
"""配置管理"""
|
|
import os
|
|
import yaml
|
|
from typing import Optional, List
|
|
from pydantic import BaseModel
|
|
from functools import lru_cache
|
|
|
|
|
|
class AppConfig(BaseModel):
|
|
name: str = "job-crawler"
|
|
version: str = "1.0.0"
|
|
debug: bool = False
|
|
|
|
|
|
class TaskConfig(BaseModel):
|
|
"""单个任务配置"""
|
|
id: str
|
|
name: str = ""
|
|
enabled: bool = True
|
|
|
|
|
|
class ApiConfig(BaseModel):
|
|
base_url: str = "https://openapi.bazhuayu.com"
|
|
username: str = ""
|
|
password: str = ""
|
|
batch_size: int = 100
|
|
tasks: List[TaskConfig] = []
|
|
|
|
|
|
class KafkaConfig(BaseModel):
|
|
bootstrap_servers: str = "localhost:9092"
|
|
topic: str = "job_data"
|
|
consumer_group: str = "job_consumer_group"
|
|
|
|
|
|
class CrawlerConfig(BaseModel):
|
|
interval: int = 300
|
|
filter_days: int = 7
|
|
max_workers: int = 5
|
|
max_expired_batches: int = 3 # 连续过期批次阈值
|
|
auto_start: bool = True # 容器启动时自动开始采集
|
|
|
|
|
|
class DatabaseConfig(BaseModel):
|
|
path: str = "data/crawl_progress.db"
|
|
|
|
|
|
class Settings(BaseModel):
|
|
"""应用配置"""
|
|
app: AppConfig = AppConfig()
|
|
api: ApiConfig = ApiConfig()
|
|
kafka: KafkaConfig = KafkaConfig()
|
|
crawler: CrawlerConfig = CrawlerConfig()
|
|
database: DatabaseConfig = DatabaseConfig()
|
|
|
|
@classmethod
|
|
def from_yaml(cls, config_path: str) -> "Settings":
|
|
"""从YAML文件加载配置"""
|
|
if not os.path.exists(config_path):
|
|
return cls()
|
|
|
|
with open(config_path, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f) or {}
|
|
|
|
# 解析tasks
|
|
api_data = data.get('api', {})
|
|
tasks_data = api_data.pop('tasks', [])
|
|
tasks = [TaskConfig(**t) for t in tasks_data]
|
|
api_config = ApiConfig(**api_data, tasks=tasks)
|
|
|
|
return cls(
|
|
app=AppConfig(**data.get('app', {})),
|
|
api=api_config,
|
|
kafka=KafkaConfig(**data.get('kafka', {})),
|
|
crawler=CrawlerConfig(**data.get('crawler', {})),
|
|
database=DatabaseConfig(**data.get('database', {}))
|
|
)
|
|
|
|
def get_enabled_tasks(self) -> List[TaskConfig]:
|
|
"""获取启用的任务列表"""
|
|
return [t for t in self.api.tasks if t.enabled]
|
|
|
|
|
|
@lru_cache()
|
|
def get_settings() -> Settings:
|
|
"""获取配置"""
|
|
config_path = os.environ.get("CONFIG_PATH", "config/config.yml")
|
|
return Settings.from_yaml(config_path)
|
|
|
|
|
|
settings = get_settings()
|
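For reference, a config.yml shaped like the hypothetical example below is what Settings.from_yaml expects. Every value is invented for illustration, and the import assumes the module above is saved as config.py.

# Hypothetical usage sketch: values are made up, but the YAML layout
# matches what Settings.from_yaml() parses.
import tempfile

from config import Settings  # assumes the module above is importable as config

EXAMPLE_YML = """
app:
  name: job-crawler
  debug: false
api:
  base_url: https://openapi.bazhuayu.com
  batch_size: 100
  tasks:
    - id: task-001
      name: demo-task
      enabled: true
    - id: task-002
      enabled: false
kafka:
  bootstrap_servers: kafka:9092
  topic: job_data
crawler:
  interval: 300
  filter_days: 7
  auto_start: true
database:
  path: data/crawl_progress.db
"""

with tempfile.NamedTemporaryFile("w", suffix=".yml", delete=False) as f:
    f.write(EXAMPLE_YML)

cfg = Settings.from_yaml(f.name)
print([t.id for t in cfg.get_enabled_tasks()])  # -> ['task-001']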
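The crawler.auto_start flag is meant to be honored during application startup. A minimal sketch using FastAPI's lifespan hook is below; start_all_tasks and stop_all_tasks are hypothetical stand-ins for the real crawler-service hooks, not part of this change.

# Rough sketch of lifespan handling that honors auto_start (hypothetical hooks).
from contextlib import asynccontextmanager

from fastapi import FastAPI

from config import get_settings  # assumes the module above is importable as config


async def start_all_tasks() -> None:
    """Placeholder for the real crawler start-up hook."""


async def stop_all_tasks() -> None:
    """Placeholder for the real crawler shutdown hook."""


@asynccontextmanager
async def lifespan(app: FastAPI):
    settings = get_settings()
    if settings.crawler.auto_start:
        # Kick off crawling as soon as the container is up.
        await start_all_tasks()
    yield
    await stop_all_tasks()


app = FastAPI(lifespan=lifespan)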