- Add comprehensive sequence diagrams documenting container startup, task initialization, and incremental crawling flow
- Implement reverse-order crawling logic (from latest to oldest) to optimize performance by processing new data first
- Add real-time Kafka message publishing after each batch filtering instead of waiting for task completion
- Update progress tracking to store last_start_offset for accurate incremental crawling across sessions
- Enhance crawler service with improved offset calculation and batch processing logic
- Update configuration files to support new crawling parameters and Kafka integration
- Add progress model enhancements to track crawling state and handle edge cases
- Improve main application initialization to properly handle lifespan events and task auto-start

This change enables efficient incremental data collection where new data is prioritized and published immediately, reducing latency and improving system responsiveness.
47 lines
1.2 KiB
Python
47 lines
1.2 KiB
Python
"""FastAPI application entry point."""
|
||
import asyncio
|
||
import logging
|
||
from contextlib import asynccontextmanager
|
||
from fastapi import FastAPI
|
||
from app.core.config import settings
|
||
from app.core.logging import setup_logging
|
||
from app.api import router
|
||
from app.services import kafka_service
|
||
|
||
# Run the project's logging setup at import time, before the module-level
# logger below is created.
setup_logging()
logger = logging.getLogger(__name__)
|
||
|
||
|
||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application startup and shutdown.

    Startup: when ``settings.crawler.auto_start`` is truthy, schedules
    ``crawler_manager.start_all()`` as a background task so that application
    startup is not blocked waiting for it.

    Shutdown: calls ``crawler_manager.stop_all()`` and then
    ``kafka_service.close()``. Both run inside ``finally`` blocks, so they
    still execute when an exception or cancellation is thrown into the
    generator at ``yield``, and the Kafka close still runs if stopping the
    crawlers raises.

    Args:
        app: The FastAPI application this lifespan is attached to (not used
            inside the body).

    Yields:
        None. Control returns to the framework while the application serves.
    """
    logger.info("服务启动中...")

    # Keep a strong reference to the background task for as long as the
    # application runs: the event loop only holds weak references to tasks,
    # so an unreferenced task may be garbage-collected before it completes.
    auto_start_task = None

    # Automatically start all crawl tasks.
    if settings.crawler.auto_start:
        # Imported lazily, exactly where the original imported it.
        from app.services import crawler_manager

        logger.info("自动启动采集任务...")
        auto_start_task = asyncio.create_task(crawler_manager.start_all())

    try:
        yield
    finally:
        logger.info("服务关闭中...")
        from app.services import crawler_manager

        try:
            # NOTE(review): called without ``await`` as in the original; if
            # ``stop_all`` is a coroutine function this needs an ``await`` —
            # confirm against app.services.
            crawler_manager.stop_all()
        finally:
            # Always release the Kafka client, even if stopping crawlers failed.
            kafka_service.close()
|
||
|
||
|
||
# ASGI application object. The ``lifespan`` context manager defined above is
# passed in so its startup/shutdown code runs with the application.
app = FastAPI(
    title="招聘数据采集服务",
    description="从八爪鱼API采集招聘数据,通过Kafka提供消费接口",
    version=settings.app.version,
    lifespan=lifespan,
)

# Register the routes exported by ``app.api``.
app.include_router(router)
|
||
|
||
|
||
if __name__ == "__main__":
    # Direct-execution entry point: serve the app with uvicorn on all
    # interfaces, port 8000, with uvicorn's ``reload`` option enabled.
    import uvicorn

    uvicorn.run(
        "app.main:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
    )
|