rabbitmq
@@ -5,7 +5,7 @@ from typing import Optional
 from fastapi import APIRouter, Query, BackgroundTasks, HTTPException
 from fastapi.responses import StreamingResponse
 from app.models import ApiResponse, ConsumeResponse, StatusResponse
-from app.services import crawler_manager, kafka_service
+from app.services import crawler_manager, rabbitmq_service
 
 logger = logging.getLogger(__name__)
 router = APIRouter()
@@ -84,29 +84,18 @@ async def stop_crawl(
 
 @router.get("/consume", response_model=ConsumeResponse)
 async def consume_data(
-    batch_size: int = Query(10, ge=1, le=100, description="批量大小"),
-    timeout: int = Query(5000, ge=1000, le=30000, description="超时时间(毫秒)")
+    batch_size: int = Query(10, ge=1, le=100, description="批量大小")
 ):
-    """消费Kafka数据"""
+    """消费RabbitMQ数据"""
     try:
-        messages = kafka_service.consume(batch_size, timeout)
+        messages = rabbitmq_service.consume(batch_size)
         return ConsumeResponse(data=messages, count=len(messages))
     except Exception as e:
         logger.error(f"消费数据失败: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 
 
-@router.get("/consume/stream")
-async def consume_stream():
-    """SSE流式消费"""
-    async def event_generator():
-        consumer = kafka_service.get_consumer()
-        try:
-            for message in consumer:
-                data = json.dumps(message.value, ensure_ascii=False)
-                yield f"data: {data}\n\n"
-        except Exception as e:
-            logger.error(f"流式消费错误: {e}")
-        finally:
-            consumer.close()
-    return StreamingResponse(event_generator(), media_type="text/event-stream")
+@router.get("/queue/size")
+async def get_queue_size():
+    """获取队列消息数量"""
+    return {"queue_size": rabbitmq_service.get_queue_size()}
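Note: the `/consume` and `/queue/size` endpoints above call `rabbitmq_service.consume(batch_size)` and `rabbitmq_service.get_queue_size()`, but `rabbitmq_service.py` itself is not part of this diff. A minimal sketch of what those methods could look like with pika (the client added in requirements.txt below); the class layout and method bodies here are assumptions, not the actual implementation:

```python
# Hypothetical sketch of rabbitmq_service.consume / get_queue_size using pika.
import json
import pika


class RabbitMQService:
    def __init__(self, host="localhost", port=5672, username="guest",
                 password="guest", queue="job_data"):
        credentials = pika.PlainCredentials(username, password)
        self._conn = pika.BlockingConnection(
            pika.ConnectionParameters(host=host, port=port, credentials=credentials))
        self._channel = self._conn.channel()
        self.queue = queue

    def consume(self, batch_size: int) -> list:
        """Pull up to batch_size messages with basic_get; stop when the queue is empty."""
        messages = []
        for _ in range(batch_size):
            # auto_ack=True removes the message from the queue immediately (assumed behavior)
            method, _props, body = self._channel.basic_get(self.queue, auto_ack=True)
            if method is None:  # queue drained
                break
            messages.append(json.loads(body))
        return messages

    def get_queue_size(self) -> int:
        """Passively declare the queue and read its current message count."""
        result = self._channel.queue_declare(queue=self.queue, passive=True)
        return result.method.message_count
```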
@@ -27,10 +27,13 @@ class ApiConfig(BaseModel):
     tasks: List[TaskConfig] = []
 
 
-class KafkaConfig(BaseModel):
-    bootstrap_servers: str = "localhost:9092"
-    topic: str = "job_data"
-    consumer_group: str = "job_consumer_group"
+class RabbitMQConfig(BaseModel):
+    host: str = "localhost"
+    port: int = 5672
+    username: str = "guest"
+    password: str = "guest"
+    queue: str = "job_data"
+    message_ttl: int = 604800000  # 7天(毫秒)
 
 
 class CrawlerConfig(BaseModel):
@@ -49,7 +52,7 @@ class Settings(BaseModel):
     """应用配置"""
     app: AppConfig = AppConfig()
     api: ApiConfig = ApiConfig()
-    kafka: KafkaConfig = KafkaConfig()
+    rabbitmq: RabbitMQConfig = RabbitMQConfig()
     crawler: CrawlerConfig = CrawlerConfig()
     database: DatabaseConfig = DatabaseConfig()
 
@@ -71,7 +74,7 @@ class Settings(BaseModel):
         return cls(
             app=AppConfig(**data.get('app', {})),
             api=api_config,
-            kafka=KafkaConfig(**data.get('kafka', {})),
+            rabbitmq=RabbitMQConfig(**data.get('rabbitmq', {})),
             crawler=CrawlerConfig(**data.get('crawler', {})),
             database=DatabaseConfig(**data.get('database', {}))
         )
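Note: `RabbitMQConfig` replaces `KafkaConfig`, and `Settings` now builds it from a `rabbitmq` section of the YAML file. A small example of how that section maps onto the model, assuming `yaml.safe_load` as the loader (the loading code itself is outside this hunk):

```python
# How a `rabbitmq:` YAML section feeds RabbitMQConfig (loader assumed, model from the diff).
import yaml
from pydantic import BaseModel


class RabbitMQConfig(BaseModel):
    host: str = "localhost"
    port: int = 5672
    username: str = "guest"
    password: str = "guest"
    queue: str = "job_data"
    message_ttl: int = 604800000  # 7 days in milliseconds


raw = yaml.safe_load("""
rabbitmq:
  host: rabbitmq
  port: 5672
  queue: job_data
""")
cfg = RabbitMQConfig(**raw.get("rabbitmq", {}))
print(cfg.host, cfg.port, cfg.queue)  # rabbitmq 5672 job_data
```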
@@ -6,7 +6,7 @@ from fastapi import FastAPI
 from app.core.config import settings
 from app.core.logging import setup_logging
 from app.api import router
-from app.services import kafka_service
+from app.services import rabbitmq_service
 
 setup_logging()
 logger = logging.getLogger(__name__)
@@ -28,7 +28,7 @@ async def lifespan(app: FastAPI):
     logger.info("服务关闭中...")
     from app.services import crawler_manager
     crawler_manager.stop_all()
-    kafka_service.close()
+    rabbitmq_service.close()
 
 
 app = FastAPI(
@@ -1,60 +1,24 @@
 """招聘数据模型"""
-from pydantic import BaseModel
 from datetime import datetime
 import uuid
 
 
-class JobData(BaseModel):
-    """招聘数据模型"""
-    id: str = ""
-    task_id: str = ""  # 任务ID
-    job_category: str = ""  # Std_class - 职位分类
-    job_title: str = ""  # aca112 - 职位名称
-    company: str = ""  # AAB004 - 公司名称
-    company_type: str = ""  # AAB019 - 企业类型
-    salary: str = ""  # acb241 - 薪资范围
-    location: str = ""  # aab302 - 工作地点
-    address: str = ""  # AAE006 - 详细地址
-    publish_date: str = ""  # aae397 - 发布日期
-    collect_time: str = ""  # Collect_time - 采集时间
-    url: str = ""  # ACE760 - 职位链接
-    description: str = ""  # acb22a - 职位描述
-    experience: str = ""  # Experience - 经验要求
-    education: str = ""  # aac011 - 学历要求
-    headcount: str = ""  # acb240 - 招聘人数
-    industry: str = ""  # AAB022 - 行业
-    company_size: str = ""  # Num_employers - 公司规模
-    contact: str = ""  # AAE004 - 联系人
-    company_intro: str = ""  # AAB092 - 公司简介
-    crawl_time: str = ""  # 入库时间
+class JobData:
+    """招聘数据 - 保留原始数据格式"""
 
-    def __init__(self, **data):
-        super().__init__(**data)
-        if not self.id:
-            self.id = str(uuid.uuid4())
-        if not self.crawl_time:
-            self.crawl_time = datetime.now().isoformat()
+    def __init__(self, raw_data: dict, task_id: str = ""):
+        self.raw_data = raw_data
+        self.task_id = task_id
+        # 添加元数据
+        self.raw_data["_id"] = str(uuid.uuid4())
+        self.raw_data["_task_id"] = task_id
+        self.raw_data["_crawl_time"] = datetime.now().isoformat()
+
+    def to_dict(self) -> dict:
+        """转换为字典(原始数据 + 元数据)"""
+        return self.raw_data
 
     @classmethod
-    def from_raw(cls, raw: dict) -> "JobData":
-        """从原始API数据转换"""
-        return cls(
-            job_category=raw.get("Std_class", ""),
-            job_title=raw.get("aca112", ""),
-            company=raw.get("AAB004", ""),
-            company_type=raw.get("AAB019", "").strip(),
-            salary=raw.get("acb241", ""),
-            location=raw.get("aab302", ""),
-            address=raw.get("AAE006", ""),
-            publish_date=raw.get("aae397", ""),
-            collect_time=raw.get("Collect_time", ""),
-            url=raw.get("ACE760", ""),
-            description=raw.get("acb22a", ""),
-            experience=raw.get("Experience", ""),
-            education=raw.get("aac011", ""),
-            headcount=raw.get("acb240", ""),
-            industry=raw.get("AAB022", ""),
-            company_size=raw.get("Num_employers", ""),
-            contact=raw.get("AAE004", ""),
-            company_intro=raw.get("AAB092", ""),
-        )
+    def from_raw(cls, raw: dict, task_id: str = "") -> "JobData":
+        """从原始API数据创建"""
+        return cls(raw.copy(), task_id)
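Note: `JobData` is no longer a pydantic model with per-field mapping; it now wraps the raw API dict and stamps `_id`, `_task_id` and `_crawl_time` metadata onto it. A short usage example based on the new class:

```python
# Using the new raw-dict JobData (behavior taken from the class in the diff above).
from app.models import JobData

raw = {"aca112": "Python开发工程师", "AAB004": "某某科技有限公司", "acb241": "15k-25k"}

job = JobData.from_raw(raw, task_id="task_1")  # from_raw copies the dict before mutating it
data = job.to_dict()

# Original keys are preserved as-is; metadata keys carry a leading underscore.
assert data["aca112"] == "Python开发工程师"
assert set(data) >= {"_id", "_task_id", "_crawl_time"}
print(data["_task_id"])  # task_1
```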
@@ -18,7 +18,7 @@ class CrawlStatus(BaseModel):
     total: int
     last_start_offset: Optional[int] = None
     progress: str
-    kafka_lag: int = 0
+    queue_size: int = 0
     status: str
     last_update: str
     filtered_count: int = 0
@@ -1,12 +1,12 @@
 """服务模块"""
 from .api_client import api_client, BazhuayuClient
-from .kafka_service import kafka_service, KafkaService
+from .rabbitmq_service import rabbitmq_service, RabbitMQService
 from .progress_store import progress_store, ProgressStore
 from .crawler import crawler_manager, CrawlerManager, TaskCrawler
 
 __all__ = [
     "api_client", "BazhuayuClient",
-    "kafka_service", "KafkaService",
+    "rabbitmq_service", "RabbitMQService",
     "progress_store", "ProgressStore",
     "crawler_manager", "CrawlerManager", "TaskCrawler"
 ]
@@ -4,7 +4,7 @@ import logging
 from typing import Dict, Optional
 from concurrent.futures import ThreadPoolExecutor
 from app.services.api_client import api_client
-from app.services.kafka_service import kafka_service
+from app.services.rabbitmq_service import rabbitmq_service
 from app.services.progress_store import progress_store
 from app.utils import is_within_days
 from app.models import JobData
@@ -134,21 +134,20 @@ class TaskCrawler:
             aae397 = raw.get("aae397", "")
             collect_time = raw.get("Collect_time", "")
             if is_within_days(aae397, collect_time, self.filter_days):
-                job = JobData.from_raw(raw)
-                job.task_id = self.task_id
+                job = JobData.from_raw(raw, self.task_id)
                 filtered_jobs.append(job)
 
         valid_count = len(filtered_jobs)
         expired_count = len(data_list) - valid_count
         self._total_filtered += valid_count
 
-        # 立即发送到Kafka
+        # 立即发送到RabbitMQ
        produced = 0
         if filtered_jobs:
-            produced = kafka_service.produce_batch(filtered_jobs)
+            produced = rabbitmq_service.produce_batch(filtered_jobs)
         self._total_produced += produced
 
-        logger.info(f"[{self.task_name}] offset={offset}, 获取={len(data_list)}, 有效={valid_count}, 过期={expired_count}, 发送Kafka={produced}")
+        logger.info(f"[{self.task_name}] offset={offset}, 获取={len(data_list)}, 有效={valid_count}, 过期={expired_count}, 发送MQ={produced}")
 
         return valid_count
@@ -236,7 +235,7 @@ class CrawlerManager:
             return crawler.get_status() if crawler else {}
         return {
             "tasks": [c.get_status() for c in self._crawlers.values()],
-            "kafka_lag": kafka_service.get_lag(),
+            "queue_size": rabbitmq_service.get_queue_size(),
             "running_count": sum(1 for c in self._crawlers.values() if c.is_running)
         }
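Note: the crawler now hands the filtered jobs to `rabbitmq_service.produce_batch(filtered_jobs)`, whose implementation is not shown in this commit. A hedged sketch of a pika-based batch publish that sends each job's `to_dict()` payload as a persistent JSON message; the standalone function and its parameters are illustrative only:

```python
# Hypothetical sketch of a produce_batch helper (not part of this diff).
import json
import pika


def produce_batch(channel: "pika.adapters.blocking_connection.BlockingChannel",
                  queue: str, jobs: list) -> int:
    """Publish each JobData to RabbitMQ; return the number of messages sent."""
    sent = 0
    for job in jobs:
        body = json.dumps(job.to_dict(), ensure_ascii=False)
        channel.basic_publish(
            exchange="",                 # default exchange routes by queue name
            routing_key=queue,
            body=body.encode("utf-8"),
            properties=pika.BasicProperties(delivery_mode=2),  # persistent message
        )
        sent += 1
    return sent
```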
@@ -66,7 +66,8 @@ class KafkaService:
     def produce(self, job_data: JobData) -> bool:
         """发送消息到Kafka"""
         try:
-            future = self.producer.send(self.topic, key=job_data.id, value=job_data.model_dump())
+            data = job_data.to_dict()
+            future = self.producer.send(self.topic, key=data.get("_id"), value=data)
             future.get(timeout=10)
             return True
         except KafkaError as e:
@@ -24,11 +24,14 @@ api:
       name: "任务3"
       enabled: false
 
-# Kafka配置
-kafka:
-  bootstrap_servers: kafka:29092
-  topic: job_data
-  consumer_group: job_consumer_group
+# RabbitMQ配置
+rabbitmq:
+  host: rabbitmq
+  port: 5672
+  username: guest
+  password: guest
+  queue: job_data
+  message_ttl: 604800000  # 消息过期时间:7天(毫秒)
 
 # 采集配置
 crawler:
@@ -22,11 +22,14 @@ api:
       name: "任务2"
       enabled: false
 
-# Kafka配置(Docker内部网络)
-kafka:
-  bootstrap_servers: kafka:29092
-  topic: job_data
-  consumer_group: job_consumer_group
+# RabbitMQ配置
+rabbitmq:
+  host: rabbitmq
+  port: 5672
+  username: guest
+  password: guest
+  queue: job_data
+  message_ttl: 604800000  # 消息过期时间:7天(毫秒)
 
 # 采集配置
 crawler:
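Note: both config files set `message_ttl` to 604800000 ms (7 days). In RabbitMQ a per-queue TTL is typically applied through the `x-message-ttl` argument when the queue is declared, and re-declaring the same queue with different arguments fails with a PRECONDITION_FAILED channel error. A minimal sketch, assuming the service declares the queue itself (the declare call is not in this diff):

```python
# Hedged sketch: declaring the queue with the configured TTL (values from the config above).
import pika

params = pika.ConnectionParameters(
    host="rabbitmq", port=5672,
    credentials=pika.PlainCredentials("guest", "guest"))
with pika.BlockingConnection(params) as conn:
    channel = conn.channel()
    channel.queue_declare(
        queue="job_data",
        durable=True,  # assumed; durability is not specified by the config
        arguments={"x-message-ttl": 604800000},  # 7 days in milliseconds
    )
```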
@@ -1,51 +1,23 @@
 version: '3.8'
 
 services:
-  zookeeper:
-    image: confluentinc/cp-zookeeper:7.5.0
-    container_name: job-zookeeper
+  rabbitmq:
+    image: rabbitmq:3.12-management
+    container_name: job-rabbitmq
     ports:
-      - "2181:2181"
+      - "5672:5672"
+      - "15672:15672"
     environment:
-      ZOOKEEPER_CLIENT_PORT: 2181
-      ZOOKEEPER_TICK_TIME: 2000
+      RABBITMQ_DEFAULT_USER: guest
+      RABBITMQ_DEFAULT_PASS: guest
     volumes:
-      - zookeeper_data:/var/lib/zookeeper/data
+      - rabbitmq_data:/var/lib/rabbitmq
     healthcheck:
-      test: ["CMD", "nc", "-z", "localhost", "2181"]
+      test: ["CMD", "rabbitmq-diagnostics", "check_running"]
       interval: 10s
       timeout: 5s
       retries: 5
     networks:
       - job-network
 
-  kafka:
-    image: confluentinc/cp-kafka:7.5.0
-    container_name: job-kafka
-    ports:
-      - "9092:9092"
-      - "29092:29092"
-    environment:
-      KAFKA_BROKER_ID: 1
-      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
-      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
-      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
-      KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
-      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
-      KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true"
-    volumes:
-      - kafka_data:/var/lib/kafka/data
-    depends_on:
-      zookeeper:
-        condition: service_healthy
-    healthcheck:
-      test: ["CMD", "kafka-topics", "--bootstrap-server", "localhost:9092", "--list"]
-      interval: 10s
-      timeout: 10s
-      retries: 5
-    networks:
-      - job-network
-
   app:
     image: job-crawler:latest
     container_name: job-crawler
@@ -57,7 +29,7 @@ services:
       - ./config:/app/config:ro
       - app_data:/app/data
     depends_on:
-      kafka:
+      rabbitmq:
         condition: service_healthy
     restart: unless-stopped
     networks:
@@ -68,6 +40,5 @@ networks:
     driver: bridge
 
 volumes:
-  zookeeper_data:
-  kafka_data:
+  rabbitmq_data:
   app_data:
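Note: the compose file publishes 5672 (AMQP) and 15672 (management UI) to the host. A quick connectivity check from the host, assuming the guest/guest credentials set in the service environment:

```python
# Smoke test against the rabbitmq container from the host (ports and credentials
# as published in docker-compose above; adjust if they differ in your setup).
import pika

params = pika.ConnectionParameters(
    host="localhost", port=5672,
    credentials=pika.PlainCredentials("guest", "guest"))
conn = pika.BlockingConnection(params)
print("RabbitMQ reachable:", conn.is_open)
conn.close()
```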
@@ -1,8 +1,7 @@
 fastapi==0.109.0
 uvicorn==0.27.0
 httpx==0.27.0
-kafka-python==2.0.2
+pika==1.3.2
 apscheduler==3.10.4
 pydantic==2.5.3
 python-dotenv==1.0.0
 PyYAML==6.0.1