feat(job_crawler): initialize job crawler service with kafka integration
- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from third-party API
- Add Kafka service integration with Docker Compose configuration for message queue
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management (see the client sketch after this list)
- Add progress persistence layer using SQLite for tracking collection offsets
- Implement date filtering logic to extract data published within 7 days
- Create API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
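
As a quick orientation for reviewers, here is a minimal, hypothetical sketch of how a downstream consumer could call the new data-consumption endpoints. The /consume and /status paths come from this commit; the host, port, the batch_size query parameter, and the JSON response shape are illustrative assumptions, not part of the committed code.

# Hypothetical client-side sketch; not included in this commit.
# Assumes the FastAPI service listens on http://localhost:8000 and that
# /consume returns a JSON array of job records (response shape assumed).
import httpx

BASE_URL = "http://localhost:8000"  # assumed host/port


def pull_batch(batch_size: int = 10) -> list:
    # batch_size is an assumed query parameter, shown for illustration only
    resp = httpx.get(f"{BASE_URL}/consume", params={"batch_size": batch_size}, timeout=30)
    resp.raise_for_status()
    return resp.json()


def crawl_status() -> dict:
    resp = httpx.get(f"{BASE_URL}/status", timeout=30)
    resp.raise_for_status()
    return resp.json()


if __name__ == "__main__":
    print(crawl_status())
    print(pull_batch(5))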

job_crawler/app/services/__init__.py (new file, 12 lines)
@@ -0,0 +1,12 @@
"""Service modules."""
from .api_client import api_client, BazhuayuClient
from .kafka_service import kafka_service, KafkaService
from .progress_store import progress_store, ProgressStore
from .crawler import crawler_manager, CrawlerManager, TaskCrawler

__all__ = [
    "api_client", "BazhuayuClient",
    "kafka_service", "KafkaService",
    "progress_store", "ProgressStore",
    "crawler_manager", "CrawlerManager", "TaskCrawler"
]

job_crawler/app/services/api_client.py (new file, 91 lines)
@@ -0,0 +1,91 @@
"""Bazhuayu API client."""
import httpx
import time
import logging
from typing import Optional, Dict, Any
from app.core.config import settings

logger = logging.getLogger(__name__)


class BazhuayuClient:
    """Bazhuayu API client."""

    def __init__(self):
        self.base_url = settings.api.base_url
        self.username = settings.api.username
        self.password = settings.api.password
        self._access_token: Optional[str] = None
        self._token_expires_at: float = 0

    async def _get_token(self) -> str:
        """Fetch an access token, reusing the cached one until shortly before it expires."""
        if self._access_token and time.time() < self._token_expires_at - 300:
            return self._access_token

        logger.info("Requesting a new access_token...")

        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.base_url}/token",
                json={
                    "username": self.username,
                    "password": self.password,
                    "grant_type": "password"
                },
                headers={
                    "Content-Type": "application/json",
                    "Accept": "*/*"
                },
                timeout=30
            )

        if response.status_code != 200:
            raise Exception(f"Failed to obtain token: {response.status_code} - {response.text}")

        data = response.json()
        token_data = data.get("data", {})

        self._access_token = token_data.get("access_token")
        expires_in = int(token_data.get("expires_in", 86400))
        self._token_expires_at = time.time() + expires_in

        logger.info(f"Token obtained, valid for {expires_in} seconds")
        return self._access_token

    async def fetch_data(self, task_id: str, offset: int, size: int = 100) -> Dict[str, Any]:
        """Fetch a page of task data."""
        token = await self._get_token()

        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"{self.base_url}/data/all",
                params={
                    "taskId": task_id,
                    "offset": offset,
                    "size": size
                },
                headers={
                    "Authorization": f"Bearer {token}",
                    "Accept": "*/*"
                },
                timeout=60
            )

        if response.status_code == 401:
            # Token expired or was revoked: clear the cache and retry with a fresh token.
            self._access_token = None
            self._token_expires_at = 0
            return await self.fetch_data(task_id, offset, size)

        if response.status_code != 200:
            raise Exception(f"Failed to fetch data: {response.status_code} - {response.text}")

        return response.json()

    async def get_total_count(self, task_id: str) -> int:
        """Get the total number of records for a task."""
        result = await self.fetch_data(task_id, 0, 1)
        return result.get("data", {}).get("total", 0)


api_client = BazhuayuClient()
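
A minimal usage sketch for the client above, assuming the base URL and credentials in settings.api are configured; the task ID is a placeholder.

# Hypothetical standalone usage of BazhuayuClient via the module-level instance.
import asyncio
from app.services.api_client import api_client


async def main():
    task_id = "demo-task-id"  # placeholder, not a real task ID
    total = await api_client.get_total_count(task_id)
    page = await api_client.fetch_data(task_id, offset=0, size=10)
    print(f"total={total}, fetched={len(page.get('data', {}).get('data', []))}")


asyncio.run(main())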

job_crawler/app/services/crawler.py (new file, 209 lines)
@@ -0,0 +1,209 @@
"""Core logic for multi-task incremental collection."""
import asyncio
import logging
from typing import Dict, Optional
from concurrent.futures import ThreadPoolExecutor
from app.services.api_client import api_client
from app.services.kafka_service import kafka_service
from app.services.progress_store import progress_store
from app.utils import is_within_days
from app.models import JobData
from app.core.config import settings, TaskConfig

logger = logging.getLogger(__name__)


class TaskCrawler:
    """Collector for a single task."""

    def __init__(self, task_config: TaskConfig):
        self.task_id = task_config.id
        self.task_name = task_config.name or task_config.id
        self.batch_size = settings.api.batch_size
        self.filter_days = settings.crawler.filter_days
        self._running = False
        self._total_filtered = 0
        self._total_produced = 0

    @property
    def is_running(self) -> bool:
        return self._running

    async def start(self, reset: bool = False):
        """Start collecting."""
        if self._running:
            logger.warning(f"[{self.task_name}] Task is already running")
            return

        self._running = True
        self._total_filtered = 0
        self._total_produced = 0
        logger.info(f"[{self.task_name}] Starting collection task")

        try:
            if reset:
                progress_store.reset_progress(self.task_id)
                current_offset = 0
            else:
                progress = progress_store.get_progress(self.task_id)
                current_offset = progress.current_offset if progress else 0

            total = await api_client.get_total_count(self.task_id)
            logger.info(f"[{self.task_name}] Total records: {total}, current offset: {current_offset}")

            if current_offset >= total:
                logger.info(f"[{self.task_name}] All data has already been collected")
                progress_store.save_progress(self.task_id, current_offset, total, "completed",
                                             self._total_filtered, self._total_produced)
                self._running = False
                return

            while current_offset < total and self._running:
                try:
                    await self._crawl_batch(current_offset)
                    current_offset += self.batch_size
                    progress_store.save_progress(self.task_id, current_offset, total, "running",
                                                 self._total_filtered, self._total_produced)
                    progress_pct = min(100, current_offset / total * 100)
                    logger.info(f"[{self.task_name}] Progress: {progress_pct:.2f}% ({current_offset}/{total})")
                    await asyncio.sleep(0.5)
                except Exception as e:
                    logger.error(f"[{self.task_name}] Batch collection failed: {e}")
                    await asyncio.sleep(5)

            status = "completed" if current_offset >= total else "stopped"
            progress_store.save_progress(self.task_id, current_offset, total, status,
                                         self._total_filtered, self._total_produced)
            logger.info(f"[{self.task_name}] Collection task {status}")
        except Exception as e:
            logger.error(f"[{self.task_name}] Collection task error: {e}")
            progress_store.save_progress(self.task_id, 0, 0, "error",
                                         self._total_filtered, self._total_produced)
        finally:
            self._running = False

    async def _crawl_batch(self, offset: int):
        """Collect one batch of data."""
        result = await api_client.fetch_data(self.task_id, offset, self.batch_size)
        data_list = result.get("data", {}).get("data", [])
        if not data_list:
            return

        filtered_jobs = []
        for raw in data_list:
            aae397 = raw.get("aae397", "")
            collect_time = raw.get("Collect_time", "")
            if is_within_days(aae397, collect_time, self.filter_days):
                job = JobData.from_raw(raw)
                job.task_id = self.task_id  # tag the record with its task ID
                filtered_jobs.append(job)

        self._total_filtered += len(filtered_jobs)
        if filtered_jobs:
            produced = kafka_service.produce_batch(filtered_jobs)
            self._total_produced += produced

    def stop(self):
        """Stop collecting."""
        logger.info(f"[{self.task_name}] Stopping collection task...")
        self._running = False

    def get_status(self) -> dict:
        """Return the collection status for this task."""
        stats = progress_store.get_stats(self.task_id)
        if not stats:
            return {
                "task_id": self.task_id, "task_name": self.task_name,
                "total": 0, "current_offset": 0, "progress": "0%",
                "status": "idle", "last_update": "",
                "filtered_count": 0, "produced_count": 0
            }
        total = stats.get("total", 0)
        current = stats.get("current_offset", 0)
        progress = f"{min(100, current / total * 100):.2f}%" if total > 0 else "0%"
        return {
            "task_id": self.task_id, "task_name": self.task_name,
            "total": total, "current_offset": current, "progress": progress,
            "status": stats.get("status", "idle"), "last_update": stats.get("last_update", ""),
            "filtered_count": stats.get("filtered_count", 0),
            "produced_count": stats.get("produced_count", 0)
        }


class CrawlerManager:
    """Manager for multiple collection tasks."""

    def __init__(self):
        self._crawlers: Dict[str, TaskCrawler] = {}
        self._executor = ThreadPoolExecutor(max_workers=settings.crawler.max_workers)
        self._init_crawlers()

    def _init_crawlers(self):
        """Initialize a collector for every enabled task."""
        for task in settings.get_enabled_tasks():
            self._crawlers[task.id] = TaskCrawler(task)
            logger.info(f"Initialized task collector: {task.name} ({task.id})")

    def get_crawler(self, task_id: str) -> Optional[TaskCrawler]:
        """Return the collector for the given task."""
        return self._crawlers.get(task_id)

    def get_all_crawlers(self) -> Dict[str, TaskCrawler]:
        """Return all collectors."""
        return self._crawlers

    async def start_task(self, task_id: str, reset: bool = False) -> bool:
        """Start a single task."""
        crawler = self._crawlers.get(task_id)
        if not crawler:
            logger.error(f"Task not found: {task_id}")
            return False
        if crawler.is_running:
            logger.warning(f"Task is already running: {task_id}")
            return False
        asyncio.create_task(crawler.start(reset))
        return True

    async def start_all(self, reset: bool = False):
        """Start all tasks."""
        tasks = []
        for task_id, crawler in self._crawlers.items():
            if not crawler.is_running:
                tasks.append(crawler.start(reset))
        if tasks:
            await asyncio.gather(*tasks)

    def stop_task(self, task_id: str) -> bool:
        """Stop a single task."""
        crawler = self._crawlers.get(task_id)
        if not crawler:
            return False
        crawler.stop()
        return True

    def stop_all(self):
        """Stop all tasks."""
        for crawler in self._crawlers.values():
            crawler.stop()

    def get_status(self, task_id: Optional[str] = None) -> dict:
        """Return the status of one task, or an overview of all tasks."""
        if task_id:
            crawler = self._crawlers.get(task_id)
            return crawler.get_status() if crawler else {}

        # Return the status of every task
        return {
            "tasks": [c.get_status() for c in self._crawlers.values()],
            "kafka_lag": kafka_service.get_lag(),
            "running_count": sum(1 for c in self._crawlers.values() if c.is_running)
        }

    @property
    def is_any_running(self) -> bool:
        """Whether any task is currently running."""
        return any(c.is_running for c in self._crawlers.values())


# Global manager instance
crawler_manager = CrawlerManager()
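
A sketch of driving the manager outside the FastAPI app, assuming the enabled tasks are configured in settings; start_all runs every idle task to completion (or until stopped) before returning.

# Hypothetical entry point: run all enabled tasks once, then print the overview.
import asyncio
from app.services.crawler import crawler_manager


async def main():
    await crawler_manager.start_all(reset=False)
    print(crawler_manager.get_status())  # includes per-task status and kafka_lag


asyncio.run(main())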

job_crawler/app/services/kafka_service.py (new file, 138 lines)
@@ -0,0 +1,138 @@
"""Kafka service."""
import json
import logging
from typing import List, Optional
from kafka import KafkaProducer, KafkaConsumer
from kafka.errors import KafkaError
from kafka.admin import KafkaAdminClient, NewTopic
from app.models import JobData
from app.core.config import settings

logger = logging.getLogger(__name__)


class KafkaService:
    """Kafka producer/consumer service."""

    def __init__(self):
        self.bootstrap_servers = settings.kafka.bootstrap_servers
        self.topic = settings.kafka.topic
        self.consumer_group = settings.kafka.consumer_group
        self._producer: Optional[KafkaProducer] = None
        self._ensure_topic()

    def _ensure_topic(self):
        """Make sure the topic exists, creating it if necessary."""
        try:
            admin = KafkaAdminClient(
                bootstrap_servers=self.bootstrap_servers,
                client_id="job_crawler_admin"
            )
            existing_topics = admin.list_topics()

            if self.topic not in existing_topics:
                topic = NewTopic(name=self.topic, num_partitions=3, replication_factor=1)
                admin.create_topics([topic])
                logger.info(f"Created topic: {self.topic}")
            admin.close()
        except Exception as e:
            logger.warning(f"Failed to check/create topic: {e}")

    @property
    def producer(self) -> KafkaProducer:
        """Lazily create and return the producer instance."""
        if self._producer is None:
            self._producer = KafkaProducer(
                bootstrap_servers=self.bootstrap_servers,
                value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
                key_serializer=lambda k: k.encode('utf-8') if k else None,
                acks='all',
                retries=3
            )
        return self._producer

    def get_consumer(self, auto_offset_reset: str = 'earliest') -> KafkaConsumer:
        """Create a consumer instance."""
        return KafkaConsumer(
            self.topic,
            bootstrap_servers=self.bootstrap_servers,
            group_id=self.consumer_group,
            auto_offset_reset=auto_offset_reset,
            enable_auto_commit=True,
            value_deserializer=lambda m: json.loads(m.decode('utf-8')),
            consumer_timeout_ms=5000
        )

    def produce(self, job_data: JobData) -> bool:
        """Send one message to Kafka."""
        try:
            future = self.producer.send(self.topic, key=job_data.id, value=job_data.model_dump())
            future.get(timeout=10)
            return True
        except KafkaError as e:
            logger.error(f"Failed to send message: {e}")
            return False

    def produce_batch(self, job_list: List[JobData]) -> int:
        """Send a batch of messages and return the number sent successfully."""
        success_count = 0
        for job in job_list:
            if self.produce(job):
                success_count += 1
        self.producer.flush()
        return success_count

    def consume(self, batch_size: int = 10, timeout_ms: int = 5000) -> List[dict]:
        """Consume up to batch_size messages."""
        messages = []
        consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers=self.bootstrap_servers,
            group_id=self.consumer_group,
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda m: json.loads(m.decode('utf-8')),
            consumer_timeout_ms=timeout_ms,
            max_poll_records=batch_size
        )
        try:
            for message in consumer:
                messages.append(message.value)
                if len(messages) >= batch_size:
                    break
        except Exception as e:
            logger.debug(f"Consumption timed out or finished: {e}")
        finally:
            consumer.close()
        return messages

    def get_lag(self) -> int:
        """Return the total consumer lag for the topic."""
        try:
            consumer = KafkaConsumer(bootstrap_servers=self.bootstrap_servers, group_id=self.consumer_group)
            partitions = consumer.partitions_for_topic(self.topic)
            if not partitions:
                consumer.close()
                return 0
            from kafka import TopicPartition
            tps = [TopicPartition(self.topic, p) for p in partitions]
            end_offsets = consumer.end_offsets(tps)
            total_lag = 0
            for tp in tps:
                committed = consumer.committed(tp)
                end = end_offsets.get(tp, 0)
                total_lag += max(0, end - (committed or 0))
            consumer.close()
            return total_lag
        except Exception as e:
            logger.warning(f"Failed to get lag: {e}")
            return 0

    def close(self):
        """Close the producer connection."""
        if self._producer:
            self._producer.close()
            self._producer = None


kafka_service = KafkaService()
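
A consumer-side sketch for the service above, assuming a reachable broker at settings.kafka.bootstrap_servers; the id and task_id fields follow the producer code in this commit.

# Hypothetical snippet: drain a small batch from the topic and report the lag.
from app.services.kafka_service import kafka_service

records = kafka_service.consume(batch_size=5, timeout_ms=3000)
for record in records:
    print(record.get("id"), record.get("task_id"))
print("current lag:", kafka_service.get_lag())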

job_crawler/app/services/progress_store.py (new file, 95 lines)
@@ -0,0 +1,95 @@
"""Collection progress storage."""
import sqlite3
import os
import logging
from datetime import datetime
from typing import Optional
from contextlib import contextmanager
from app.models import CrawlProgress
from app.core.config import settings

logger = logging.getLogger(__name__)


class ProgressStore:
    """Collection progress store backed by SQLite."""

    def __init__(self, db_path: Optional[str] = None):
        self.db_path = db_path or settings.database.path
        os.makedirs(os.path.dirname(self.db_path) or ".", exist_ok=True)
        self._init_db()

    def _init_db(self):
        """Initialize the database schema."""
        with self._get_conn() as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS crawl_progress (
                    task_id TEXT PRIMARY KEY,
                    current_offset INTEGER DEFAULT 0,
                    total INTEGER DEFAULT 0,
                    last_update TEXT,
                    status TEXT DEFAULT 'idle',
                    filtered_count INTEGER DEFAULT 0,
                    produced_count INTEGER DEFAULT 0
                )
            """)
            conn.commit()

    @contextmanager
    def _get_conn(self):
        """Yield a database connection and always close it."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        try:
            yield conn
        finally:
            conn.close()

    def get_progress(self, task_id: str) -> Optional[CrawlProgress]:
        """Return the stored progress for a task, or None if no row exists."""
        with self._get_conn() as conn:
            cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
            row = cursor.fetchone()
            if row:
                return CrawlProgress(
                    task_id=row["task_id"],
                    current_offset=row["current_offset"],
                    total=row["total"],
                    last_update=row["last_update"] or "",
                    status=row["status"]
                )
            return None

    def save_progress(self, task_id: str, offset: int, total: int,
                      status: str = "running", filtered_count: int = 0, produced_count: int = 0):
        """Upsert the collection progress for a task."""
        now = datetime.now().isoformat()
        with self._get_conn() as conn:
            conn.execute("""
                INSERT INTO crawl_progress
                    (task_id, current_offset, total, last_update, status, filtered_count, produced_count)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(task_id) DO UPDATE SET
                    current_offset = excluded.current_offset, total = excluded.total,
                    last_update = excluded.last_update, status = excluded.status,
                    filtered_count = excluded.filtered_count, produced_count = excluded.produced_count
            """, (task_id, offset, total, now, status, filtered_count, produced_count))
            conn.commit()

    def get_stats(self, task_id: str) -> dict:
        """Return the raw progress row as a dict."""
        with self._get_conn() as conn:
            cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
            row = cursor.fetchone()
            if row:
                return dict(row)
            return {}

    def reset_progress(self, task_id: str):
        """Delete the stored progress for a task."""
        with self._get_conn() as conn:
            conn.execute("DELETE FROM crawl_progress WHERE task_id = ?", (task_id,))
            conn.commit()


progress_store = ProgressStore()
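
A small sketch of the progress store API above; the task ID and counts are placeholders.

# Hypothetical inspection snippet using the module-level store.
from app.services.progress_store import progress_store

progress_store.save_progress("demo-task-id", offset=200, total=1000, status="running")
print(progress_store.get_stats("demo-task-id"))
progress_store.reset_progress("demo-task-id")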