feat(job_crawler): initialize job crawler service with kafka integration

- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from a third-party API
- Add Kafka service integration with Docker Compose configuration for the message queue
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add progress persistence layer using SQLite for tracking collection offsets
- Implement date filtering logic to keep only data published within the last 7 days (a sketch of this check follows the commit metadata)
- Create API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
2026-01-15 17:09:43 +08:00
commit ae681575b9
26 changed files with 1898 additions and 0 deletions
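The date-parsing helpers in app.utils are not among the files shown below, so as a rough orientation here is a minimal sketch of what the is_within_days check used by the crawler could look like. The accepted date formats and the fallback to Collect_time are assumptions, not taken from the commit.

# Hypothetical sketch of the is_within_days helper in app/utils; the real implementation is not in this diff.
from datetime import datetime, timedelta
from typing import Optional


def _parse_date(value: str) -> Optional[datetime]:
    """Try a few common date formats; return None if the value cannot be parsed."""
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y/%m/%d"):
        try:
            return datetime.strptime(value.strip(), fmt)
        except (ValueError, AttributeError):
            continue
    return None


def is_within_days(publish_date: str, collect_time: str, days: int = 7) -> bool:
    """Keep a record if its publish date (falling back to collect time) is within the last `days` days."""
    parsed = _parse_date(publish_date) or _parse_date(collect_time)
    if parsed is None:
        return False  # drop records whose dates cannot be parsed
    return datetime.now() - parsed <= timedelta(days=days)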

View File

@@ -0,0 +1,12 @@
"""服务模块"""
from .api_client import api_client, BazhuayuClient
from .kafka_service import kafka_service, KafkaService
from .progress_store import progress_store, ProgressStore
from .crawler import crawler_manager, CrawlerManager, TaskCrawler
__all__ = [
"api_client", "BazhuayuClient",
"kafka_service", "KafkaService",
"progress_store", "ProgressStore",
"crawler_manager", "CrawlerManager", "TaskCrawler"
]

View File

@@ -0,0 +1,91 @@
"""八爪鱼API客户端"""
import httpx
import time
import logging
from typing import Optional, Dict, Any
from app.core.config import settings
logger = logging.getLogger(__name__)
class BazhuayuClient:
"""八爪鱼API客户端"""
def __init__(self):
self.base_url = settings.api.base_url
self.username = settings.api.username
self.password = settings.api.password
self._access_token: Optional[str] = None
self._token_expires_at: float = 0
async def _get_token(self) -> str:
"""获取访问令牌"""
if self._access_token and time.time() < self._token_expires_at - 300:
return self._access_token
logger.info("正在获取新的access_token...")
async with httpx.AsyncClient() as client:
response = await client.post(
f"{self.base_url}/token",
json={
"username": self.username,
"password": self.password,
"grant_type": "password"
},
headers={
"Content-Type": "application/json",
"Accept": "*/*"
},
timeout=30
)
if response.status_code != 200:
raise Exception(f"获取token失败: {response.status_code} - {response.text}")
data = response.json()
token_data = data.get("data", {})
self._access_token = token_data.get("access_token")
expires_in = int(token_data.get("expires_in", 86400))
self._token_expires_at = time.time() + expires_in
logger.info(f"获取token成功有效期: {expires_in}")
return self._access_token
async def fetch_data(self, task_id: str, offset: int, size: int = 100) -> Dict[str, Any]:
"""获取任务数据"""
token = await self._get_token()
async with httpx.AsyncClient() as client:
response = await client.get(
f"{self.base_url}/data/all",
params={
"taskId": task_id,
"offset": offset,
"size": size
},
headers={
"Authorization": f"Bearer {token}",
"Accept": "*/*"
},
timeout=60
)
if response.status_code == 401:
self._access_token = None
self._token_expires_at = 0
return await self.fetch_data(task_id, offset, size)
if response.status_code != 200:
raise Exception(f"获取数据失败: {response.status_code} - {response.text}")
return response.json()
async def get_total_count(self, task_id: str) -> int:
"""获取数据总数"""
result = await self.fetch_data(task_id, 0, 1)
return result.get("data", {}).get("total", 0)
api_client = BazhuayuClient()
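
For orientation (not part of the commit), the shared api_client above could be exercised roughly like this; the task ID is a placeholder, real IDs come from the task configuration in settings.

# Illustrative only: exercises the shared api_client defined above.
import asyncio

from app.services.api_client import api_client


async def main() -> None:
    task_id = "your-task-id"  # placeholder; real IDs come from settings.get_enabled_tasks()
    total = await api_client.get_total_count(task_id)
    print(f"task {task_id} exposes {total} records")

    # Page through the first batch; the token is fetched and cached transparently.
    page = await api_client.fetch_data(task_id, offset=0, size=100)
    for record in page.get("data", {}).get("data", []):
        print(record.get("aae397"), record.get("Collect_time"))


if __name__ == "__main__":
    asyncio.run(main())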

View File

@@ -0,0 +1,209 @@
"""多任务增量采集核心逻辑"""
import asyncio
import logging
from typing import Dict, Optional
from concurrent.futures import ThreadPoolExecutor
from app.services.api_client import api_client
from app.services.kafka_service import kafka_service
from app.services.progress_store import progress_store
from app.utils import is_within_days
from app.models import JobData
from app.core.config import settings, TaskConfig
logger = logging.getLogger(__name__)
class TaskCrawler:
"""单个任务采集器"""
def __init__(self, task_config: TaskConfig):
self.task_id = task_config.id
self.task_name = task_config.name or task_config.id
self.batch_size = settings.api.batch_size
self.filter_days = settings.crawler.filter_days
self._running = False
self._total_filtered = 0
self._total_produced = 0
@property
def is_running(self) -> bool:
return self._running
async def start(self, reset: bool = False):
"""开始采集"""
if self._running:
logger.warning(f"[{self.task_name}] 任务已在运行中")
return
self._running = True
self._total_filtered = 0
self._total_produced = 0
logger.info(f"[{self.task_name}] 开始采集任务")
try:
if reset:
progress_store.reset_progress(self.task_id)
current_offset = 0
else:
progress = progress_store.get_progress(self.task_id)
current_offset = progress.current_offset if progress else 0
total = await api_client.get_total_count(self.task_id)
logger.info(f"[{self.task_name}] 数据总数: {total}, 当前偏移: {current_offset}")
if current_offset >= total:
logger.info(f"[{self.task_name}] 数据已全部采集完成")
progress_store.save_progress(self.task_id, current_offset, total, "completed",
self._total_filtered, self._total_produced)
self._running = False
return
while current_offset < total and self._running:
try:
await self._crawl_batch(current_offset)
current_offset += self.batch_size
progress_store.save_progress(self.task_id, current_offset, total, "running",
self._total_filtered, self._total_produced)
progress_pct = min(100, current_offset / total * 100)
logger.info(f"[{self.task_name}] 进度: {progress_pct:.2f}% ({current_offset}/{total})")
await asyncio.sleep(0.5)
except Exception as e:
logger.error(f"[{self.task_name}] 采集批次失败: {e}")
await asyncio.sleep(5)
status = "completed" if current_offset >= total else "stopped"
progress_store.save_progress(self.task_id, current_offset, total, status,
self._total_filtered, self._total_produced)
logger.info(f"[{self.task_name}] 采集任务 {status}")
except Exception as e:
logger.error(f"[{self.task_name}] 采集任务异常: {e}")
progress_store.save_progress(self.task_id, 0, 0, "error",
self._total_filtered, self._total_produced)
finally:
self._running = False
async def _crawl_batch(self, offset: int):
"""采集一批数据"""
result = await api_client.fetch_data(self.task_id, offset, self.batch_size)
data_list = result.get("data", {}).get("data", [])
if not data_list:
return
filtered_jobs = []
for raw in data_list:
aae397 = raw.get("aae397", "")
collect_time = raw.get("Collect_time", "")
if is_within_days(aae397, collect_time, self.filter_days):
job = JobData.from_raw(raw)
job.task_id = self.task_id # 添加任务ID标识
filtered_jobs.append(job)
self._total_filtered += len(filtered_jobs)
if filtered_jobs:
produced = kafka_service.produce_batch(filtered_jobs)
self._total_produced += produced
def stop(self):
"""停止采集"""
logger.info(f"[{self.task_name}] 正在停止采集任务...")
self._running = False
def get_status(self) -> dict:
"""获取采集状态"""
stats = progress_store.get_stats(self.task_id)
if not stats:
return {
"task_id": self.task_id, "task_name": self.task_name,
"total": 0, "current_offset": 0, "progress": "0%",
"status": "idle", "last_update": "",
"filtered_count": 0, "produced_count": 0
}
total = stats.get("total", 0)
current = stats.get("current_offset", 0)
progress = f"{min(100, current / total * 100):.2f}%" if total > 0 else "0%"
return {
"task_id": self.task_id, "task_name": self.task_name,
"total": total, "current_offset": current, "progress": progress,
"status": stats.get("status", "idle"), "last_update": stats.get("last_update", ""),
"filtered_count": stats.get("filtered_count", 0),
"produced_count": stats.get("produced_count", 0)
}
class CrawlerManager:
"""多任务采集管理器"""
def __init__(self):
self._crawlers: Dict[str, TaskCrawler] = {}
self._executor = ThreadPoolExecutor(max_workers=settings.crawler.max_workers)
self._init_crawlers()
def _init_crawlers(self):
"""初始化所有启用的任务采集器"""
for task in settings.get_enabled_tasks():
self._crawlers[task.id] = TaskCrawler(task)
logger.info(f"初始化任务采集器: {task.name} ({task.id})")
def get_crawler(self, task_id: str) -> Optional[TaskCrawler]:
"""获取指定任务的采集器"""
return self._crawlers.get(task_id)
def get_all_crawlers(self) -> Dict[str, TaskCrawler]:
"""获取所有采集器"""
return self._crawlers
async def start_task(self, task_id: str, reset: bool = False) -> bool:
"""启动单个任务"""
crawler = self._crawlers.get(task_id)
if not crawler:
logger.error(f"任务不存在: {task_id}")
return False
if crawler.is_running:
logger.warning(f"任务已在运行: {task_id}")
return False
asyncio.create_task(crawler.start(reset))
return True
async def start_all(self, reset: bool = False):
"""启动所有任务"""
tasks = []
for task_id, crawler in self._crawlers.items():
if not crawler.is_running:
tasks.append(crawler.start(reset))
if tasks:
await asyncio.gather(*tasks)
def stop_task(self, task_id: str) -> bool:
"""停止单个任务"""
crawler = self._crawlers.get(task_id)
if not crawler:
return False
crawler.stop()
return True
def stop_all(self):
"""停止所有任务"""
for crawler in self._crawlers.values():
crawler.stop()
def get_status(self, task_id: str = None) -> dict:
"""获取状态"""
if task_id:
crawler = self._crawlers.get(task_id)
return crawler.get_status() if crawler else {}
# 返回所有任务状态
return {
"tasks": [c.get_status() for c in self._crawlers.values()],
"kafka_lag": kafka_service.get_lag(),
"running_count": sum(1 for c in self._crawlers.values() if c.is_running)
}
@property
def is_any_running(self) -> bool:
"""是否有任务在运行"""
return any(c.is_running for c in self._crawlers.values())
# 全局管理器实例
crawler_manager = CrawlerManager()
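
The task-management endpoints mentioned in the commit message live under app/api and are not shown in this excerpt. Below is a hypothetical sketch of how they might delegate to crawler_manager, assuming FastAPI's APIRouter; route paths and response shapes are illustrative only.

# Hypothetical route wiring; the real endpoints under app/api are not shown in this diff.
from fastapi import APIRouter, HTTPException

from app.services.crawler import crawler_manager

router = APIRouter(prefix="/tasks", tags=["tasks"])


@router.post("/{task_id}/start")
async def start_task(task_id: str, reset: bool = False):
    """Kick off background collection for one task."""
    started = await crawler_manager.start_task(task_id, reset=reset)
    if not started:
        raise HTTPException(status_code=409, detail="task not found or already running")
    return {"task_id": task_id, "started": True}


@router.post("/{task_id}/stop")
async def stop_task(task_id: str):
    """Request a graceful stop after the current batch."""
    if not crawler_manager.stop_task(task_id):
        raise HTTPException(status_code=404, detail="task not found")
    return {"task_id": task_id, "stopped": True}


@router.get("/status")
async def status():
    """Aggregate status for all tasks, including Kafka lag."""
    return crawler_manager.get_status()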

View File

@@ -0,0 +1,138 @@
"""Kafka服务"""
import json
import logging
from typing import List, Optional
from kafka import KafkaProducer, KafkaConsumer
from kafka.errors import KafkaError
from kafka.admin import KafkaAdminClient, NewTopic
from app.models import JobData
from app.core.config import settings
logger = logging.getLogger(__name__)
class KafkaService:
"""Kafka生产者/消费者服务"""
def __init__(self):
self.bootstrap_servers = settings.kafka.bootstrap_servers
self.topic = settings.kafka.topic
self.consumer_group = settings.kafka.consumer_group
self._producer: Optional[KafkaProducer] = None
self._ensure_topic()
def _ensure_topic(self):
"""确保Topic存在"""
try:
admin = KafkaAdminClient(
bootstrap_servers=self.bootstrap_servers,
client_id="job_crawler_admin"
)
existing_topics = admin.list_topics()
if self.topic not in existing_topics:
topic = NewTopic(name=self.topic, num_partitions=3, replication_factor=1)
admin.create_topics([topic])
logger.info(f"创建Topic: {self.topic}")
admin.close()
except Exception as e:
logger.warning(f"检查/创建Topic失败: {e}")
@property
def producer(self) -> KafkaProducer:
"""获取生产者实例"""
if self._producer is None:
self._producer = KafkaProducer(
bootstrap_servers=self.bootstrap_servers,
value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
key_serializer=lambda k: k.encode('utf-8') if k else None,
acks='all',
retries=3
)
return self._producer
def get_consumer(self, auto_offset_reset: str = 'earliest') -> KafkaConsumer:
"""获取消费者实例"""
return KafkaConsumer(
self.topic,
bootstrap_servers=self.bootstrap_servers,
group_id=self.consumer_group,
auto_offset_reset=auto_offset_reset,
enable_auto_commit=True,
value_deserializer=lambda m: json.loads(m.decode('utf-8')),
consumer_timeout_ms=5000
)
def produce(self, job_data: JobData) -> bool:
"""发送消息到Kafka"""
try:
future = self.producer.send(self.topic, key=job_data.id, value=job_data.model_dump())
future.get(timeout=10)
return True
except KafkaError as e:
logger.error(f"发送消息失败: {e}")
return False
def produce_batch(self, job_list: List[JobData]) -> int:
"""批量发送消息"""
success_count = 0
for job in job_list:
if self.produce(job):
success_count += 1
self.producer.flush()
return success_count
def consume(self, batch_size: int = 10, timeout_ms: int = 5000) -> List[dict]:
"""消费消息"""
messages = []
consumer = KafkaConsumer(
self.topic,
bootstrap_servers=self.bootstrap_servers,
group_id=self.consumer_group,
auto_offset_reset='earliest',
enable_auto_commit=True,
value_deserializer=lambda m: json.loads(m.decode('utf-8')),
consumer_timeout_ms=timeout_ms,
max_poll_records=batch_size
)
try:
for message in consumer:
messages.append(message.value)
if len(messages) >= batch_size:
break
except Exception as e:
logger.debug(f"消费超时或完成: {e}")
finally:
consumer.close()
return messages
def get_lag(self) -> int:
"""获取消息堆积量"""
try:
consumer = KafkaConsumer(bootstrap_servers=self.bootstrap_servers, group_id=self.consumer_group)
partitions = consumer.partitions_for_topic(self.topic)
if not partitions:
consumer.close()
return 0
from kafka import TopicPartition
tps = [TopicPartition(self.topic, p) for p in partitions]
end_offsets = consumer.end_offsets(tps)
total_lag = 0
for tp in tps:
committed = consumer.committed(tp)
end = end_offsets.get(tp, 0)
total_lag += max(0, end - (committed or 0))
consumer.close()
return total_lag
except Exception as e:
logger.warning(f"获取lag失败: {e}")
return 0
def close(self):
"""关闭连接"""
if self._producer:
self._producer.close()
self._producer = None
kafka_service = KafkaService()
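
Likewise, the /consume endpoint referenced in the commit message is not included in this excerpt. A minimal sketch of how it could sit on top of kafka_service, assuming a FastAPI router; the handler is deliberately a plain def so FastAPI runs the blocking kafka-python consumer in its worker threadpool.

# Hypothetical /consume route; the real endpoint under app/api is not shown in this diff.
from fastapi import APIRouter, Query

from app.services.kafka_service import kafka_service

router = APIRouter(tags=["consume"])


@router.get("/consume")
def consume(batch_size: int = Query(default=10, ge=1, le=500)):
    """Pull a batch of job messages from Kafka for downstream consumers."""
    # Non-async handler: kafka-python blocks, so FastAPI executes this in a worker thread
    # instead of stalling the event loop.
    messages = kafka_service.consume(batch_size=batch_size)
    return {
        "count": len(messages),
        "lag": kafka_service.get_lag(),  # remaining backlog for this consumer group
        "data": messages,
    }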

View File

@@ -0,0 +1,95 @@
"""采集进度存储"""
import sqlite3
import os
import logging
from datetime import datetime
from typing import Optional
from contextlib import contextmanager
from app.models import CrawlProgress
from app.core.config import settings
logger = logging.getLogger(__name__)
class ProgressStore:
"""采集进度存储SQLite"""
def __init__(self, db_path: str = None):
self.db_path = db_path or settings.database.path
os.makedirs(os.path.dirname(self.db_path) or ".", exist_ok=True)
self._init_db()
def _init_db(self):
"""初始化数据库"""
with self._get_conn() as conn:
conn.execute("""
CREATE TABLE IF NOT EXISTS crawl_progress (
task_id TEXT PRIMARY KEY,
current_offset INTEGER DEFAULT 0,
total INTEGER DEFAULT 0,
last_update TEXT,
status TEXT DEFAULT 'idle',
filtered_count INTEGER DEFAULT 0,
produced_count INTEGER DEFAULT 0
)
""")
conn.commit()
@contextmanager
def _get_conn(self):
"""获取数据库连接"""
conn = sqlite3.connect(self.db_path)
conn.row_factory = sqlite3.Row
try:
yield conn
finally:
conn.close()
def get_progress(self, task_id: str) -> Optional[CrawlProgress]:
"""获取采集进度"""
with self._get_conn() as conn:
cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
row = cursor.fetchone()
if row:
return CrawlProgress(
task_id=row["task_id"],
current_offset=row["current_offset"],
total=row["total"],
last_update=row["last_update"] or "",
status=row["status"]
)
return None
def save_progress(self, task_id: str, offset: int, total: int,
status: str = "running", filtered_count: int = 0, produced_count: int = 0):
"""保存采集进度"""
now = datetime.now().isoformat()
with self._get_conn() as conn:
conn.execute("""
INSERT INTO crawl_progress
(task_id, current_offset, total, last_update, status, filtered_count, produced_count)
VALUES (?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(task_id) DO UPDATE SET
current_offset = excluded.current_offset, total = excluded.total,
last_update = excluded.last_update, status = excluded.status,
filtered_count = excluded.filtered_count, produced_count = excluded.produced_count
""", (task_id, offset, total, now, status, filtered_count, produced_count))
conn.commit()
def get_stats(self, task_id: str) -> dict:
"""获取统计信息"""
with self._get_conn() as conn:
cursor = conn.execute("SELECT * FROM crawl_progress WHERE task_id = ?", (task_id,))
row = cursor.fetchone()
if row:
return dict(row)
return {}
def reset_progress(self, task_id: str):
"""重置采集进度"""
with self._get_conn() as conn:
conn.execute("DELETE FROM crawl_progress WHERE task_id = ?", (task_id,))
conn.commit()
progress_store = ProgressStore()
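
A short usage note (not from the commit): the crawler's resume and reset behaviour comes down to three calls on this store, sketched here against a placeholder task ID.

# Illustrative resume/reset cycle against the progress_store defined above.
from app.services.progress_store import progress_store

TASK_ID = "demo-task"  # placeholder task ID

# Simulate a run that stopped after 300 of 1200 records.
progress_store.save_progress(TASK_ID, offset=300, total=1200, status="stopped",
                             filtered_count=120, produced_count=118)

# On the next start() call the crawler resumes from the stored offset...
resume_from = progress_store.get_progress(TASK_ID)
print(resume_from.current_offset if resume_from else 0)  # -> 300

# ...unless reset=True is passed, which clears the row and restarts from 0.
progress_store.reset_progress(TASK_ID)
print(progress_store.get_progress(TASK_ID))  # -> None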