feat(job_crawler): initialize job crawler service with kafka integration

- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from third-party API
- Add Kafka service integration with Docker Compose configuration for message queue
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add progress persistence layer using SQLite for tracking collection offsets
- Implement date filtering logic to extract data published within 7 days
- Create API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
This commit is contained in:
2026-01-15 17:09:43 +08:00
commit ae681575b9
26 changed files with 1898 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
"""Data models: public re-exports for the models package."""
from .job import JobData
from .progress import CrawlProgress, CrawlStatus
from .response import ApiResponse, ConsumeResponse, StatusResponse
# Explicit public API of the package.
__all__ = [
    "JobData",
    "CrawlProgress",
    "CrawlStatus",
    "ApiResponse",
    "ConsumeResponse",
    "StatusResponse"
]

View File

@@ -0,0 +1,60 @@
"""招聘数据模型"""
from pydantic import BaseModel
from datetime import datetime
import uuid
class JobData(BaseModel):
    """One job listing normalized from the third-party API payload.

    ``id`` and ``crawl_time`` are auto-filled in ``__init__`` when not
    supplied by the caller. Use :meth:`from_raw` to build an instance
    from the raw API field names (e.g. ``Std_class``, ``AAB004``).
    """
    id: str = ""                 # unique record id (uuid4, auto-generated)
    task_id: str = ""            # collection task id (set by the caller, not from_raw)
    job_category: str = ""       # Std_class - job category
    job_title: str = ""          # aca112 - job title
    company: str = ""            # AAB004 - company name
    company_type: str = ""       # AAB019 - company type
    salary: str = ""             # acb241 - salary range
    location: str = ""           # aab302 - work location
    address: str = ""            # AAE006 - detailed address
    publish_date: str = ""       # aae397 - publish date
    collect_time: str = ""       # Collect_time - collection time
    url: str = ""                # ACE760 - job posting URL
    description: str = ""        # acb22a - job description
    experience: str = ""         # Experience - required experience
    education: str = ""          # aac011 - required education
    headcount: str = ""          # acb240 - number of openings
    industry: str = ""           # AAB022 - industry
    company_size: str = ""       # Num_employers - company size
    contact: str = ""            # AAE004 - contact person
    company_intro: str = ""      # AAB092 - company introduction
    crawl_time: str = ""         # ingestion time (local ISO timestamp)

    def __init__(self, **data):
        """Run pydantic validation, then fill id / crawl_time defaults."""
        super().__init__(**data)
        if not self.id:
            self.id = str(uuid.uuid4())
        if not self.crawl_time:
            # NOTE(review): naive local time — confirm whether UTC is expected.
            self.crawl_time = datetime.now().isoformat()

    @classmethod
    def from_raw(cls, raw: dict) -> "JobData":
        """Convert a raw API record (dict) into a JobData.

        Keys may be missing, and present keys may hold None; ``or ""``
        guards the ``.strip()`` call — ``raw.get(key, "")`` alone would
        still return None when the key exists with a None value and
        ``.strip()`` would raise AttributeError.
        """
        return cls(
            job_category=raw.get("Std_class", ""),
            job_title=raw.get("aca112", ""),
            company=raw.get("AAB004", ""),
            company_type=(raw.get("AAB019") or "").strip(),
            salary=raw.get("acb241", ""),
            location=raw.get("aab302", ""),
            address=raw.get("AAE006", ""),
            publish_date=raw.get("aae397", ""),
            collect_time=raw.get("Collect_time", ""),
            url=raw.get("ACE760", ""),
            description=raw.get("acb22a", ""),
            experience=raw.get("Experience", ""),
            education=raw.get("aac011", ""),
            headcount=raw.get("acb240", ""),
            industry=raw.get("AAB022", ""),
            company_size=raw.get("Num_employers", ""),
            contact=raw.get("AAE004", ""),
            company_intro=raw.get("AAB092", ""),
        )

View File

@@ -0,0 +1,24 @@
"""采集进度模型"""
from pydantic import BaseModel
class CrawlProgress(BaseModel):
    """Persisted progress of one collection task (written to SQLite)."""
    task_id: str                 # collection task identifier
    current_offset: int = 0      # current offset into the source dataset
    total: int = 0               # total record count (as reported by the source — confirm)
    last_update: str = ""        # time of the last progress write; format set by the writer
    status: str = "idle"         # one of: idle, running, completed, error
class CrawlStatus(BaseModel):
    """Snapshot of a collection task's state, returned by the status API."""
    task_id: str                 # collection task identifier
    total: int                   # total record count
    current_offset: int          # current offset into the source dataset
    progress: str                # human-readable progress (format set by the producer)
    kafka_lag: int = 0           # consumer lag on the Kafka topic — presumably; verify against producer
    status: str                  # idle / running / completed / error (mirrors CrawlProgress.status)
    last_update: str             # time of the last progress update
    filtered_count: int = 0      # records dropped by filtering (e.g. date filter) — confirm semantics
    produced_count: int = 0      # records produced to Kafka — confirm semantics

View File

@@ -0,0 +1,23 @@
"""API响应模型"""
from pydantic import BaseModel
from typing import Optional, Any
class ApiResponse(BaseModel):
    """Generic API response envelope."""
    code: int = 0                # status code; defaults pair 0 with "success"
    message: str = "success"     # human-readable status message
    data: Optional[Any] = None   # payload; shape depends on the endpoint
class ConsumeResponse(BaseModel):
    """Response body for the /consume endpoint."""
    code: int = 0                # status code; 0 by default
    # Mutable default is safe here: pydantic copies field defaults per instance,
    # so instances do not share this list.
    data: list = []              # consumed records
    count: int = 0               # number of records in `data` — presumably; verify against endpoint
class StatusResponse(BaseModel):
    """Response body for the /status endpoint."""
    code: int = 0                # status code; 0 by default
    # Mutable default is safe here: pydantic copies field defaults per instance.
    data: dict = {}              # status payload; likely a serialized CrawlStatus — confirm