feat(job_crawler): initialize job crawler service with kafka integration

- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from third-party API
- Add Kafka service integration with Docker Compose configuration for message queue
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add progress persistence layer using SQLite for tracking collection offsets
- Implement date filtering logic to extract data published within 7 days
- Create API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
This commit is contained in:
2026-01-15 17:09:43 +08:00
commit ae681575b9
26 changed files with 1898 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
"""Data models: public re-exports for the models package."""
from .job import JobData
from .progress import CrawlProgress, CrawlStatus
from .response import ApiResponse, ConsumeResponse, StatusResponse
# Explicit public API of the package.
__all__ = [
    "JobData",
    "CrawlProgress",
    "CrawlStatus",
    "ApiResponse",
    "ConsumeResponse",
    "StatusResponse"
]

View File

@@ -0,0 +1,60 @@
"""招聘数据模型"""
from pydantic import BaseModel
from datetime import datetime
import uuid
class JobData(BaseModel):
    """One job listing normalized from the third-party API payload.

    ``id`` and ``crawl_time`` are auto-filled in ``__init__`` when not
    supplied by the caller. Use :meth:`from_raw` to build an instance
    from the raw API field names (e.g. ``Std_class``, ``AAB004``).
    """
    id: str = ""                 # unique record id (uuid4, auto-generated)
    task_id: str = ""            # collection task id (set by the caller, not from_raw)
    job_category: str = ""       # Std_class - job category
    job_title: str = ""          # aca112 - job title
    company: str = ""            # AAB004 - company name
    company_type: str = ""       # AAB019 - company type
    salary: str = ""             # acb241 - salary range
    location: str = ""           # aab302 - work location
    address: str = ""            # AAE006 - detailed address
    publish_date: str = ""       # aae397 - publish date
    collect_time: str = ""       # Collect_time - collection time
    url: str = ""                # ACE760 - job posting URL
    description: str = ""        # acb22a - job description
    experience: str = ""         # Experience - required experience
    education: str = ""          # aac011 - required education
    headcount: str = ""          # acb240 - number of openings
    industry: str = ""           # AAB022 - industry
    company_size: str = ""       # Num_employers - company size
    contact: str = ""            # AAE004 - contact person
    company_intro: str = ""      # AAB092 - company introduction
    crawl_time: str = ""         # ingestion time (local ISO timestamp)

    def __init__(self, **data):
        """Run pydantic validation, then fill id / crawl_time defaults."""
        super().__init__(**data)
        if not self.id:
            self.id = str(uuid.uuid4())
        if not self.crawl_time:
            # NOTE(review): naive local time — confirm whether UTC is expected.
            self.crawl_time = datetime.now().isoformat()

    @classmethod
    def from_raw(cls, raw: dict) -> "JobData":
        """Convert a raw API record (dict) into a JobData.

        Keys may be missing, and present keys may hold None; ``or ""``
        guards the ``.strip()`` call — ``raw.get(key, "")`` alone would
        still return None when the key exists with a None value and
        ``.strip()`` would raise AttributeError.
        """
        return cls(
            job_category=raw.get("Std_class", ""),
            job_title=raw.get("aca112", ""),
            company=raw.get("AAB004", ""),
            company_type=(raw.get("AAB019") or "").strip(),
            salary=raw.get("acb241", ""),
            location=raw.get("aab302", ""),
            address=raw.get("AAE006", ""),
            publish_date=raw.get("aae397", ""),
            collect_time=raw.get("Collect_time", ""),
            url=raw.get("ACE760", ""),
            description=raw.get("acb22a", ""),
            experience=raw.get("Experience", ""),
            education=raw.get("aac011", ""),
            headcount=raw.get("acb240", ""),
            industry=raw.get("AAB022", ""),
            company_size=raw.get("Num_employers", ""),
            contact=raw.get("AAE004", ""),
            company_intro=raw.get("AAB092", ""),
        )

View File

@@ -0,0 +1,24 @@
"""采集进度模型"""
from pydantic import BaseModel
class CrawlProgress(BaseModel):
    """Persisted progress of one collection task (written to SQLite)."""
    task_id: str                 # collection task identifier
    current_offset: int = 0      # current offset into the source dataset
    total: int = 0               # total record count (as reported by the source — confirm)
    last_update: str = ""        # time of the last progress write; format set by the writer
    status: str = "idle"         # one of: idle, running, completed, error
class CrawlStatus(BaseModel):
    """Snapshot of a collection task's state, returned by the status API."""
    task_id: str                 # collection task identifier
    total: int                   # total record count
    current_offset: int          # current offset into the source dataset
    progress: str                # human-readable progress (format set by the producer)
    kafka_lag: int = 0           # consumer lag on the Kafka topic — presumably; verify against producer
    status: str                  # idle / running / completed / error (mirrors CrawlProgress.status)
    last_update: str             # time of the last progress update
    filtered_count: int = 0      # records dropped by filtering (e.g. date filter) — confirm semantics
    produced_count: int = 0      # records produced to Kafka — confirm semantics

View File

@@ -0,0 +1,23 @@
"""API响应模型"""
from pydantic import BaseModel
from typing import Optional, Any
class ApiResponse(BaseModel):
    """Generic API response envelope."""
    code: int = 0                # status code; defaults pair 0 with "success"
    message: str = "success"     # human-readable status message
    data: Optional[Any] = None   # payload; shape depends on the endpoint
class ConsumeResponse(BaseModel):
    """Response body for the /consume endpoint."""
    code: int = 0                # status code; 0 by default
    # Mutable default is safe here: pydantic copies field defaults per instance,
    # so instances do not share this list.
    data: list = []              # consumed records
    count: int = 0               # number of records in `data` — presumably; verify against endpoint
class StatusResponse(BaseModel):
    """Response body for the /status endpoint."""
    code: int = 0                # status code; 0 by default
    # Mutable default is safe here: pydantic copies field defaults per instance.
    data: dict = {}              # status payload; likely a serialized CrawlStatus — confirm