feat(job_crawler): initialize job crawler service with kafka integration

- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from third-party API
- Add Kafka service integration with Docker Compose configuration for message queue
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add progress persistence layer using SQLite for tracking collection offsets
- Implement date filtering logic to extract data published within 7 days
- Create API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
2026-01-15 17:09:43 +08:00
commit ae681575b9
26 changed files with 1898 additions and 0 deletions
--- a/job_crawler/app/utils/__init__.py
+++ b/job_crawler/app/utils/__init__.py
@@ -0,0 +1,4 @@
+"""Utility package: re-exports the date-parsing helpers."""
+from .date_parser import parse_aae397, parse_collect_time, is_within_days
+
+# Names exported by a star-import of this package.
+__all__ = ["parse_aae397", "parse_collect_time", "is_within_days"]
--- a/job_crawler/app/utils/date_parser.py
+++ b/job_crawler/app/utils/date_parser.py
@@ -0,0 +1,74 @@
+"""Date parsing utilities."""
+import re
+from datetime import datetime, timedelta
+from typing import Optional
+
+
+def parse_aae397(date_str: str) -> Optional[datetime]:
+    """
+    Parse the publish-date field ``aae397``.
+
+    Supported formats:
+    - "今天" (today): returns the current local datetime, time of day included.
+    - "X月X日" (month/day without a year), e.g. "1月13日" or "12月31日":
+      returns midnight of the most recent such date that is not in the future.
+
+    Args:
+        date_str: Raw field value; surrounding whitespace is ignored.
+
+    Returns:
+        The parsed naive datetime, or None when the value is empty, is in an
+        unsupported format, or does not name a valid calendar date.
+    """
+    if not date_str:
+        return None
+
+    date_str = date_str.strip()
+    today = datetime.now()
+
+    # Literal "today" marker.
+    if date_str == "今天":
+        return today
+
+    # "X月X日" (month/day) format.
+    match = re.match(r"(\d{1,2})月(\d{1,2})日", date_str)
+    if match is None:
+        return None
+
+    month = int(match.group(1))
+    day = int(match.group(2))
+
+    # The field carries no year. Try the current year first and fall back to
+    # the previous one when the date would lie in the future OR does not exist
+    # in the current year (e.g. "2月29日" read during a non-leap year).
+    for year in (today.year, today.year - 1):
+        try:
+            parsed_date = datetime(year, month, day)
+        except ValueError:
+            # Not a real calendar date for this year; try the next candidate.
+            continue
+        if parsed_date <= today:
+            return parsed_date
+
+    return None
+
+
+def parse_collect_time(date_str: str) -> Optional[datetime]:
+    """
+    Parse the collection-time field ``Collect_time``.
+
+    Expected format: "YYYY-MM-DD", e.g. "2026-01-15". Surrounding whitespace
+    is ignored.
+
+    Returns:
+        Midnight of the given date as a naive datetime, or None when the
+        value is empty or does not match the expected format.
+    """
+    if date_str:
+        try:
+            cleaned = date_str.strip()
+            return datetime.strptime(cleaned, "%Y-%m-%d")
+        except ValueError:
+            # Malformed value: fall through to the shared None result.
+            pass
+    return None
+
+
+def is_within_days(date_str: str, collect_time_str: str, days: int = 7) -> bool:
+    """
+    Tell whether a record falls inside the last ``days`` days.
+
+    Both the publish date (``aae397`` format) AND the collection time
+    (``Collect_time`` format) must be on or after the cutoff date.
+
+    Args:
+        date_str: Publish-date field, parsed by ``parse_aae397``.
+        collect_time_str: Collection-time field, parsed by
+            ``parse_collect_time``.
+        days: Size of the window in days, counted back from today.
+
+    Returns:
+        True when both dates parse and neither is older than ``days`` days;
+        False otherwise, including when either value cannot be parsed.
+    """
+    publish_date = parse_aae397(date_str)
+    if publish_date is None:
+        return False
+
+    collect_date = parse_collect_time(collect_time_str)
+    if collect_date is None:
+        return False
+
+    # Both fields only have day resolution (parsed to midnight), so compare
+    # calendar dates. A cutoff that keeps the current time of day would reject
+    # records from the boundary day even when they are less than
+    # ``days`` * 24 hours old.
+    cutoff_day = datetime.now().date() - timedelta(days=days)
+
+    return publish_date.date() >= cutoff_day and collect_date.date() >= cutoff_day