feat(job_crawler): initialize job crawler service with kafka integration
- Add technical documentation (技术方案.md) with system architecture and design details
- Create FastAPI application structure with modular organization (api, core, models, services, utils)
- Implement job data crawler service with incremental collection from a third-party API
- Add Kafka service integration with Docker Compose configuration for the message queue
- Create data models for job listings, progress tracking, and API responses
- Implement REST API endpoints for data consumption (/consume, /status) and task management
- Add progress persistence layer using SQLite for tracking collection offsets
- Implement date filtering logic to extract data published within 7 days
- Create API client service for third-party data source integration
- Add configuration management with environment-based settings
- Include Docker support with Dockerfile and docker-compose.yml for containerized deployment
- Add logging configuration and utility functions for date parsing
- Include requirements.txt with all Python dependencies and README documentation
This commit is contained in:
4
job_crawler/app/utils/__init__.py
Normal file
4
job_crawler/app/utils/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
"""Utility module."""
|
||||
from .date_parser import parse_aae397, parse_collect_time, is_within_days
|
||||
|
||||
__all__ = ["parse_aae397", "parse_collect_time", "is_within_days"]
|
||||
74
job_crawler/app/utils/date_parser.py
Normal file
74
job_crawler/app/utils/date_parser.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""Date parsing utilities."""
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def parse_aae397(date_str: str) -> Optional[datetime]:
    """Parse the publish-date field ``aae397`` into a datetime.

    Surrounding whitespace is stripped first. Recognised inputs:

    - "今天": returns the current local datetime, time of day included.
    - Text starting with "<month>月<day>日" (e.g. "1月13日", "12月31日"):
      returns midnight of that day. The field carries no year, so the
      current year is assumed; if that would place the date later than
      now, the previous year is used instead. Characters after "日" are
      ignored because the pattern is only matched at the start.

    Returns None for empty input, unrecognised text, or a month/day pair
    that does not form a valid calendar date.
    """
    if not date_str:
        return None

    text = date_str.strip()
    now = datetime.now()

    if text == "今天":
        return now

    found = re.match(r"(\d{1,2})月(\d{1,2})日", text)
    if found is None:
        return None

    month, day = (int(part) for part in found.groups())
    try:
        candidate = datetime(now.year, month, day)
        if candidate <= now:
            return candidate
        # A date later than now must belong to the previous year.
        return datetime(now.year - 1, month, day)
    except ValueError:
        # Out-of-range month/day (e.g. "13月1日", "2月30日").
        return None
|
||||
|
||||
|
||||
def parse_collect_time(date_str: str) -> Optional[datetime]:
    """Parse the collection-time field ``Collect_time``.

    The expected format is "YYYY-MM-DD" (e.g. "2026-01-15"); surrounding
    whitespace is ignored. On success the result is midnight of that day.

    Returns None when the input is empty or does not match the format.
    """
    if date_str:
        try:
            return datetime.strptime(date_str.strip(), "%Y-%m-%d")
        except ValueError:
            # Wrong format or impossible date: fall through to None.
            pass
    return None
|
||||
|
||||
|
||||
def is_within_days(date_str: str, collect_time_str: str, days: int = 7) -> bool:
    """Tell whether a record falls inside the last ``days`` days.

    Both the publish date (``date_str``, parsed by ``parse_aae397``) and
    the collection time (``collect_time_str``, parsed by
    ``parse_collect_time``) must be on or after the cutoff. The cutoff is
    the current local time minus ``days`` days, time of day included.

    Returns False if either value cannot be parsed.
    """
    threshold = datetime.now() - timedelta(days=days)

    published = parse_aae397(date_str)
    if published is None:
        return False

    collected = parse_collect_time(collect_time_str)
    if collected is None:
        return False

    # The earlier of the two dates decides: both must reach the cutoff.
    return min(published, collected) >= threshold
|
||||
Reference in New Issue
Block a user