Files
wechat_crawler/job_extractor.py
李顺东 b66bac7ca8 feat: Initialize wxauto WeChat automation project with job extraction tools
- Add wxauto package with WeChat UI automation and message handling capabilities
- Implement job_extractor.py for automated job posting extraction from WeChat groups
- Add job_extractor_gui.py providing graphical interface for job extraction tool
- Create comprehensive documentation in Chinese covering GUI usage, multi-group support, and quick start guides
- Add build configuration files (build_exe.py, build_exe.spec) for packaging as standalone executable
- Include utility scripts for WeChat interaction (auto_send_msg.py, get_history.py, receive_file_transfer.py)
- Add project configuration files (pyproject.toml, setup.cfg, requirements.txt)
- Include test files (test_api.py, test_com_fix.py) for API and compatibility validation
- Add Apache 2.0 LICENSE and comprehensive README documentation
- Configure .gitignore to exclude build artifacts, logs, and temporary files
2026-02-11 14:49:38 +08:00

224 lines
6.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
从指定微信群拉取消息使用百炼API提取岗位结构化数据
运行前请确保1) 已安装依赖 pip install -e .
2) 电脑已登录微信 3.9 版本,且主窗口已打开
3) 已配置 config.json 文件
"""
import sys
import os
import json
import time
import requests
from datetime import datetime
# Absolute directory of this script; load_config() and save_job_data() resolve
# config.json and the output file relative to it.
_script_dir = os.path.dirname(os.path.abspath(__file__))
# Put the script directory at the front of the import path.
# NOTE(review): presumably so the wxauto package imported below is picked up
# from alongside this script — confirm.
if _script_dir not in sys.path:
    sys.path.insert(0, _script_dir)
from wxauto import WeChat
def load_config():
    """Load ``config.json`` from the script directory.

    Returns:
        The parsed JSON content, or ``None`` (after printing a notice) when
        the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    cfg_file = os.path.join(_script_dir, "config.json")
    if os.path.exists(cfg_file):
        with open(cfg_file, "r", encoding="utf-8") as handle:
            raw_text = handle.read()
        return json.loads(raw_text)

    # Missing file: tell the user where we looked and signal failure with None.
    print(f"配置文件不存在: {cfg_file}")
    return None
def _parse_model_json(content):
    """Parse the model's reply as JSON, tolerating text wrapped around it.

    The reply is first parsed as-is. If that fails, the span from the first
    ``{`` to the last ``}`` is parsed instead, which recovers replies wrapped
    in a Markdown code fence or preceded/followed by explanatory text.

    Raises:
        json.JSONDecodeError: If neither attempt yields valid JSON.
    """
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        start = content.find("{")
        end = content.rfind("}")
        if start == -1 or end <= start:
            # Nothing that looks like an object; surface the original error.
            raise
        return json.loads(content[start:end + 1])


def extract_job_info(message_content, api_url, api_key):
    """Extract structured job-posting fields from a message via the Bailian API.

    Args:
        message_content: Raw message text to analyse.
        api_url: Endpoint the request is POSTed to.
        api_key: Bearer token sent in the ``Authorization`` header.

    Returns:
        The parsed JSON value from the model's reply, or ``None`` when the
        request fails, the response lacks ``output.choices``, the reply cannot
        be parsed as JSON, or the parsed value is empty. Errors are printed,
        never raised.
    """
    prompt = f"""请从以下消息中提取招聘岗位信息并以JSON格式返回。如果消息不包含招聘信息返回空对象。
要提取的字段:
- job_name: 工作名称
- job_description: 工作描述
- job_location: 工作地点
- salary_min: 月薪最低(数字,单位:元)
- salary_max: 月薪最高(数字,单位:元)
- company_name: 公司名称
- contact_person: 联系人
- contact_info: 联系方式(电话/微信等)
消息内容:
{message_content}
请直接返回JSON格式不要包含其他说明文字。"""
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "qwen-plus",
        "input": {
            "messages": [
                {
                    "role": "system",
                    "content": "你是一个专业的招聘信息提取助手,擅长从文本中提取结构化的岗位信息。"
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        },
        "parameters": {
            "result_format": "message"
        }
    }
    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()
        result = response.json()
        if "output" not in result or "choices" not in result["output"]:
            print(f"API返回格式异常: {result}")
            return None

        content = result["output"]["choices"][0]["message"]["content"]
        try:
            # Models frequently wrap the JSON in a ```json fence or add a
            # sentence around it despite the prompt; parse tolerantly so those
            # replies are not thrown away.
            job_data = _parse_model_json(content)
        except json.JSONDecodeError:
            print(f"API返回内容无法解析为JSON: {content}")
            return None
        # An empty object means "no job posting in this message".
        return job_data if job_data else None
    except requests.exceptions.RequestException as e:
        print(f"API请求失败: {e}")
        return None
    except Exception as e:
        # Last-resort guard (unexpected response shape, non-string content, ...)
        # so a single bad reply cannot take down the listener callback.
        print(f"提取岗位信息时发生错误: {e}")
        return None
def save_job_data(job_data, output_file):
    """Append one job record to the JSON array stored in *output_file*.

    Args:
        job_data: JSON-serializable record to append.
        output_file: File name, joined onto the script directory.

    The existing file is read in full, extended with *job_data*, and rewritten.
    A file that cannot be read or parsed is treated as empty, as before. A
    file holding a single JSON object is kept as the first list element
    instead of crashing on ``append``.
    """
    output_path = os.path.join(_script_dir, output_file)

    existing_data = []
    if os.path.exists(output_path):
        try:
            with open(output_path, "r", encoding="utf-8") as f:
                existing_data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            existing_data = []

    if isinstance(existing_data, dict):
        # A lone record was stored without the enclosing array; preserve it.
        existing_data = [existing_data]
    elif not isinstance(existing_data, list):
        # null / number / string: nothing meaningful to append to.
        existing_data = []

    existing_data.append(job_data)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=2)
    print(f"岗位数据已保存到: {output_path}")
def on_message(msg, chat, config):
    """Handle one incoming message: log it, extract job info, and persist it.

    Args:
        msg: Message object exposing ``sender``, ``type`` and ``content``.
        chat: Chat the message arrived in; unused here but part of the
            callback signature.
        config: Settings dict. Reads ``bailian_api_url``, ``api_key`` and
            ``target_group``; ``output_file`` is optional and defaults to
            ``jobs_data.json`` (the same default ``main`` reports).

    Non-text or empty messages are skipped. Extraction results that are not a
    dict, or whose values are all empty, are reported as "no job info" and not
    saved.
    """
    print(f"\n[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] 收到新消息")
    print(f"发送者: {msg.sender}")
    print(f"消息类型: {msg.type}")
    print(f"消息内容: {msg.content}")

    # Only text messages carry content the extractor can analyse.
    if msg.type != "text" or not msg.content:
        print("跳过非文本消息")
        return

    print("正在分析消息内容...")
    job_info = extract_job_info(
        msg.content,
        config["bailian_api_url"],
        config["api_key"]
    )

    # The model may return valid JSON that is not an object (e.g. a list);
    # such a value has no .values() and cannot take the "_metadata" key.
    if not isinstance(job_info, dict) or not any(job_info.values()):
        print("× 未提取到有效岗位信息")
        return

    print("✓ 提取到岗位信息:")
    print(json.dumps(job_info, ensure_ascii=False, indent=2))

    # Record provenance alongside the extracted fields.
    job_info["_metadata"] = {
        "source": "wechat_group",
        "group_name": config["target_group"],
        "sender": msg.sender,
        "extract_time": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        "original_message": msg.content
    }

    # Fall back to the documented default instead of raising KeyError when
    # the config omits "output_file".
    save_job_data(job_info, config.get("output_file", "jobs_data.json"))
def main():
    """Entry point: load settings, attach to WeChat, and listen on one group.

    Returns early (after printing the reason) when the configuration cannot be
    loaded, ``target_group`` is empty, connecting to WeChat fails, or the
    listener cannot be registered. Otherwise blocks in ``wx.KeepRunning()``
    until interrupted with Ctrl+C, then stops listening.
    """
    banner = "=" * 60
    print(banner)
    print("微信群岗位信息提取工具")
    print(banner)

    # Configuration is mandatory; load_config() has already reported why it failed.
    settings = load_config()
    if not settings:
        return

    group = settings.get("target_group", "")
    if not group:
        print("错误: 配置文件中未指定 target_group")
        return

    out_name = settings.get('output_file', 'jobs_data.json')
    interval = settings.get('check_interval', 5)
    print("\n配置信息:")
    print(f" 目标群组: {group}")
    print(f" 输出文件: {out_name}")
    print(f" 检查间隔: {interval}")

    # Attach to the running WeChat client.
    print("\n正在连接微信...")
    try:
        wx = WeChat()
        print(f"✓ 已连接微信,当前用户: {wx.nickname}")
    except Exception as err:
        print(f"× 连接失败: {err}")
        print("请确保:")
        print(" 1. 已安装依赖: pip install -e .")
        print(" 2. 微信 3.9 已登录并保持主窗口打开")
        return

    def _forward(msg, chat):
        # Bind the loaded settings into the two-argument callback.
        return on_message(msg, chat, settings)

    print(f"\n正在添加监听: {group}")
    outcome = wx.AddListenChat(nickname=group, callback=_forward)
    # A string result containing "失败" is treated as a registration failure.
    registration_failed = isinstance(outcome, str) and "失败" in outcome
    if registration_failed:
        print(f"× 添加监听失败: {outcome}")
        print(f"提示: 请确保群名称 '{group}' 正确")
        return

    print(f"✓ 成功监听群组: {group}")
    print("\n开始监听消息...")
    print("按 Ctrl+C 停止监听\n")
    print("-" * 60)

    # Block until the user interrupts, then shut the listener down.
    try:
        wx.KeepRunning()
    except KeyboardInterrupt:
        print("\n\n正在停止监听...")
        wx.StopListening()
        print("程序已退出")
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()