commit 48dbaeacd5073c7c1be8b3ff96c5a69e4c4acf4f Author: 李顺东 <577732344@qq.com> Date: Mon Jan 12 22:49:25 2026 +0800 init diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..0d38376 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +FROM python:3.10-slim + +WORKDIR /app + +# 1. 替换阿里源加速 +RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ + +# 2. 安装系统依赖 +# 增加 libgl1 和 libglib2.0-0 解决 cv2 缺库问题 +RUN apt-get update && \ + apt-get install -y --no-install-recommends libgomp1 poppler-utils catdoc libgl1 libglib2.0-0 && \ + rm -rf /var/lib/apt/lists/* + +# 3. 安装 Python 库 +RUN pip install --no-cache-dir \ + fastapi uvicorn websockets \ + rapidocr-onnxruntime \ + numpy requests \ + python-pptx openpyxl xlrd \ + pdf2image Pillow python-multipart + +# 4. 复制代码 +COPY . . + +EXPOSE 9000 + +# 确保这里是 app.py +CMD ["python", "app.py"] \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000..622a08f --- /dev/null +++ b/app.py @@ -0,0 +1,231 @@ +from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Body +from rapidocr_onnxruntime import RapidOCR +from pdf2image import convert_from_bytes +from PIL import Image +import numpy as np +import io +import time +import logging +import requests +import os +import tempfile +import subprocess + +# 新增库引入 +from openpyxl import load_workbook +from pptx import Presentation +import xlrd # 用于处理 .xls + +# 配置日志 +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +app = FastAPI(title="OCR Service") + +# 1. 
全局初始化 OCR 模型 +try: + logger.info("正在加载OCR模型...") + ocr_engine = RapidOCR() + logger.info("OCR模型加载完成!") +except Exception as e: + logger.error(f"模型加载失败: {e}") + raise e + +# --- 核心处理逻辑函数 --- + +def ocr_single_image(img_np): + """辅助函数:处理单张图片的 Numpy 数组并返回文本""" + try: + result, _ = ocr_engine(img_np) + txt_res = [] + if result: + for line in result: + txt_res.append(line[1]) + return "\n".join(txt_res) + except Exception as e: + logger.error(f"单图识别出错: {e}") + return "" + +def extract_from_excel(file_bytes): + """解析 Excel (.xlsx) 文本""" + try: + wb = load_workbook(io.BytesIO(file_bytes), data_only=True) + texts = [] + for sheet in wb.worksheets: + texts.append(f"--- Sheet: {sheet.title} ---") + for row in sheet.iter_rows(values_only=True): + row_text = " ".join([str(cell) for cell in row if cell is not None]) + if row_text.strip(): + texts.append(row_text) + return "\n".join(texts) + except Exception as e: + logger.error(f"Excel(.xlsx)解析失败: {e}") + raise HTTPException(status_code=400, detail=f"xlsx解析失败: {str(e)}") + +def extract_from_xls(file_bytes): + """解析旧版 Excel (.xls) 文本""" + try: + # xlrd 支持直接从内存读取 + wb = xlrd.open_workbook(file_contents=file_bytes) + texts = [] + for sheet in wb.sheets(): + texts.append(f"--- Sheet: {sheet.name} ---") + for row_idx in range(sheet.nrows): + row = sheet.row(row_idx) + # xlrd 的 cell 对象需要取 .value + row_text = " ".join([str(c.value) for c in row if c.value not in ('', None)]) + if row_text.strip(): + texts.append(row_text) + return "\n".join(texts) + except Exception as e: + logger.error(f"Excel(.xls)解析失败: {e}") + raise HTTPException(status_code=400, detail=f"xls解析失败: {str(e)}") + +def extract_from_ppt(file_bytes): + """解析 PPT (.pptx) 文本""" + try: + prs = Presentation(io.BytesIO(file_bytes)) + texts = [] + for i, slide in enumerate(prs.slides): + page_content = [] + for shape in slide.shapes: + if hasattr(shape, "text") and shape.text: + page_content.append(shape.text) + if page_content: + texts.append(f"--- Slide {i+1} ---") + 
texts.append("\n".join(page_content)) + return "\n\n".join(texts) + except Exception as e: + logger.error(f"PPT(.pptx)解析失败: {e}") + raise HTTPException(status_code=400, detail="pptx解析失败") + +def extract_from_ppt_legacy(file_bytes): + """解析旧版 PPT (.ppt) 文本""" + # .ppt 是二进制 OLE 格式,纯 Python 库支持极差 + # 这里使用系统级工具 catppt (来自 catdoc 包) 进行提取 + temp_file = None + try: + # 创建临时文件,因为 catppt 需要文件路径 + with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as tmp: + tmp.write(file_bytes) + temp_file = tmp.name + + # 调用系统命令 catppt + # -d utf-8 尝试强制输出 utf-8 (视 catdoc 版本而定,通常默认即可) + process = subprocess.Popen(['catppt', temp_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + + if process.returncode != 0: + logger.error(f"catppt error: {stderr.decode()}") + raise Exception("解析进程异常退出") + + # 尝试解码 + return stdout.decode('utf-8', errors='ignore') + + except Exception as e: + logger.error(f"PPT(.ppt)解析失败: {e}") + raise HTTPException(status_code=400, detail="ppt解析失败 (请检查文件是否损坏)") + finally: + # 清理临时文件 + if temp_file and os.path.exists(temp_file): + os.remove(temp_file) + +def process_file_bytes(file_bytes, filename): + """通用文件流处理入口""" + filename = filename.lower() + + # 1. PDF 处理 + if filename.endswith('.pdf'): + try: + images = convert_from_bytes(file_bytes, dpi=200, fmt='jpeg') + texts = [] + for i, img in enumerate(images): + img_np = np.array(img) + texts.append(f"--- Page {i+1} ---\n" + ocr_single_image(img_np)) + return "\n\n".join(texts) + except Exception as e: + raise HTTPException(status_code=500, detail="PDF解析失败") + + # 2. Excel 处理 + elif filename.endswith('.xlsx'): + return extract_from_excel(file_bytes) + elif filename.endswith('.xls'): + return extract_from_xls(file_bytes) + + # 3. PPT 处理 + elif filename.endswith('.pptx'): + return extract_from_ppt(file_bytes) + elif filename.endswith('.ppt'): + return extract_from_ppt_legacy(file_bytes) + + # 4. 
图片处理 + else: + try: + img = Image.open(io.BytesIO(file_bytes)) + img_np = np.array(img) + return ocr_single_image(img_np) + except Exception: + raise HTTPException(status_code=400, detail=f"不支持的文件格式: {filename}") + +# --- 接口定义 --- + +@app.get("/health") +def health_check(): + return {"status": "ok"} + +@app.post("/ocr") +async def recognize_file(file: UploadFile = File(...)): + """ + 文件上传接口 + 支持: 图片, PDF, Excel(.xlsx/.xls), PPT(.pptx/.ppt) + """ + start_time = time.time() + try: + content = await file.read() + result_text = process_file_bytes(content, file.filename) + + cost = (time.time() - start_time) * 1000 + return {"code": 200, "data": result_text, "cost_time_ms": cost} + except HTTPException as he: + raise he + except Exception as e: + logger.error(f"处理异常: {e}") + return {"code": 500, "msg": str(e)} + +@app.post("/ocr/url") +async def recognize_url(url: str = Body(..., embed=True)): + """ + URL 识别接口 + Body参数: {"url": "http://example.com/file.png"} + """ + start_time = time.time() + try: + logger.info(f"正在下载文件: {url}") + resp = requests.get(url, timeout=15) + if resp.status_code != 200: + raise HTTPException(status_code=400, detail=f"下载失败,状态码: {resp.status_code}") + + filename = os.path.basename(url.split("?")[0]) + if not filename or "." 
not in filename: + content_type = resp.headers.get('Content-Type', '') + if 'pdf' in content_type: filename = 'temp.pdf' + elif 'sheet' in content_type or 'excel' in content_type: + # 简单判断,这里默认给 xlsx,如果是 xls 可能需要更复杂的 magic number 判断 + filename = 'temp.xlsx' + elif 'presentation' in content_type: filename = 'temp.pptx' + else: filename = 'temp.jpg' + + result_text = process_file_bytes(resp.content, filename) + + cost = (time.time() - start_time) * 1000 + return {"code": 200, "data": result_text, "cost_time_ms": cost} + + except HTTPException as he: + raise he + except Exception as e: + logger.error(f"URL处理异常: {e}") + return {"code": 500, "msg": str(e)} + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=9000) \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..8c2108d --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,36 @@ +version: '3.8' + +services: + ocr-server: + # 方式A: 如果要在当前目录直接构建镜像,保留 build (推荐) + build: "" + # 方式B: 如果使用已经打好的镜像,注释掉 build,打开 image + # image: my-ocr-service + + container_name: ocr-server + restart: always + ports: + - "9000:9000" + + environment: + - TZ=Asia/Shanghai + # 如果将来有其他配置,可以在这里添加 + # - WORKERS=4 + + # --- 日志管理配置 --- + # 默认情况下 docker logs 会无限增长,这里限制大小 + logging: + driver: "json-file" + options: + max-size: "50m" # 单个日志文件最大 50MB + max-file: "3" # 最多保留 3 个日志文件 (轮转) + + # --- 健康检查配置 --- + # 每隔 30秒 访问一次 /health 接口 + healthcheck: + # 需要在 Dockerfile 中安装 curl + test: ["CMD-SHELL", "curl -f http://localhost:9000/health || exit 1"] + interval: 30s # 检查间隔 + timeout: 10s # 超时时间 + retries: 3 # 失败重试次数,超过3次标记为 unhealthy + start_period: 10s # 启动后前10秒不计入检查(给模型加载预留时间) \ No newline at end of file diff --git a/example/test_resume.pdf b/example/test_resume.pdf new file mode 100644 index 0000000..8db12f9 Binary files /dev/null and b/example/test_resume.pdf differ diff --git a/ocr-interface_doc.html b/ocr-interface_doc.html new file mode 100644 index 
0000000..5d92fd6 --- /dev/null +++ b/ocr-interface_doc.html @@ -0,0 +1,229 @@ + + + + + + OCR 服务接口文档 + + + + +

OCR 服务接口文档

+ +
+

版本: 1.1.1 (Update)

+ +
+
+ 外网 Base URL: + https://qd.zhaopinzao8dian.com/ocr-api +
+
+ 内网 Base URL: + http://127.0.0.1:9001 +
+
+

支持格式: 图片 (JPG/PNG), PDF, Excel (.xlsx / .xls), PPT (.pptx / .ppt)

+
+ +
+

通用响应结构

+

所有接口在请求成功时返回如下 JSON 结构(`msg` 字段仅在出错时出现,此时响应中不包含 `data` 与 `cost_time_ms`):

+
{
+  "code": 200,                // 业务状态码 (200成功, 500错误)
+  "data": "识别出的文本内容...", // 具体的文本结果
+  "cost_time_ms": 150.5,      // 处理耗时 (毫秒)
+  "msg": ""                   // 错误信息 (仅出错时存在)
+}
+
+ +
+

1. 文件上传识别

+
+ POST + /ocr +
+

通过 `multipart/form-data` 表单上传本地文件进行识别。

+ +

请求参数 (Form-Data)

+ + + + + + + + + + + + + + + + + +
参数名 | 类型 | 必填 | 说明
file | File | 是 | 二进制文件流。
支持:图片, PDF, Excel(.xlsx/.xls), PPT(.pptx/.ppt)
+ +

CURL 示例 (内网)

+
curl -X POST -F "file=@/path/to/old_data.xls" http://127.0.0.1:9001/ocr
+ +

响应示例

+
{
+  "code": 200,
+  "data": "--- Sheet: Sheet1 ---\n这里是Excel中的文本内容...",
+  "cost_time_ms": 320.5
+}
+
+ +
+

2. URL 网络文件识别

+
+ POST + /ocr/url +
+

提交一个可访问的文件链接,服务自动下载并识别。

+ +

请求头 (Headers)

+ + +

请求参数 (JSON Body)

+ + + + + + + + + + + + + + + + + +
参数名 | 类型 | 必填 | 说明
url | String | 是 | 完整的下载链接 (需以 http/https 开头)
+ +

CURL 示例 (内网)

+
curl -X POST -H "Content-Type: application/json" \
+     -d '{"url": "https://example.com/legacy_slides.ppt"}' \
+     http://127.0.0.1:9001/ocr/url
+
+ +
+ +
+

⚠️ 注意事项

+ +
+ + + \ No newline at end of file diff --git a/ream_me.txt b/ream_me.txt new file mode 100644 index 0000000..4825a0e --- /dev/null +++ b/ream_me.txt @@ -0,0 +1,24 @@ +# 停止旧服务 +docker-compose down + +# 重新构建 (注意末尾有个点) +docker build -t my-ocr-service:v1.0 . + +docker-compose up -d --force-recreate + +docker save -o ocr-server-v1.0.tar my-ocr-service:v1.0 + +scp -i ~/.ssh/39.98.44.136.pem ocr-server-v1.0.tar root@39.98.44.136:/root/docker-app/ocr + +#39.98.44.136 服务器 +docker load -i ocr-server-v1.0.tar + +docker-compose up -d --force-recreate + +docker logs -f ocr-server + +# 测试 +curl -X POST -F "file=@/root/ocr-service/test_resume.pdf" http://127.0.0.1:9000/ocr + +# 测试 +curl -X POST -H "Content-Type: application/json" -d '{"url": "https://qd.zhaopinzao8dian.com/file/test_resume.pdf"}' http://127.0.0.1:9000/ocr/url \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2eb0e9b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +fastapi==0.95.2 +uvicorn==0.22.0 +python-multipart==0.0.6 +rapidocr_onnxruntime==1.3.8 +numpy +Pillow +pdf2image +openpyxl +python-pptx +requests +xlrd>=2.0.1 \ No newline at end of file diff --git a/yml/docker-compose.yml b/yml/docker-compose.yml new file mode 100644 index 0000000..a2d8974 --- /dev/null +++ b/yml/docker-compose.yml @@ -0,0 +1,25 @@ +version: '3.8' + +services: + ocr-server: + image: my-ocr-service:v1.0 + + container_name: ocr-server + restart: always + ports: + - "9000:9000" + environment: + - TZ=Asia/Shanghai + + logging: + driver: "json-file" + options: + max-size: "50m" + max-file: "3" + + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:9000/health || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s \ No newline at end of file