init

2026-01-12 22:49:25 +08:00
commit 48dbaeacd5
8 changed files with 584 additions and 0 deletions
--- a/28
+++ b/28
@@ -0,0 +1,28 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+# 1. Use the Aliyun PyPI mirror to speed up package downloads.
+RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
+
+# 2. Install system dependencies.
+#    - libgl1 / libglib2.0-0: fix missing shared libraries when importing cv2
+#    - catdoc: provides the catppt tool used to read legacy .ppt files
+#    - curl: required by the docker-compose healthcheck (curl -f .../health);
+#      without it the container is always reported as unhealthy
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        catdoc \
+        curl \
+        libgl1 \
+        libglib2.0-0 \
+        libgomp1 \
+        poppler-utils && \
+    rm -rf /var/lib/apt/lists/*
+
+# 3. Install Python libraries.
+RUN pip install --no-cache-dir \
+    fastapi uvicorn websockets \
+    rapidocr-onnxruntime \
+    numpy requests \
+    python-pptx openpyxl xlrd \
+    pdf2image Pillow python-multipart
+
+# 4. Copy the application code.
+COPY . .
+
+EXPOSE 9000
+
+# The entry point must be app.py (it starts uvicorn on port 9000).
+CMD ["python", "app.py"]
--- a/app.py
+++ b/app.py
@@ -0,0 +1,231 @@
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Body
+from rapidocr_onnxruntime import RapidOCR
+from pdf2image import convert_from_bytes
+from PIL import Image
+import numpy as np
+import io
+import time
+import logging
+import requests
+import os
+import tempfile
+import subprocess
+
+# 新增库引入
+from openpyxl import load_workbook
+from pptx import Presentation
+import xlrd  # 用于处理 .xls
+
+# 配置日志
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI(title="OCR Service")
+
+# 1. 全局初始化 OCR 模型
+try:
+    logger.info("正在加载OCR模型...")
+    ocr_engine = RapidOCR()
+    logger.info("OCR模型加载完成！")
+except Exception as e:
+    logger.error(f"模型加载失败: {e}")
+    raise e
+
+# --- 核心处理逻辑函数 ---
+
+def ocr_single_image(img_np):
+    """辅助函数：处理单张图片的 Numpy 数组并返回文本"""
+    try:
+        result, _ = ocr_engine(img_np)
+        txt_res = []
+        if result:
+            for line in result:
+                txt_res.append(line[1])
+        return "\n".join(txt_res)
+    except Exception as e:
+        logger.error(f"单图识别出错: {e}")
+        return ""
+
+def extract_from_excel(file_bytes):
+    """解析 Excel (.xlsx) 文本"""
+    try:
+        wb = load_workbook(io.BytesIO(file_bytes), data_only=True)
+        texts = []
+        for sheet in wb.worksheets:
+            texts.append(f"--- Sheet: {sheet.title} ---")
+            for row in sheet.iter_rows(values_only=True):
+                row_text = " ".join([str(cell) for cell in row if cell is not None])
+                if row_text.strip():
+                    texts.append(row_text)
+        return "\n".join(texts)
+    except Exception as e:
+        logger.error(f"Excel(.xlsx)解析失败: {e}")
+        raise HTTPException(status_code=400, detail=f"xlsx解析失败: {str(e)}")
+
+def extract_from_xls(file_bytes):
+    """解析旧版 Excel (.xls) 文本"""
+    try:
+        # xlrd 支持直接从内存读取
+        wb = xlrd.open_workbook(file_contents=file_bytes)
+        texts = []
+        for sheet in wb.sheets():
+            texts.append(f"--- Sheet: {sheet.name} ---")
+            for row_idx in range(sheet.nrows):
+                row = sheet.row(row_idx)
+                # xlrd 的 cell 对象需要取 .value
+                row_text = " ".join([str(c.value) for c in row if c.value not in ('', None)])
+                if row_text.strip():
+                    texts.append(row_text)
+        return "\n".join(texts)
+    except Exception as e:
+        logger.error(f"Excel(.xls)解析失败: {e}")
+        raise HTTPException(status_code=400, detail=f"xls解析失败: {str(e)}")
+
+def extract_from_ppt(file_bytes):
+    """解析 PPT (.pptx) 文本"""
+    try:
+        prs = Presentation(io.BytesIO(file_bytes))
+        texts = []
+        for i, slide in enumerate(prs.slides):
+            page_content = []
+            for shape in slide.shapes:
+                if hasattr(shape, "text") and shape.text:
+                    page_content.append(shape.text)
+            if page_content:
+                texts.append(f"--- Slide {i+1} ---")
+                texts.append("\n".join(page_content))
+        return "\n\n".join(texts)
+    except Exception as e:
+        logger.error(f"PPT(.pptx)解析失败: {e}")
+        raise HTTPException(status_code=400, detail="pptx解析失败")
+
+def extract_from_ppt_legacy(file_bytes):
+    """解析旧版 PPT (.ppt) 文本"""
+    # .ppt 是二进制 OLE 格式，纯 Python 库支持极差
+    # 这里使用系统级工具 catppt (来自 catdoc 包) 进行提取
+    temp_file = None
+    try:
+        # 创建临时文件，因为 catppt 需要文件路径
+        with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as tmp:
+            tmp.write(file_bytes)
+            temp_file = tmp.name
+
+        # 调用系统命令 catppt
+        # -d utf-8 尝试强制输出 utf-8 (视 catdoc 版本而定，通常默认即可)
+        process = subprocess.Popen(['catppt', temp_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = process.communicate()
+
+        if process.returncode != 0:
+            logger.error(f"catppt error: {stderr.decode()}")
+            raise Exception("解析进程异常退出")
+
+        # 尝试解码
+        return stdout.decode('utf-8', errors='ignore')
+
+    except Exception as e:
+        logger.error(f"PPT(.ppt)解析失败: {e}")
+        raise HTTPException(status_code=400, detail="ppt解析失败 (请检查文件是否损坏)")
+    finally:
+        # 清理临时文件
+        if temp_file and os.path.exists(temp_file):
+            os.remove(temp_file)
+
+def process_file_bytes(file_bytes, filename):
+    """通用文件流处理入口"""
+    filename = filename.lower()
+
+    # 1. PDF 处理
+    if filename.endswith('.pdf'):
+        try:
+            images = convert_from_bytes(file_bytes, dpi=200, fmt='jpeg')
+            texts = []
+            for i, img in enumerate(images):
+                img_np = np.array(img)
+                texts.append(f"--- Page {i+1} ---\n" + ocr_single_image(img_np))
+            return "\n\n".join(texts)
+        except Exception as e:
+            raise HTTPException(status_code=500, detail="PDF解析失败")
+
+    # 2. Excel 处理
+    elif filename.endswith('.xlsx'):
+        return extract_from_excel(file_bytes)
+    elif filename.endswith('.xls'):
+        return extract_from_xls(file_bytes)
+
+    # 3. PPT 处理
+    elif filename.endswith('.pptx'):
+        return extract_from_ppt(file_bytes)
+    elif filename.endswith('.ppt'):
+        return extract_from_ppt_legacy(file_bytes)
+
+    # 4. 图片处理
+    else:
+        try:
+            img = Image.open(io.BytesIO(file_bytes))
+            img_np = np.array(img)
+            return ocr_single_image(img_np)
+        except Exception:
+            raise HTTPException(status_code=400, detail=f"不支持的文件格式: {filename}")
+
+# --- 接口定义 ---
+
+@app.get("/health")
+def health_check():
+    return {"status": "ok"}
+
+@app.post("/ocr")
+async def recognize_file(file: UploadFile = File(...)):
+    """
+    文件上传接口
+    支持: 图片, PDF, Excel(.xlsx/.xls), PPT(.pptx/.ppt)
+    """
+    start_time = time.time()
+    try:
+        content = await file.read()
+        result_text = process_file_bytes(content, file.filename)
+
+        cost = (time.time() - start_time) * 1000
+        return {"code": 200, "data": result_text, "cost_time_ms": cost}
+    except HTTPException as he:
+        raise he
+    except Exception as e:
+        logger.error(f"处理异常: {e}")
+        return {"code": 500, "msg": str(e)}
+
+@app.post("/ocr/url")
+async def recognize_url(url: str = Body(..., embed=True)):
+    """
+    URL 识别接口
+    Body参数: {"url": "http://example.com/file.png"}
+    """
+    start_time = time.time()
+    try:
+        logger.info(f"正在下载文件: {url}")
+        resp = requests.get(url, timeout=15)
+        if resp.status_code != 200:
+            raise HTTPException(status_code=400, detail=f"下载失败，状态码: {resp.status_code}")
+
+        filename = os.path.basename(url.split("?")[0])
+        if not filename or "." not in filename:
+            content_type = resp.headers.get('Content-Type', '')
+            if 'pdf' in content_type: filename = 'temp.pdf'
+            elif 'sheet' in content_type or 'excel' in content_type:
+                # 简单判断，这里默认给 xlsx，如果是 xls 可能需要更复杂的 magic number 判断
+                filename = 'temp.xlsx'
+            elif 'presentation' in content_type: filename = 'temp.pptx'
+            else: filename = 'temp.jpg'
+
+        result_text = process_file_bytes(resp.content, filename)
+
+        cost = (time.time() - start_time) * 1000
+        return {"code": 200, "data": result_text, "cost_time_ms": cost}
+
+    except HTTPException as he:
+        raise he
+    except Exception as e:
+        logger.error(f"URL处理异常: {e}")
+        return {"code": 500, "msg": str(e)}
+
+# Start the server when the file is executed directly (e.g. `python app.py`).
+if __name__ == "__main__":
+    import uvicorn
+    # Listen on all interfaces, port 9000.
+    uvicorn.run(app, host="0.0.0.0", port=9000)
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,36 @@
+version: '3.8'
+
+services:
+  ocr-server:
+    # Option A (recommended): build the image from the current directory.
+    build: .
+    # Option B: to use a pre-built image, comment out `build` and enable `image`.
+    # image: my-ocr-service
+
+    container_name: ocr-server
+    restart: always
+    ports:
+      - "9000:9000"
+
+    environment:
+      - TZ=Asia/Shanghai
+      # Further settings can be added here later, e.g.
+      # - WORKERS=4
+
+    # --- Log management ---
+    # By default docker logs grow without bound; cap their size here.
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "50m"  # at most 50MB per log file
+        max-file: "3"    # keep at most 3 rotated log files
+
+    # --- Health check ---
+    # Calls the /health endpoint every 30 seconds.
+    healthcheck:
+      # curl must be installed in the image for this check to work.
+      test: ["CMD-SHELL", "curl -f http://localhost:9000/health || exit 1"]
+      interval: 30s      # time between checks
+      timeout: 10s       # per-check timeout
+      retries: 3         # consecutive failures before the container is marked unhealthy
+      start_period: 10s  # failures in the first 10s are not counted (time for model loading)
--- a/example/test_resume.pdf
+++ b/example/test_resume.pdf
--- a/ocr-interface_doc.html
+++ b/ocr-interface_doc.html
@@ -0,0 +1,229 @@
+<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>OCR 服务接口文档</title>
+  <style>
+    :root {
+      --primary-color: #2c3e50;
+      --accent-color: #3498db;
+      --bg-color: #f8f9fa;
+      --border-color: #e9ecef;
+      --code-bg: #f1f3f5;
+    }
+    body {
+      font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+      line-height: 1.6;
+      color: #333;
+      max-width: 1000px;
+      margin: 0 auto;
+      padding: 20px;
+      background-color: #fff;
+    }
+    h1, h2, h3 {
+      color: var(--primary-color);
+      border-bottom: 2px solid var(--border-color);
+      padding-bottom: 10px;
+      margin-top: 30px;
+    }
+    h1 { border-bottom: none; font-size: 2.5em; text-align: center; margin-bottom: 40px; }
+    .endpoint {
+      background: #fff;
+      border: 1px solid var(--border-color);
+      border-radius: 8px;
+      margin-bottom: 30px;
+      padding: 20px;
+      box-shadow: 0 2px 4px rgba(0,0,0,0.05);
+    }
+    .method {
+      display: inline-block;
+      padding: 4px 8px;
+      border-radius: 4px;
+      color: #fff;
+      font-weight: bold;
+      font-size: 0.9em;
+      margin-right: 10px;
+    }
+    .post { background-color: #2ecc71; }
+    .url { font-family: monospace; font-size: 1.1em; color: var(--primary-color); }
+
+    table {
+      width: 100%;
+      border-collapse: collapse;
+      margin: 15px 0;
+    }
+    th, td {
+      border: 1px solid var(--border-color);
+      padding: 10px;
+      text-align: left;
+    }
+    th { background-color: var(--bg-color); font-weight: 600; }
+
+    code, pre {
+      font-family: Consolas, Monaco, "Andale Mono", monospace;
+      background-color: var(--code-bg);
+      border-radius: 4px;
+    }
+    code { padding: 2px 4px; color: #e83e8c; }
+    pre {
+      padding: 15px;
+      overflow-x: auto;
+      border: 1px solid var(--border-color);
+    }
+    .note {
+      background-color: #e3f2fd;
+      border-left: 5px solid #2196f3;
+      padding: 15px;
+      margin: 20px 0;
+    }
+    .warning {
+      background-color: #fff3cd;
+      border-left: 5px solid #ffc107;
+      padding: 15px;
+      margin: 20px 0;
+    }
+    .url-box {
+      display: inline-block;
+      text-align: left;
+      background: #f8f9fa;
+      padding: 15px 20px;
+      border: 1px solid #e9ecef;
+      border-radius: 8px;
+      margin: 10px 0;
+    }
+    .url-row {
+      margin-bottom: 5px;
+    }
+    .url-row:last-child {
+      margin-bottom: 0;
+    }
+    .url-label {
+      font-weight: 600;
+      color: #2c3e50;
+      display: inline-block;
+      width: 110px;
+    }
+  </style>
+</head>
+<body>
+
+<h1>OCR 服务接口文档</h1>
+
+<div style="text-align: center; margin-bottom: 40px;">
+  <p><strong>版本:</strong> 1.1.1 (Update)</p>
+
+  <div class="url-box">
+    <div class="url-row">
+      <span class="url-label">外网 Base URL:</span>
+      <code>https://qd.zhaopinzao8dian.com/ocr-api</code>
+    </div>
+    <div class="url-row">
+      <span class="url-label">内网 Base URL:</span>
+      <code>http://127.0.0.1:9001</code>
+    </div>
+  </div>
+  <p style="margin-top: 15px;"><strong>支持格式:</strong> 图片 (JPG/PNG), PDF, Excel (.xlsx / .xls), PPT (.pptx / .ppt)</p>
+</div>
+
+<section>
+  <h2>通用响应结构</h2>
+  <p>所有接口请求成功均返回如下 JSON 结构：</p>
+  <pre>{
+  "code": 200,                // 业务状态码 (200成功, 500错误)
+  "data": "识别出的文本内容...", // 具体的文本结果
+  "cost_time_ms": 150.5,      // 处理耗时 (毫秒)
+  "msg": ""                   // 错误信息 (仅出错时存在)
+}</pre>
+</section>
+
+<section class="endpoint">
+  <h3>1. 文件上传识别</h3>
+  <div>
+    <span class="method post">POST</span>
+    <span class="url">/ocr</span>
+  </div>
+  <p>通过 <code>multipart/form-data</code> 表单上传本地文件进行识别。</p>
+
+  <h4>请求参数 (Form-Data)</h4>
+  <table>
+    <thead>
+    <tr>
+      <th width="20%">参数名</th>
+      <th width="15%">类型</th>
+      <th width="15%">必填</th>
+      <th>说明</th>
+    </tr>
+    </thead>
+    <tbody>
+    <tr>
+      <td><code>file</code></td>
+      <td>File</td>
+      <td>是</td>
+      <td>二进制文件流。<br>支持：图片, PDF, Excel(.xlsx/.xls), PPT(.pptx/.ppt)</td>
+    </tr>
+    </tbody>
+  </table>
+
+  <h4>CURL 示例 (内网)</h4>
+  <pre>curl -X POST -F "file=@/path/to/old_data.xls" http://127.0.0.1:9001/ocr</pre>
+
+  <h4>响应示例</h4>
+  <pre>{
+  "code": 200,
+  "data": "--- Sheet: Sheet1 ---\n这里是Excel中的文本内容...",
+  "cost_time_ms": 320.5
+}</pre>
+</section>
+
+<section class="endpoint">
+  <h3>2. URL 网络文件识别</h3>
+  <div>
+    <span class="method post">POST</span>
+    <span class="url">/ocr/url</span>
+  </div>
+  <p>提交一个可访问的文件链接，服务自动下载并识别。</p>
+
+  <h4>请求头 (Headers)</h4>
+  <ul>
+    <li><code>Content-Type: application/json</code></li>
+  </ul>
+
+  <h4>请求参数 (JSON Body)</h4>
+  <table>
+    <thead>
+    <tr>
+      <th width="20%">参数名</th>
+      <th width="15%">类型</th>
+      <th width="15%">必填</th>
+      <th>说明</th>
+    </tr>
+    </thead>
+    <tbody>
+    <tr>
+      <td><code>url</code></td>
+      <td>String</td>
+      <td>是</td>
+      <td>完整的下载链接 (需以 http/https 开头)</td>
+    </tr>
+    </tbody>
+  </table>
+
+  <h4>CURL 示例 (内网)</h4>
+  <pre>curl -X POST -H "Content-Type: application/json" \
+     -d '{"url": "https://example.com/legacy_slides.ppt"}' \
+     http://127.0.0.1:9001/ocr/url</pre>
+</section>
+
+<div class="note">
+
+  <div class="warning">
+    <h3>⚠️ 注意事项</h3>
+    <ul>
+      <li><strong>PDF 处理:</strong> PDF 会先转换为图片再进行 OCR，页数越多耗时越长，建议对大文件进行异步处理或分片上传。</li>
+      <li><strong>超时限制:</strong> URL 下载接口默认超时时间为 15 秒，请确保下载链接稳定。</li>
+    </ul>
+  </div>
+</div>
+
+</body>
+</html>
--- a/ream_me.txt
+++ b/ream_me.txt
@@ -0,0 +1,24 @@
+# 停止旧服务
+docker-compose down
+
+# 重新构建 (注意末尾有个点)
+docker build -t my-ocr-service:v1.0 .
+
+docker-compose up -d --force-recreate
+
+docker save -o ocr-server-v1.0.tar my-ocr-service:v1.0
+
+scp -i ~/.ssh/39.98.44.136.pem ocr-server-v1.0.tar root@39.98.44.136:/root/docker-app/ocr
+
+#39.98.44.136 服务器
+docker load -i ocr-server-v1.0.tar
+
+docker-compose up -d --force-recreate
+
+docker logs -f ocr-server
+
+# 测试
+curl -X POST -F "file=@/root/ocr-service/test_resume.pdf" http://127.0.0.1:9000/ocr
+
+# 测试
+curl -X POST -H "Content-Type: application/json" -d '{"url": "https://qd.zhaopinzao8dian.com/file/test_resume.pdf"}' http://127.0.0.1:9000/ocr/url
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+fastapi==0.95.2
+uvicorn==0.22.0
+python-multipart==0.0.6
+rapidocr_onnxruntime==1.3.8
+numpy
+Pillow
+pdf2image
+openpyxl
+python-pptx
+requests
+xlrd>=2.0.1
--- a/yml/docker-compose.yml
+++ b/yml/docker-compose.yml
@@ -0,0 +1,25 @@
+version: '3.8'
+
+services:
+  ocr-server:
+    # Runs a pre-built image; nothing is built from source here.
+    image: my-ocr-service:v1.0
+
+    container_name: ocr-server
+    restart: always
+    ports:
+      - "9000:9000"
+    environment:
+      - TZ=Asia/Shanghai
+
+    # Cap container log size: up to 3 rotated files of 50MB each.
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "50m"
+        max-file: "3"
+
+    # Probe /health every 30s; curl must be available inside the image.
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:9000/health || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 10s