This commit is contained in:
2026-01-12 22:49:25 +08:00
commit 48dbaeacd5
8 changed files with 584 additions and 0 deletions

28
Dockerfile Normal file
View File

@@ -0,0 +1,28 @@
FROM python:3.10-slim
WORKDIR /app
# 1. Use the Aliyun PyPI mirror to speed up package downloads
RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
# 2. Install system dependencies
#    - libgl1 / libglib2.0-0: shared libraries needed by cv2
#    - poppler-utils: PDF rendering backend used by pdf2image
#    - catdoc: provides the catppt tool used for legacy .ppt files
#    - curl: used by the container healthcheck (curl -f http://localhost:9000/health)
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libgomp1 poppler-utils catdoc libgl1 libglib2.0-0 curl && \
    rm -rf /var/lib/apt/lists/*
# 3. Install Python libraries
RUN pip install --no-cache-dir \
    fastapi uvicorn websockets \
    rapidocr-onnxruntime \
    numpy requests \
    python-pptx openpyxl xlrd \
    pdf2image Pillow python-multipart
# 4. Copy the application code
COPY . .
EXPOSE 9000
# The entry point must be app.py
CMD ["python", "app.py"]

231
app.py Normal file
View File

@@ -0,0 +1,231 @@
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Body
from rapidocr_onnxruntime import RapidOCR
from pdf2image import convert_from_bytes
from PIL import Image
import numpy as np
import io
import time
import logging
import requests
import os
import tempfile
import subprocess
# 新增库引入
from openpyxl import load_workbook
from pptx import Presentation
import xlrd # 用于处理 .xls
# Configure logging: INFO level on the root logger, plus a module-level logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(title="OCR Service")
# 1. Initialise the OCR model once, globally, at import time. The resulting
#    ``ocr_engine`` is the single instance used by the processing functions.
try:
    logger.info("正在加载OCR模型...")
    ocr_engine = RapidOCR()
    logger.info("OCR模型加载完成")
except Exception as e:
    # Record the full traceback, then abort start-up by re-raising the
    # original exception unchanged.
    logger.exception("模型加载失败: %s", e)
    raise
# --- Core processing functions ---
def ocr_single_image(img_np):
    """Run OCR on a single image array and return the recognised text.

    Args:
        img_np: The image as a NumPy array; it is handed to the global
            ``ocr_engine`` unchanged.

    Returns:
        The recognised text entries joined with newlines. An empty string is
        returned when the engine reports no result or when recognition
        raises (the error is logged, not propagated).
    """
    try:
        detections, _ = ocr_engine(img_np)
        # The text of each detection is stored at index 1.
        lines = [entry[1] for entry in detections] if detections else []
        return "\n".join(lines)
    except Exception as e:
        # Best effort: failures are logged and reported as empty text.
        logger.error(f"单图识别出错: {e}")
        return ""
def extract_from_excel(file_bytes):
    """Extract the text of every worksheet in an .xlsx workbook.

    Args:
        file_bytes: Raw bytes of the .xlsx file.

    Returns:
        One ``--- Sheet: <title> ---`` header per worksheet followed by one
        line per non-empty row (cell values separated by spaces), all joined
        with newlines.

    Raises:
        HTTPException: 400 when the workbook cannot be loaded or read.
    """
    try:
        workbook = load_workbook(io.BytesIO(file_bytes), data_only=True)
        lines = []
        for worksheet in workbook.worksheets:
            lines.append(f"--- Sheet: {worksheet.title} ---")
            for values in worksheet.iter_rows(values_only=True):
                # Skip empty cells; keep everything else in column order.
                joined = " ".join(str(value) for value in values if value is not None)
                if not joined.strip():
                    continue
                lines.append(joined)
        return "\n".join(lines)
    except Exception as e:
        logger.error(f"Excel(.xlsx)解析失败: {e}")
        raise HTTPException(status_code=400, detail=f"xlsx解析失败: {str(e)}")
def _xls_cell_text(cell, datemode):
    """Render one xlrd cell as human-readable text.

    xlrd reports every numeric cell as a float and every date cell as a
    float serial number, so ``str(cell.value)`` would turn ``42`` into
    ``"42.0"`` and a date into something like ``"45000.0"``.
    """
    value = cell.value
    if cell.ctype == xlrd.XL_CELL_NUMBER:
        # Whole numbers are printed without the spurious ".0"; the magnitude
        # guard keeps very large floats in their normal float notation.
        if value.is_integer() and abs(value) < 1e15:
            return str(int(value))
        return str(value)
    if cell.ctype == xlrd.XL_CELL_DATE:
        try:
            moment = xlrd.xldate_as_datetime(value, datemode)
        except (ValueError, OverflowError):
            # Unconvertible serial number: fall back to the raw value.
            return str(value)
        # A serial below 1 carries only a time of day, no calendar date.
        if 0 <= value < 1:
            return str(moment.time())
        return str(moment)
    return str(value)


def extract_from_xls(file_bytes):
    """Extract the text of every sheet in a legacy Excel (.xls) workbook.

    Args:
        file_bytes: Raw bytes of the .xls file (xlrd reads them from memory).

    Returns:
        One ``--- Sheet: <name> ---`` header per sheet followed by one line
        per non-empty row (cell values separated by spaces), all joined with
        newlines. Whole numbers are rendered without a trailing ``.0`` and
        date cells as date/time text.

    Raises:
        HTTPException: 400 when the workbook cannot be opened or read.
    """
    try:
        wb = xlrd.open_workbook(file_contents=file_bytes)
        texts = []
        for sheet in wb.sheets():
            texts.append(f"--- Sheet: {sheet.name} ---")
            for row_idx in range(sheet.nrows):
                row_text = " ".join(
                    _xls_cell_text(cell, wb.datemode)
                    for cell in sheet.row(row_idx)
                    if cell.value not in ('', None)
                )
                if row_text.strip():
                    texts.append(row_text)
        return "\n".join(texts)
    except Exception as e:
        logger.error(f"Excel(.xls)解析失败: {e}")
        raise HTTPException(status_code=400, detail=f"xls解析失败: {str(e)}")
def extract_from_ppt(file_bytes):
    """Extract the text of every slide in a .pptx presentation.

    Args:
        file_bytes: Raw bytes of the .pptx file.

    Returns:
        For each slide that has text, a ``--- Slide <n> ---`` header followed
        by the text of its shapes (one shape per line); the pieces are joined
        with blank lines. Slides without text are omitted.

    Raises:
        HTTPException: 400 when the presentation cannot be parsed.
    """
    try:
        deck = Presentation(io.BytesIO(file_bytes))
        sections = []
        for number, slide in enumerate(deck.slides, start=1):
            # Only shapes that expose a non-empty ``text`` contribute.
            slide_texts = [
                shape.text
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text
            ]
            if not slide_texts:
                continue
            sections.append(f"--- Slide {number} ---")
            sections.append("\n".join(slide_texts))
        return "\n\n".join(sections)
    except Exception as e:
        logger.error(f"PPT(.pptx)解析失败: {e}")
        raise HTTPException(status_code=400, detail="pptx解析失败")
def extract_from_ppt_legacy(file_bytes, timeout=60):
    """Extract text from a legacy PowerPoint (.ppt) file via ``catppt``.

    .ppt is a binary OLE format, so the system tool ``catppt`` (shipped in
    the ``catdoc`` package) is used. It needs a file path, so the bytes are
    written to a temporary file that is always removed afterwards.

    Args:
        file_bytes: Raw bytes of the .ppt file.
        timeout: Maximum number of seconds ``catppt`` may run before it is
            killed, so a malformed file cannot hang the request.

    Returns:
        The extracted text, decoded as UTF-8 (undecodable bytes are dropped).

    Raises:
        HTTPException: 400 when the tool fails, times out or is unavailable.
    """
    temp_file = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as tmp:
            # Remember the path before writing so the file is cleaned up
            # even if the write itself fails.
            temp_file = tmp.name
            tmp.write(file_bytes)
        # "-d utf-8" forces UTF-8 output so the decode below matches what
        # the tool emits, independent of the container locale.
        completed = subprocess.run(
            ['catppt', '-d', 'utf-8', temp_file],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
        if completed.returncode != 0:
            stderr_text = completed.stderr.decode('utf-8', errors='replace')
            logger.error(f"catppt error: {stderr_text}")
            raise RuntimeError("解析进程异常退出")
        return completed.stdout.decode('utf-8', errors='ignore')
    except Exception as e:
        logger.error(f"PPT(.ppt)解析失败: {e}")
        raise HTTPException(status_code=400, detail="ppt解析失败 (请检查文件是否损坏)")
    finally:
        # Always remove the temporary file.
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)
def process_file_bytes(file_bytes, filename):
    """Dispatch raw file bytes to the matching extractor by file extension.

    Args:
        file_bytes: Raw content of the file.
        filename: Name used only to pick the handler (case-insensitive).
            ``None`` or an empty name is treated as an image.

    Returns:
        The extracted or recognised text.

    Raises:
        HTTPException: 500 when PDF conversion fails; 400 when an office
            document cannot be parsed or the bytes are not a readable image.
    """
    # A missing filename must not crash on ``.lower()``.
    filename = (filename or "").lower()

    # 1. PDF: render every page to an image, then OCR page by page.
    if filename.endswith('.pdf'):
        try:
            pages = convert_from_bytes(file_bytes, dpi=200, fmt='jpeg')
            texts = [
                f"--- Page {number} ---\n" + ocr_single_image(np.array(page))
                for number, page in enumerate(pages, start=1)
            ]
            return "\n\n".join(texts)
        except Exception as e:
            # Log the cause; the client only receives a generic message.
            logger.error(f"PDF解析失败: {e}")
            raise HTTPException(status_code=500, detail="PDF解析失败")

    # 2. Excel. ".xlsx" is tested first on purpose; ".xls" is a separate format.
    if filename.endswith('.xlsx'):
        return extract_from_excel(file_bytes)
    if filename.endswith('.xls'):
        return extract_from_xls(file_bytes)

    # 3. PowerPoint.
    if filename.endswith('.pptx'):
        return extract_from_ppt(file_bytes)
    if filename.endswith('.ppt'):
        return extract_from_ppt_legacy(file_bytes)

    # 4. Anything else is treated as an image.
    try:
        img = Image.open(io.BytesIO(file_bytes))
        # Palette, 1-bit, CMYK and similar modes do not map to a plain pixel
        # array; normalise them to RGB, the mode PDF pages are OCR'd in.
        if img.mode not in ("RGB", "RGBA", "L"):
            img = img.convert("RGB")
        return ocr_single_image(np.array(img))
    except Exception:
        raise HTTPException(status_code=400, detail=f"不支持的文件格式: {filename}")
# --- API endpoints ---
@app.get("/health")
def health_check():
    """Health endpoint: unconditionally report the service status as ok."""
    status = {"status": "ok"}
    return status
@app.post("/ocr")
async def recognize_file(file: UploadFile = File(...)):
    """File upload endpoint.

    Supports images, PDF, Excel (.xlsx/.xls) and PPT (.pptx/.ppt).

    Args:
        file: The uploaded file (multipart form field ``file``).

    Returns:
        ``{"code": 200, "data": <text>, "cost_time_ms": <float>}`` on
        success, or ``{"code": 500, "msg": <error>}`` for unexpected errors.

    Raises:
        HTTPException: Propagated unchanged from the processing functions.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    try:
        content = await file.read()
        # The upload may carry no filename; fall back to an empty name.
        result_text = process_file_bytes(content, file.filename or "")
        cost = (time.perf_counter() - start_time) * 1000
        return {"code": 200, "data": result_text, "cost_time_ms": cost}
    except HTTPException:
        raise
    except Exception as e:
        # Keep the traceback in the log; the client only gets the message.
        logger.exception("处理异常: %s", e)
        return {"code": 500, "msg": str(e)}
# Leading bytes of OLE2 compound files (the legacy .xls / .ppt container).
_OLE2_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"


def _infer_filename(url_path, content_type, content):
    """Pick a filename whose extension selects the right parser.

    The name from the URL path wins when it has an extension. Otherwise the
    Content-Type header decides, and the OLE2 signature separates legacy
    .xls/.ppt files from their zip-based .xlsx/.pptx successors.
    """
    name = os.path.basename(url_path)
    if name and "." in name:
        return name
    content_type = content_type.lower()
    is_legacy = content.startswith(_OLE2_MAGIC)
    if 'pdf' in content_type:
        return 'temp.pdf'
    if 'sheet' in content_type or 'excel' in content_type:
        return 'temp.xls' if is_legacy else 'temp.xlsx'
    if 'presentation' in content_type or 'powerpoint' in content_type:
        return 'temp.ppt' if is_legacy else 'temp.pptx'
    return 'temp.jpg'


@app.post("/ocr/url")
async def recognize_url(url: str = Body(..., embed=True)):
    """URL recognition endpoint.

    Body: ``{"url": "http://example.com/file.png"}``

    Downloads the file (15 second timeout) and processes it like an upload.

    Returns:
        ``{"code": 200, "data": <text>, "cost_time_ms": <float>}`` on
        success, or ``{"code": 500, "msg": <error>}`` for unexpected errors.

    Raises:
        HTTPException: 400 when the URL is not http/https or the download
            does not return status 200; otherwise propagated unchanged from
            the processing functions.
    """
    from urllib.parse import urlparse

    from fastapi.concurrency import run_in_threadpool

    start_time = time.perf_counter()
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https") or not parsed.netloc:
            raise HTTPException(status_code=400, detail="无效的URL,仅支持 http/https")
        # NOTE(review): the target is caller-supplied. If untrusted clients
        # can reach this endpoint, restrict the allowed hosts (SSRF risk).
        logger.info(f"正在下载文件: {url}")
        # requests is blocking; run it in the thread pool so the event loop
        # (and /health) stays responsive during the download.
        resp = await run_in_threadpool(requests.get, url, timeout=15)
        if resp.status_code != 200:
            raise HTTPException(status_code=400, detail=f"下载失败,状态码: {resp.status_code}")
        # Use the parsed path so query strings and #fragments never end up
        # in the filename, and host-only URLs reach the Content-Type fallback.
        filename = _infer_filename(
            parsed.path, resp.headers.get('Content-Type', ''), resp.content
        )
        result_text = process_file_bytes(resp.content, filename)
        cost = (time.perf_counter() - start_time) * 1000
        return {"code": 200, "data": result_text, "cost_time_ms": cost}
    except HTTPException:
        raise
    except Exception as e:
        # Keep the traceback in the log; the client only gets the message.
        logger.exception("URL处理异常: %s", e)
        return {"code": 500, "msg": str(e)}
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 9000.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=9000,
    )

36
docker-compose.yml Normal file
View File

@@ -0,0 +1,36 @@
version: '3.8'
services:
  ocr-server:
    # Option A: build the image from the current directory (recommended)
    build: .
    # Option B: to use a pre-built image, comment out `build` and enable `image`
    # image: my-ocr-service
    container_name: ocr-server
    restart: always
    ports:
      - "9000:9000"
    environment:
      - TZ=Asia/Shanghai
      # Further settings can be added here later
      # - WORKERS=4
    # --- Log management ---
    # By default docker logs grow without limit, so cap their size here
    logging:
      driver: "json-file"
      options:
        max-size: "50m"   # at most 50 MB per log file
        max-file: "3"     # keep at most 3 log files (rotation)
    # --- Health check ---
    # Call the /health endpoint every 30 seconds
    healthcheck:
      # curl must be installed in the image by the Dockerfile
      test: ["CMD-SHELL", "curl -f http://localhost:9000/health || exit 1"]
      interval: 30s       # time between checks
      timeout: 10s        # per-check timeout
      retries: 3          # marked unhealthy after 3 consecutive failures
      start_period: 10s   # failures in the first 10 s are ignored, leaving time for model loading

BIN
example/test_resume.pdf Normal file

Binary file not shown.

229
ocr-interface_doc.html Normal file
View File

@@ -0,0 +1,229 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OCR 服务接口文档</title>
<style>
:root {
--primary-color: #2c3e50;
--accent-color: #3498db;
--bg-color: #f8f9fa;
--border-color: #e9ecef;
--code-bg: #f1f3f5;
}
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 1000px;
margin: 0 auto;
padding: 20px;
background-color: #fff;
}
h1, h2, h3 {
color: var(--primary-color);
border-bottom: 2px solid var(--border-color);
padding-bottom: 10px;
margin-top: 30px;
}
h1 { border-bottom: none; font-size: 2.5em; text-align: center; margin-bottom: 40px; }
.endpoint {
background: #fff;
border: 1px solid var(--border-color);
border-radius: 8px;
margin-bottom: 30px;
padding: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.05);
}
.method {
display: inline-block;
padding: 4px 8px;
border-radius: 4px;
color: #fff;
font-weight: bold;
font-size: 0.9em;
margin-right: 10px;
}
.post { background-color: #2ecc71; }
.url { font-family: monospace; font-size: 1.1em; color: var(--primary-color); }
table {
width: 100%;
border-collapse: collapse;
margin: 15px 0;
}
th, td {
border: 1px solid var(--border-color);
padding: 10px;
text-align: left;
}
th { background-color: var(--bg-color); font-weight: 600; }
code, pre {
font-family: Consolas, Monaco, "Andale Mono", monospace;
background-color: var(--code-bg);
border-radius: 4px;
}
code { padding: 2px 4px; color: #e83e8c; }
pre {
padding: 15px;
overflow-x: auto;
border: 1px solid var(--border-color);
}
.note {
background-color: #e3f2fd;
border-left: 5px solid #2196f3;
padding: 15px;
margin: 20px 0;
}
.warning {
background-color: #fff3cd;
border-left: 5px solid #ffc107;
padding: 15px;
margin: 20px 0;
}
.url-box {
display: inline-block;
text-align: left;
background: #f8f9fa;
padding: 15px 20px;
border: 1px solid #e9ecef;
border-radius: 8px;
margin: 10px 0;
}
.url-row {
margin-bottom: 5px;
}
.url-row:last-child {
margin-bottom: 0;
}
.url-label {
font-weight: 600;
color: #2c3e50;
display: inline-block;
width: 110px;
}
</style>
</head>
<body>
<h1>OCR 服务接口文档</h1>
<div style="text-align: center; margin-bottom: 40px;">
<p><strong>版本:</strong> 1.1.1 (Update)</p>
<div class="url-box">
<div class="url-row">
<span class="url-label">外网 Base URL:</span>
<code>https://qd.zhaopinzao8dian.com/ocr-api</code>
</div>
<div class="url-row">
<span class="url-label">内网 Base URL:</span>
<code>http://127.0.0.1:9001</code>
</div>
</div>
<p style="margin-top: 15px;"><strong>支持格式:</strong> 图片 (JPG/PNG), PDF, Excel (.xlsx / .xls), PPT (.pptx / .ppt)</p>
</div>
<section>
<h2>通用响应结构</h2>
<p>所有接口请求成功均返回如下 JSON 结构:</p>
<pre>{
"code": 200, // 业务状态码 (200成功, 500错误)
"data": "识别出的文本内容...", // 具体的文本结果
"cost_time_ms": 150.5, // 处理耗时 (毫秒)
"msg": "" // 错误信息 (仅出错时存在)
}</pre>
</section>
<section class="endpoint">
<h3>1. 文件上传识别</h3>
<div>
<span class="method post">POST</span>
<span class="url">/ocr</span>
</div>
<p>通过 <code>multipart/form-data</code> 表单上传本地文件进行识别。</p>
<h4>请求参数 (Form-Data)</h4>
<table>
<thead>
<tr>
<th width="20%">参数名</th>
<th width="15%">类型</th>
<th width="15%">必填</th>
<th>说明</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>file</code></td>
<td>File</td>
<td>是</td>
<td>二进制文件流。<br>支持:图片, PDF, Excel(.xlsx/.xls), PPT(.pptx/.ppt)</td>
</tr>
</tbody>
</table>
<h4>CURL 示例 (内网)</h4>
<pre>curl -X POST -F "file=@/path/to/old_data.xls" http://127.0.0.1:9001/ocr</pre>
<h4>响应示例</h4>
<pre>{
"code": 200,
"data": "--- Sheet: Sheet1 ---\n这里是Excel中的文本内容...",
"cost_time_ms": 320.5
}</pre>
</section>
<section class="endpoint">
<h3>2. URL 网络文件识别</h3>
<div>
<span class="method post">POST</span>
<span class="url">/ocr/url</span>
</div>
<p>提交一个可访问的文件链接,服务自动下载并识别。</p>
<h4>请求头 (Headers)</h4>
<ul>
<li><code>Content-Type: application/json</code></li>
</ul>
<h4>请求参数 (JSON Body)</h4>
<table>
<thead>
<tr>
<th width="20%">参数名</th>
<th width="15%">类型</th>
<th width="15%">必填</th>
<th>说明</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>url</code></td>
<td>String</td>
<td>是</td>
<td>完整的下载链接 (需以 http/https 开头)</td>
</tr>
</tbody>
</table>
<h4>CURL 示例 (内网)</h4>
<pre>curl -X POST -H "Content-Type: application/json" \
-d '{"url": "https://example.com/legacy_slides.ppt"}' \
http://127.0.0.1:9001/ocr/url</pre>
</section>
<div class="note">
<div class="warning">
<h3>⚠️ 注意事项</h3>
<ul>
<li><strong>PDF 处理:</strong> PDF 会先转换为图片再进行 OCR,页数越多耗时越长,建议对大文件进行异步处理或分片上传。</li>
<li><strong>超时限制:</strong> URL 下载接口默认超时时间为 15 秒,请确保下载链接稳定。</li>
</ul>
</div>
</div>
</body>
</html>

24
ream_me.txt Normal file
View File

@@ -0,0 +1,24 @@
# 停止旧服务
docker-compose down
# 重新构建 (注意末尾有个点)
docker build -t my-ocr-service:v1.0 .
docker-compose up -d --force-recreate
docker save -o ocr-server-v1.0.tar my-ocr-service:v1.0
scp -i ~/.ssh/39.98.44.136.pem ocr-server-v1.0.tar root@39.98.44.136:/root/docker-app/ocr
# 以下命令在 39.98.44.136 服务器上执行
docker load -i ocr-server-v1.0.tar
docker-compose up -d --force-recreate
docker logs -f ocr-server
# 测试:文件上传接口 (/ocr)
curl -X POST -F "file=@/root/ocr-service/test_resume.pdf" http://127.0.0.1:9000/ocr
# 测试:URL 识别接口 (/ocr/url)
curl -X POST -H "Content-Type: application/json" -d '{"url": "https://qd.zhaopinzao8dian.com/file/test_resume.pdf"}' http://127.0.0.1:9000/ocr/url

11
requirements.txt Normal file
View File

@@ -0,0 +1,11 @@
fastapi==0.95.2
uvicorn==0.22.0
python-multipart==0.0.6
rapidocr_onnxruntime==1.3.8
numpy
Pillow
pdf2image
openpyxl
python-pptx
requests
xlrd>=2.0.1

25
yml/docker-compose.yml Normal file
View File

@@ -0,0 +1,25 @@
version: '3.8'
services:
  ocr-server:
    # Runs the pre-built image; nothing is built from source here
    image: my-ocr-service:v1.0
    container_name: ocr-server
    restart: always
    ports:
      - "9000:9000"
    environment:
      TZ: Asia/Shanghai
    # Cap the json-file logs: 3 files of at most 50 MB each
    logging:
      driver: "json-file"
      options:
        max-size: "50m"
        max-file: "3"
    # Probe the /health endpoint with curl every 30 seconds
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9000/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s