乌班图 部署 Mineru 本地解析

📅 2026/6/26 8:39:15
乌班图 部署 Mineru 本地解析
## 1.1 完整树形图文件目录

```text
deploy/
└── roof-mineru/                  # MinerU OCR 服务部署根目录
    ├── roof_mineru/              # Python 服务包
    │   ├── __init__.py           # 包初始化
    │   ├── app.py                # FastAPI 入口，提供 /parse /task /result /health
    │   ├── config.py             # 环境变量、目录、日志、默认参数
    │   ├── file_utils.py         # 文件名清理、Markdown 读取、过期目录清理
    │   ├── mineru_runner.py      # MinerU CLI 调用与输出解析
    │   ├── schemas.py            # API 响应与任务模型
    │   └── task_store.py         # 任务内存状态与 result.json 持久化
    ├── docker-compose.yml        # Docker Compose 配置，默认 GPU 版
    ├── Dockerfile.cpu            # CPU/通用镜像方案
    ├── Dockerfile.gpu            # GPU/CUDA 11.8 镜像方案
    ├── requirements.txt          # Python 依赖列表
    ├── start.sh                  # 容器启动脚本
    ├── uploads/                  # 上传文件临时目录（挂载）
    ├── output/                   # MinerU 输出目录（挂载）
    ├── logs/                     # 服务日志目录（挂载）
    ├── models/                   # 模型与框架缓存持久化目录（挂载）
    └── cache/                    # /root/.cache 持久化目录（挂载）
```

## 1.2 文件说明

| 文件 | 说明 |
| --- | --- |
| `roof_mineru/app.py` | FastAPI 入口文件，定义 REST API 接口 |
| `roof_mineru/config.py` | 配置类，读取环境变量，管理目录和参数 |
| `roof_mineru/mineru_runner.py` | 调用 MinerU CLI 命令执行文档解析 |
| `roof_mineru/task_store.py` | 任务状态管理，支持内存和文件持久化 |
| `docker-compose.yml` | Docker Compose 部署配置，默认使用 GPU 镜像 |
| `Dockerfile.gpu` | GPU 版镜像构建文件，基于 CUDA 11.8 |
| `Dockerfile.cpu` | CPU 版镜像构建文件，基于 Python 3.11 |
| `start.sh` | 容器启动脚本，包含 GPU 检测和服务启动 |
| `requirements.txt` | Python 依赖：fastapi、uvicorn、mineru、paddleocr |

## 1.3 文件内容

> 注：以下文件内容由丢失了 `=`、引号、`&&`、重定向符等字符的原文还原；`requirements.txt` 中的版本约束按 `==` 还原，请与实际文件核对。

### requirements.txt

```text
fastapi==0.115.6
uvicorn[standard]==0.34.0
python-multipart==0.0.20
pydantic==2.10.5

# MinerU 官方包。包含表格识别功能
mineru[pipeline]==2.1.11

# 表格图片 OCR 识别，与 paddlepaddle-gpu 2.7 兼容
paddleocr==2.9.1
```

### docker-compose.yml

```yaml
services:
  roof-mineru:
    build:
      context: .
      dockerfile: Dockerfile.gpu
    image: roof-mineru:gpu
    container_name: roof-mineru
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [ gpu ]
    shm_size: 4gb
    ports:
      - "8001:8001"
    environment:
      TZ: Asia/Shanghai
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs
      MINERU_MODEL_SOURCE: modelscope
      TRANSFORMERS_OFFLINE: "1"
      HF_HUB_OFFLINE: "1"
      MODEL_SCOPE_API: https://www.modelscope.cn
      HF_ENDPOINT: https://hf-mirror.com
      MINERU_HOST: 0.0.0.0
      MINERU_PORT: "8001"
      MINERU_UPLOAD_DIR: /app/uploads
      MINERU_OUTPUT_DIR: /app/output
      MINERU_LOG_DIR: /app/logs
      MODELSCOPE_CACHE: /app/models/modelscope
      HF_HOME: /app/models/huggingface
      TRANSFORMERS_CACHE: /app/models/huggingface/transformers
      PADDLE_HOME: /app/models/paddle
      MINERU_DEFAULT_LANG: ch
      MINERU_DEFAULT_BACKEND: pipeline
      MINERU_DEFAULT_PARSE_METHOD: auto
      MINERU_MAX_UPLOAD_MB: "500"
      MINERU_MAX_WORKERS: "2"
      MINERU_TASK_TTL_HOURS: "24"
    volumes:
      - ./uploads:/app/uploads
      - ./output:/app/output
      - ./logs:/app/logs
      - ./models:/app/models
      - ./cache/pip:/root/.cache/pip
      - ./cache/huggingface:/root/.cache/huggingface
      - ./roof_mineru:/app/roof_mineru
      - ./start.sh:/app/start.sh
    healthcheck:
      test: ["CMD", "curl", "-f", "http://127.0.0.1:8001/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s
    logging:
      driver: json-file
      options:
        max-size: "100m"
        max-file: "10"
    ulimits:
      nofile:
        soft: 65535
        hard: 65535
```

### start.sh

```sh
#!/usr/bin/env sh
set -eu

: "${MINERU_HOST:=0.0.0.0}"
: "${MINERU_PORT:=8001}"
: "${MINERU_UPLOAD_DIR:=/app/uploads}"
: "${MINERU_OUTPUT_DIR:=/app/output}"
: "${MINERU_LOG_DIR:=/app/logs}"

mkdir -p "$MINERU_UPLOAD_DIR" "$MINERU_OUTPUT_DIR" "$MINERU_LOG_DIR"

echo "GPU 检测"
if nvidia-smi >/dev/null 2>&1; then
  echo "NVIDIA GPU 可用"
  nvidia-smi --query-gpu=name,memory.total --format=csv,noheader | head -1
  echo "测试 Paddle 能否使用 GPU..."
  if python3 -c "import paddle; paddle.set_device('gpu'); print('Paddle GPU 可用')" >/dev/null 2>&1; then
    echo "Paddle GPU 可用，使用 GPU 模式"
  else
    echo "Paddle GPU 初始化警告（cuDNN），但继续尝试使用 GPU"
  fi
else
  echo "NVIDIA GPU 不可用，使用 CPU 模式"
  export CUDA_VISIBLE_DEVICES=""
fi

echo
echo "Starting roof-mineru"
echo "host=$MINERU_HOST"
echo "port=$MINERU_PORT"
echo "upload_dir=$MINERU_UPLOAD_DIR"
echo "output_dir=$MINERU_OUTPUT_DIR"
echo "log_dir=$MINERU_LOG_DIR"
echo "backend=${MINERU_DEFAULT_BACKEND:-pipeline}"
echo "parse_method=${MINERU_DEFAULT_PARSE_METHOD:-auto}"

exec uvicorn roof_mineru.app:app --host "$MINERU_HOST" --port "$MINERU_PORT"
```

### Dockerfile.gpu

```dockerfile
# syntax=docker/dockerfile:1
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    TZ=Asia/Shanghai \
    PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ \
    PIP_TRUSTED_HOST=mirrors.aliyun.com

WORKDIR /app

# 系统依赖（阿里云 apt 镜像 + BuildKit 缓存）
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    sed -i \
        -e 's|http://archive.ubuntu.com|https://mirrors.aliyun.com|g' \
        -e 's|http://security.ubuntu.com|https://mirrors.aliyun.com|g' \
        /etc/apt/sources.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-dev \
        curl wget git tzdata \
        libglib2.0-0 libgl1 libgomp1 \
        libsm6 libxext6 libxrender1 \
        libreoffice-writer libreoffice-core \
        fonts-noto-cjk \
        fonts-wqy-microhei \
        fonts-wqy-zenhei \
    && ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && echo "Asia/Shanghai" > /etc/timezone

# PyTorch (CUDA 11.8 版本)
RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install --upgrade pip setuptools wheel \
    && pip3 install \
        torch torchvision torchaudio \
        --index-url https://download.pytorch.org/whl/cu118

# PaddlePaddle GPU (2.6.2 版本，与 CUDA 11.8 兼容)
RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install \
        paddlepaddle-gpu==2.6.2 \
        -i https://mirrors.aliyun.com/pypi/simple/ \
        -f https://www.paddlepaddle.org.cn/whl/linux/gpu/cuda118.html

# MinerU + 其他依赖（一次安装，减少层数）
COPY requirements.txt /app/
RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install \
        "mineru[pipeline]" \
        opencv-python \
        pillow numpy shapely scikit-image \
    && pip3 install -r /app/requirements.txt

COPY roof_mineru /app/roof_mineru
COPY start.sh /app/start.sh
RUN chmod +x /app/start.sh \
    && mkdir -p /app/uploads /app/output /app/logs /app/models /root/.cache

CMD ["/bin/bash", "/app/start.sh"]
```