windows上安装nvidia的视觉定位模型LocateAnything环境

📅 2026/6/28 2:57:27
windows上安装nvidia的视觉定位模型LocateAnything环境
首先,由于3B参数模型太大,显存必须至少8GB,否则安装后也起不来,或者很慢、根本无法运行,这个是前提。
我的电脑环境是 Windows 10 x64,RTX 2070 8GB显存(推荐RTX 30系及以上显卡)。下面是我成功安装的环境,可以对照我这个环境安装:

Package                   Version
------------------------- ------------
accelerate                1.5.2
aiofiles                  24.1.0
aiohappyeyeballs          2.6.2
aiohttp                   3.14.1
aiosignal                 1.4.0
altair                    6.2.2
annotated-doc             0.0.4
annotated-types           0.7.0
anyio                     4.14.1
attrs                     26.1.0
av                        17.1.0
bitarray                  3.8.2
bitsandbytes              0.49.2
bitstring                 4.4.0
blinker                   1.9.0
brotli                    1.2.0
cachetools                7.1.4
certifi                   2026.6.17
charset-normalizer        3.4.7
click                     8.4.2
colorama                  0.4.6
contourpy                 1.3.3
cycler                    0.12.1
datasets                  5.0.0
decord                    0.6.0
deepspeed                 0.16.5
dill                      0.4.1
dotenv                    0.9.9
ebmlite                   3.4.1
einops                    0.8.2
einops-exts               0.0.4
fastapi                   0.135.1
ffmpy                     1.0.0
filelock                  3.29.4
filetype                  1.2.0
fonttools                 4.63.0
frozenlist                1.8.0
fsspec                    2026.4.0
gitdb                     4.0.12
GitPython                 3.1.50
gradio                    6.5.0
gradio_client             2.0.3
groovy                    0.1.2
h11                       0.16.0
hjson                     3.1.0
httpcore                  1.0.9
httptools                 0.8.0
httpx                     0.28.1
huggingface_hub           0.36.2
idna                      3.18
ImageHash                 4.3.2
ImageIO                   2.37.3
itsdangerous              2.2.0
Jinja2                    3.1.6
joblib                    1.5.3
jsonschema                4.26.0
jsonschema-specifications 2025.9.1
kiwisolver                1.5.0
latex2mathml              3.81.0
lazy-loader               0.5
liger_kernel              0.3.1
linkify-it-py             2.1.0
lmdb                      2.2.1
locate_anything           1.0
markdown-it-py            2.2.0
markdown2                 2.5.5
MarkupSafe                3.0.3
matplotlib                3.11.0
mdit-py-plugins           0.3.3
mdurl                     0.1.2
mpmath                    1.3.0
msgpack                   1.2.1
multidict                 6.7.1
multiprocess              0.70.19
narwhals                  2.22.1
networkx                  3.6.1
ninja                     1.13.0
numpy                     1.26.4
nvidia-ml-py              13.610.43
opencv-python-headless    4.11.0.86
orjson                    3.11.9
packaging                 26.0
pandas                    3.0.3
peft                      0.12.0
pillow                    12.2.0
pip                       26.1.2
platformdirs              4.10.0
propcache                 0.5.2
protobuf                  7.35.1
psutil                    7.2.2
py-cpuinfo                9.0.0
pyarrow                   24.0.0
pydantic                  2.7.1
pydantic_core             2.18.2
pydeck                    0.9.2
pydub                     0.25.1
Pygments                  2.20.0
pyparsing                 3.3.2
python-dateutil           2.9.0.post0
python-dotenv             1.2.2
python-multipart          0.0.32
pytz                      2026.2
PyWavelets                1.9.0
PyYAML                    6.0.3
referencing               0.37.0
regex                     2026.5.9
requests                  2.34.2
rich                      15.0.0
rpds-py                   2026.5.1
safehttpx                 0.1.7
safetensors               0.8.0
scikit-image              0.26.0
scikit-learn              1.9.0
scipy                     1.17.1
semantic-version          2.10.0
sentencepiece             0.2.0
sentry-sdk                2.63.0
setuptools                70.0.0
shellingham               1.5.4
shortuuid                 1.0.13
six                       1.17.0
smmap                     5.0.3
sniffio                   1.3.1
starlette                 0.52.1
streamlit                 1.58.0
streamlit-image-select    0.6.0
svgwrite                  1.4.3
sympy                     1.14.0
tenacity                  9.1.4
threadpoolctl             3.6.0
tibs                      0.5.7
tifffile                  2026.3.3
timm                      1.0.27
tokenizers                0.22.0
toml                      0.10.2
tomlkit                   0.13.3
torch                     2.7.1+cu118
torchvision               0.22.1+cu118
tqdm                      4.68.3
transformers              4.57.1
triton                    3.1.0
typer                     0.26.8
typing_extensions         4.15.0
typing-inspection         0.4.2
tzdata                    2026.2
uc-micro-py               2.0.0
urllib3                   2.7.0
uvicorn                   0.49.0
wandb                     0.28.0
watchdog                  6.0.0
wavedrom                  2.0.3.post3
websockets                16.0
wheel                     0.47.0
xxhash                    3.8.0
yarl                      1.24.2

注意:如果你的显卡是RTX 50系,请更换为支持CUDA 12.8的PyTorch,否则无法使用CUDA。

安装步骤

首先把 HF_ENDPOINT=https://hf-mirror.com 加入到环境变量,否则运行时模型无法从huggingface下载。

conda create -n locateanything python=3.11 -y
conda activate locateanything
pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu118
pip install deepspeed-0.16.5-cp311-cp311-win_amd64.whl (需要提前下载好whl文件)
pip install triton-3.1.0-cp311-cp311-win_amd64.whl (需要提前下载好whl文件)
git clone https://github.com/NVlabs/Eagle.git
cd Eagle/Embodied

打开文件pyproject.toml,修改deepspeed版本为0.16.5,然后从源码安装:

pip install .

最后重新更换模块版本:gradio==6.5.0、setuptools==70.0.0 和 jinja2==3.1.6。

写一个gradio界面:

import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import re
import warnings
import os
import numpy as np
import torch
import time
from locateanything_worker import LocateAnythingWorker

warnings.filterwarnings("ignore")

# 优化配置
class Config:
    MODEL_PATH = "nvidia/LocateAnything-3B"
    # 性能优化参数
    MAX_IMAGE_SIZE = 640  # 最大图像尺寸
    CONFIDENCE_THRESHOLD = 0.3  # 置信度阈值
    MAX_DETECTIONS = 20  # 最大检测数量
    # 字体路径
    FONT_PATHS = [
        "C:/Windows/Fonts/simsun.ttc",
        "C:/Windows/Fonts/msyh.ttc",
        "C:/Windows/Fonts/simhei.ttf",
        "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
    ]

# 模型加载
print("正在加载模型...")
# ---------------------------------------------------------------------------
# Model loading. Runs once, at import time, so the Gradio callbacks below can
# share a single `worker` instance.
# ---------------------------------------------------------------------------
start_time = time.time()
try:
    worker = LocateAnythingWorker(Config.MODEL_PATH)

    # Best-effort switch to half precision when a GPU is present. Which
    # attributes the worker exposes is not known here, so probe for them and
    # keep running in FP32 if the conversion fails.
    if torch.cuda.is_available():
        print("尝试启用FP16半精度...")
        try:
            if hasattr(worker, "model"):
                worker.model = worker.model.half()
                print("✅ 模型已转换为FP16")
            elif hasattr(worker, "vision_tower"):
                worker.vision_tower = worker.vision_tower.half()
                if hasattr(worker, "llm"):
                    worker.llm = worker.llm.half()
                print("✅ 模型组件已转换为FP16")
        except Exception as e:
            print(f"⚠️ FP16转换失败: {e}使用FP32")

    load_time = time.time() - start_time
    print(f"✅ 模型加载完成耗时 {load_time:.2f}秒")

    # Report GPU memory (allocated / reserved) in GiB.
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"显存使用: {allocated:.2f}GB / {reserved:.2f}GB")
except Exception as e:
    print(f"❌ 模型加载失败: {e}")
    print("请确保已安装: pip install git+https://github.com/NVIDIA/LocateAnything.git")
    # Nothing below can work without the model, so stop with a failure code.
    raise SystemExit(1)


# NOTE(review): both patterns were reconstructed from a listing whose "<", ">"
# and "+" characters had been stripped; confirm them against real model output.
# Boxes are expected as <box><x1><y1><x2><y2></box> on a 0-1000 scale (the
# drawing code divides by 1000), and labels as <p>name</p>.
_BOX_PATTERN = re.compile(r"<box><(\d+)><(\d+)><(\d+)><(\d+)></box>")
_NAME_PATTERN = re.compile(r"<p>([^<]+)</p>")

# Outline colours, cycled per detection.
_BOX_COLORS = [
    "#FF0000", "#0066FF", "#00CC00", "#FF9900", "#CC00FF",
    "#FF0066", "#00CCCC", "#FFCC00", "#66FF00", "#FF66FF",
]


def _get_label_font():
    """Return the font used for box labels, loading it on first use.

    Tries each path in ``Config.FONT_PATHS`` and falls back to Pillow's
    default font. The result is cached on the function object.
    """
    cached = getattr(_get_label_font, "_font", None)
    if cached is not None:
        return cached

    font = None
    for path in Config.FONT_PATHS:
        if not os.path.exists(path):
            continue
        try:
            font = ImageFont.truetype(path, 20)
            break
        except (OSError, ImportError):
            # Unreadable font file or missing FreeType support: try the next.
            continue
    if font is None:
        font = ImageFont.load_default()

    _get_label_font._font = font
    return font


def _run_detection(image, prompts):
    """Run ``worker.detect`` for *prompts* and return its result dict.

    With several prompts, a failing batched call is retried one prompt at a
    time and the per-prompt ``"answer"`` strings are concatenated. With a
    single prompt the exception propagates to the caller.
    """
    try:
        return worker.detect(image, prompts)
    except Exception as exc:
        if len(prompts) == 1:
            raise
        print(f"⚠️ 批量检测失败: {exc}")

    merged = {"answer": ""}
    for single in prompts:
        try:
            partial = worker.detect(image, [single])
        except Exception as exc:
            # Best effort: skip this target but say so instead of hiding it.
            print(f"⚠️ 检测失败 {single}: {exc}")
            continue
        merged["answer"] += partial.get("answer", "")
    return merged


def detect_objects(image, prompt):
    """Detect the requested targets and draw labelled boxes on *image*.

    Args:
        image: PIL image. It is drawn on in place, so pass a copy if the
            original must stay untouched.
        prompt: Target description; several targets may be comma separated.

    Returns:
        The annotated PIL image, *image* unchanged when nothing was detected,
        or ``None`` when *image* is ``None``.
    """
    if image is None:
        return None

    # Boxes are mapped back onto the original resolution, so remember it.
    orig_w, orig_h = image.size

    # Downscale large inputs before inference.
    processed_img = image.copy()
    max_size = Config.MAX_IMAGE_SIZE
    if max(orig_w, orig_h) > max_size:
        ratio = max_size / max(orig_w, orig_h)
        # Clamp to 1px so extreme aspect ratios cannot produce a zero dimension.
        new_w = max(1, int(orig_w * ratio))
        new_h = max(1, int(orig_h * ratio))
        processed_img = processed_img.resize((new_w, new_h), Image.Resampling.LANCZOS)
        print(f"图像缩放: {orig_w}x{orig_h} -> {new_w}x{new_h}")

    prompts = [p.strip() for p in prompt.split(",") if p.strip()]
    if not prompts:
        # Prompt contained only separators or whitespace: nothing to look for.
        print("⚠️ 未检测到目标")
        return image

    if len(prompts) > 1:
        print(f"检测多个目标: {prompts}")
    else:
        print(f"检测目标: {prompt}")
    result = _run_detection(processed_img, prompts)

    answer = result.get("answer", "")
    print(f"模型输出: {answer[:200]}...")  # truncated, for debugging

    boxes = _BOX_PATTERN.findall(answer)
    if not boxes:
        print("⚠️ 未检测到目标")
        return image
    print(f"✅ 检测到 {len(boxes)} 个目标")

    draw = ImageDraw.Draw(image)
    font = _get_label_font()
    names = _NAME_PATTERN.findall(answer)

    for i, box in enumerate(boxes[:Config.MAX_DETECTIONS]):
        x1, y1, x2, y2 = map(int, box)

        # Coordinates are on a 0-1000 scale, so they map straight onto the
        # original size; the resized dimensions cancel out.
        px1 = int(x1 / 1000 * orig_w)
        py1 = int(y1 / 1000 * orig_h)
        px2 = int(x2 / 1000 * orig_w)
        py2 = int(y2 / 1000 * orig_h)

        # Keep the box inside the image.
        px1 = max(0, min(px1, orig_w))
        py1 = max(0, min(py1, orig_h))
        px2 = max(0, min(px2, orig_w))
        py2 = max(0, min(py2, orig_h))
        # Pillow rejects rectangles whose second corner precedes the first.
        px1, px2 = sorted((px1, px2))
        py1, py2 = sorted((py1, py2))

        color = _BOX_COLORS[i % len(_BOX_COLORS)]
        draw.rectangle([px1, py1, px2, py2], outline=color, width=3)

        # Prefer the name reported by the model; otherwise number the prompt.
        if i < len(names):
            label = names[i][:20]
        else:
            label = f"{prompt}_{i + 1}"

        try:
            bbox = draw.textbbox((px1, py1 - 25), label, font=font)
            text_width = bbox[2] - bbox[0]
            draw.rectangle([px1, py1 - 25, px1 + text_width + 4, py1], fill=color)
            draw.text((px1 + 2, py1 - 23), label, fill="white", font=font)
        except Exception:
            # Deliberate best effort: if the font cannot measure or render the
            # label, draw it plainly rather than fail the whole request.
            draw.text((px1, py1 - 20), label, fill=color)

    return image


def process_image(image, prompt):
    """Run detection on an uploaded image and return the annotated copy.

    Args:
        image: numpy array or PIL image from the Gradio input.
        prompt: Target description passed on to ``detect_objects``.

    Returns:
        The annotated PIL image, or ``None`` when *image* is ``None``.
    """
    if image is None:
        return None

    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    # Work on a copy so the caller's image is not drawn on.
    result_img = detect_objects(image.copy(), prompt)

    # Release cached GPU memory after each request.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return result_img


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
with gr.Blocks(title="目标检测标注工具", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 目标检测标注工具
        ### ⚡ RTX 2070 8GB 优化版 - 使用 detect 方法
        支持检测人物、车辆、动物、物体等基于LocateAnything模型
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(
                label="上传图片",
                type="numpy",
                height=400,
            )
            prompt_input = gr.Textbox(
                label="请输入检测目标",
                placeholder="例如person 或 person, car, dog多个用逗号分隔",
                value="person",
                lines=2,
            )
            with gr.Row():
                detect_btn = gr.Button("开始检测", variant="primary", size="lg")
                clear_btn = gr.Button("清空", variant="secondary")
            status_text = gr.Textbox(
                label="⏱️ 状态信息",
                value="就绪",
                interactive=False,
            )
            # Example prompts; each row fills the single prompt textbox.
            gr.Examples(
                examples=[
                    ["person"],
                    ["car"],
                    ["dog"],
                    ["cat"],
                    ["person, car"],
                    ["bicycle"],
                    ["traffic light"],
                    ["chair, table"],
                ],
                inputs=[prompt_input],
                label="示例提示词支持英文",
            )

        with gr.Column(scale=1):
            output_image = gr.Image(
                label="标注结果",
                type="pil",
                height=400,
            )

    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
                ### 使用说明
                1. 上传一张图片建议小于1024x1024
                2. 输入要检测的目标支持英文多个用逗号分隔
                3. 点击开始检测按钮
                4. 等待推理完成约30秒-2分钟

                ### ⚡ 性能优化
                - **FP16半精度**: 显存占用减少50%
                - **图像缩放**: 自动缩放到640px
                - **detect方法**: 使用专用检测接口
                - **显存清理**: 每次推理后自动清理

                ### 支持功能
                - 单目标检测person
                - 多目标检测person, car, dog
                - 场景文字检测特殊提示词
                - 指代理解如the red car
                """
            )

    def process_with_status(image, prompt):
        """Validate the inputs, run detection and return (image, status text)."""
        import traceback

        if image is None:
            return None, "❌ 请上传图片"
        if not prompt or not prompt.strip():
            return image, "❌ 请输入检测目标"

        started = time.time()
        try:
            result = process_image(image, prompt)
            elapsed = time.time() - started
            if result is None:
                return image, "❌ 处理失败"
            return result, f"✅ 完成耗时 {elapsed:.1f}秒"
        except Exception as e:
            # UI boundary: log the full traceback and show a short message.
            error_msg = traceback.format_exc()
            print(f"❌ 错误: {error_msg}")
            return image, f"❌ 错误: {str(e)[:100]}"

    # The button and pressing Enter in the prompt box trigger the same handler.
    detect_btn.click(
        fn=process_with_status,
        inputs=[input_image, prompt_input],
        outputs=[output_image, status_text],
    )
    prompt_input.submit(
        fn=process_with_status,
        inputs=[input_image, prompt_input],
        outputs=[output_image, status_text],
    )
    clear_btn.click(
        fn=lambda: (None, None, "已清空"),
        inputs=[],
        outputs=[input_image, output_image, status_text],
    )


if __name__ == "__main__":
    print("=" * 60)
    print("启动优化版目标检测工具")
    print("=" * 60)
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"显存: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")
        print(f"CUDA版本: {torch.version.cuda}")
    else:
        print("使用CPU模式速度较慢")
    print(f"最大图像尺寸: {Config.MAX_IMAGE_SIZE}px")
    print("=" * 60)

    # NOTE(review): 0.0.0.0 listens on every network interface; use 127.0.0.1
    # if the demo should only be reachable from this machine.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )

# Article caption that followed the listing: "运行效果" (result of running the app).