Spaces:
Running
on
Zero
Running
on
Zero
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -20,19 +20,11 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
|
| 20 |
model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
|
| 21 |
model = model.eval().cuda()
|
| 22 |
|
| 23 |
-
|
| 24 |
-
"Default": {"base_size": 1024, "image_size": 768, "crop_mode": True},
|
| 25 |
-
"Quality": {"base_size": 1280, "image_size": 960, "crop_mode": True},
|
| 26 |
-
"Fast": {"base_size": 1024, "image_size": 640, "crop_mode": True},
|
| 27 |
-
"No Crop": {"base_size": 1024, "image_size": 768, "crop_mode": False},
|
| 28 |
-
"Small": {"base_size": 768, "image_size": 512, "crop_mode": False},
|
| 29 |
-
}
|
| 30 |
|
| 31 |
TASK_PROMPTS = {
|
| 32 |
"π Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
|
| 33 |
"π Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
|
| 34 |
-
"πΌοΈ OCR Image": {"prompt": "<image>\n<|grounding|>OCR this image.", "has_grounding": True},
|
| 35 |
-
"π Parse Figure": {"prompt": "<image>\nParse the figure.", "has_grounding": False},
|
| 36 |
"π Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
|
| 37 |
"π Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
|
| 38 |
"βοΈ Custom": {"prompt": "", "has_grounding": False}
|
|
@@ -114,7 +106,7 @@ def embed_images(markdown, crops):
|
|
| 114 |
return markdown
|
| 115 |
|
| 116 |
@spaces.GPU(duration=90)
|
| 117 |
-
def process_image(image,
|
| 118 |
if image is None:
|
| 119 |
return "Error: Upload an image", "", "", None, []
|
| 120 |
if task in ["βοΈ Custom", "π Locate"] and not custom_prompt.strip():
|
|
@@ -124,7 +116,7 @@ def process_image(image, mode, task, custom_prompt):
|
|
| 124 |
image = image.convert('RGB')
|
| 125 |
image = ImageOps.exif_transpose(image)
|
| 126 |
|
| 127 |
-
config =
|
| 128 |
|
| 129 |
if task == "βοΈ Custom":
|
| 130 |
prompt = f"<image>\n{custom_prompt.strip()}"
|
|
@@ -181,7 +173,7 @@ def process_image(image, mode, task, custom_prompt):
|
|
| 181 |
return cleaned, markdown, result, img_out, crops
|
| 182 |
|
| 183 |
@spaces.GPU(duration=90)
|
| 184 |
-
def process_pdf(path,
|
| 185 |
doc = fitz.open(path)
|
| 186 |
total_pages = len(doc)
|
| 187 |
if page_num < 1 or page_num > total_pages:
|
|
@@ -192,15 +184,15 @@ def process_pdf(path, mode, task, custom_prompt, page_num):
|
|
| 192 |
img = Image.open(BytesIO(pix.tobytes("png")))
|
| 193 |
doc.close()
|
| 194 |
|
| 195 |
-
return process_image(img,
|
| 196 |
|
| 197 |
-
def process_file(path,
|
| 198 |
if not path:
|
| 199 |
return "Error: Upload a file", "", "", None, []
|
| 200 |
if path.lower().endswith('.pdf'):
|
| 201 |
-
return process_pdf(path,
|
| 202 |
else:
|
| 203 |
-
return process_image(Image.open(path),
|
| 204 |
|
| 205 |
def toggle_prompt(task):
|
| 206 |
if task == "βοΈ Custom":
|
|
@@ -249,7 +241,6 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 249 |
gr.Markdown("""
|
| 250 |
# π DeepSeek-OCR-2 Demo
|
| 251 |
**Convert documents to markdown, extract text, parse figures, and locate specific content with bounding boxes.**
|
| 252 |
-
|
| 253 |
**It's powered by DeepEncoder v2. It achieves 91.09% on OmniDocBench (+3.73% over v1).**
|
| 254 |
|
| 255 |
**Hope this tool was helpful! If so, a quick like ❤️ would mean a lot :)**
|
|
@@ -260,7 +251,6 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 260 |
file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
|
| 261 |
input_img = gr.Image(label="Input Image", type="pil", height=300)
|
| 262 |
page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
|
| 263 |
-
mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Default", label="Mode")
|
| 264 |
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="π Markdown", label="Task")
|
| 265 |
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 266 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
|
@@ -280,27 +270,18 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 280 |
|
| 281 |
gr.Examples(
|
| 282 |
examples=[
|
| 283 |
-
["examples/ocr.jpg", "
|
| 284 |
-
["examples/reachy-mini.jpg", "
|
| 285 |
],
|
| 286 |
-
inputs=[input_img,
|
| 287 |
cache_examples=False
|
| 288 |
)
|
| 289 |
|
| 290 |
with gr.Accordion("βΉοΈ Info", open=False):
|
| 291 |
gr.Markdown("""
|
| 292 |
-
### Modes
|
| 293 |
-
- **Default**: 1024 base + 768 tiles with cropping - Recommended for most use cases
|
| 294 |
-
- **Quality**: 1280 base + 960 tiles with cropping - Higher quality, slower
|
| 295 |
-
- **Fast**: 1024 base + 640 tiles with cropping - Faster processing
|
| 296 |
-
- **No Crop**: 1024 base + 768 tiles without cropping - Single image processing
|
| 297 |
-
- **Small**: 768 base + 512 tiles without cropping - Fastest, lower quality
|
| 298 |
-
|
| 299 |
### Tasks
|
| 300 |
- **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
|
| 301 |
- **Free OCR**: Simple text extraction without layout
|
| 302 |
-
- **OCR Image**: OCR for general images with grounding (grounding ✅)
|
| 303 |
-
- **Parse Figure**: Parse figures and charts in documents
|
| 304 |
- **Locate**: Find and highlight specific text/elements in image (grounding ✅)
|
| 305 |
- **Describe**: General image description
|
| 306 |
- **Custom**: Your own prompt (add `<|grounding|>` for bounding boxes)
|
|
@@ -318,14 +299,14 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 318 |
task.change(toggle_prompt, [task], [prompt])
|
| 319 |
task.change(select_boxes, [task], [tabs])
|
| 320 |
|
| 321 |
-
def run(image, file_path,
|
| 322 |
if file_path:
|
| 323 |
-
return process_file(file_path,
|
| 324 |
if image is not None:
|
| 325 |
-
return process_image(image,
|
| 326 |
return "Error: Upload a file or image", "", "", None, []
|
| 327 |
|
| 328 |
-
submit_event = btn.click(run, [input_img, file_in,
|
| 329 |
[text_out, md_out, raw_out, img_out, gallery])
|
| 330 |
submit_event.then(select_boxes, [task], [tabs])
|
| 331 |
|
|
|
|
| 20 |
model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
|
| 21 |
model = model.eval().cuda()
|
| 22 |
|
| 23 |
+
# Single fixed settings dict; process_image assigns it to its local `config`
# for every request (there is no per-request mode selection in this version).
DEFAULT_CONFIG = {"base_size": 1024, "image_size": 768, "crop_mode": True}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
TASK_PROMPTS = {
|
| 26 |
"π Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
|
| 27 |
"π Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
|
|
|
|
|
|
|
| 28 |
"π Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
|
| 29 |
"π Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
|
| 30 |
"βοΈ Custom": {"prompt": "", "has_grounding": False}
|
|
|
|
| 106 |
return markdown
|
| 107 |
|
| 108 |
@spaces.GPU(duration=90)
|
| 109 |
+
def process_image(image, task, custom_prompt):
|
| 110 |
if image is None:
|
| 111 |
return "Error: Upload an image", "", "", None, []
|
| 112 |
if task in ["βοΈ Custom", "π Locate"] and not custom_prompt.strip():
|
|
|
|
| 116 |
image = image.convert('RGB')
|
| 117 |
image = ImageOps.exif_transpose(image)
|
| 118 |
|
| 119 |
+
config = DEFAULT_CONFIG
|
| 120 |
|
| 121 |
if task == "βοΈ Custom":
|
| 122 |
prompt = f"<image>\n{custom_prompt.strip()}"
|
|
|
|
| 173 |
return cleaned, markdown, result, img_out, crops
|
| 174 |
|
| 175 |
@spaces.GPU(duration=90)
|
| 176 |
+
def process_pdf(path, task, custom_prompt, page_num):
|
| 177 |
doc = fitz.open(path)
|
| 178 |
total_pages = len(doc)
|
| 179 |
if page_num < 1 or page_num > total_pages:
|
|
|
|
| 184 |
img = Image.open(BytesIO(pix.tobytes("png")))
|
| 185 |
doc.close()
|
| 186 |
|
| 187 |
+
return process_image(img, task, custom_prompt)
|
| 188 |
|
| 189 |
+
def process_file(path, task, custom_prompt, page_num):
    """Dispatch an uploaded file to the PDF or the image pipeline.

    Args:
        path: Filesystem path of the upload; falsy when nothing was uploaded.
        task: Task name, forwarded unchanged.
        custom_prompt: Prompt text, forwarded unchanged.
        page_num: Page number; only forwarded when ``path`` ends in ``.pdf``.

    Returns:
        The result of ``process_pdf`` / ``process_image``, or a 5-item error
        tuple ``(message, "", "", None, [])`` when ``path`` is falsy.
    """
    if not path:
        return "Error: Upload a file", "", "", None, []
    if path.lower().endswith('.pdf'):
        return process_pdf(path, task, custom_prompt, page_num)
    # Image.open keeps the underlying file open until the pixel data is read
    # (and for multi-frame formats, until closed); the context manager
    # guarantees the handle is released once processing has finished.
    with Image.open(path) as img:
        return process_image(img, task, custom_prompt)
|
| 196 |
|
| 197 |
def toggle_prompt(task):
|
| 198 |
if task == "βοΈ Custom":
|
|
|
|
| 241 |
gr.Markdown("""
|
| 242 |
# π DeepSeek-OCR-2 Demo
|
| 243 |
**Convert documents to markdown, extract text, parse figures, and locate specific content with bounding boxes.**
|
|
|
|
| 244 |
**It's powered by DeepEncoder v2. It achieves 91.09% on OmniDocBench (+3.73% over v1).**
|
| 245 |
|
| 246 |
**Hope this tool was helpful! If so, a quick like ❤️ would mean a lot :)**
|
|
|
|
| 251 |
file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
|
| 252 |
input_img = gr.Image(label="Input Image", type="pil", height=300)
|
| 253 |
page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
|
|
|
|
| 254 |
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="π Markdown", label="Task")
|
| 255 |
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 256 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
|
|
|
| 270 |
|
| 271 |
gr.Examples(
|
| 272 |
examples=[
|
| 273 |
+
["examples/ocr.jpg", "π Markdown", ""],
|
| 274 |
+
["examples/reachy-mini.jpg", "π Locate", "Robot"]
|
| 275 |
],
|
| 276 |
+
inputs=[input_img, task, prompt],
|
| 277 |
cache_examples=False
|
| 278 |
)
|
| 279 |
|
| 280 |
with gr.Accordion("βΉοΈ Info", open=False):
|
| 281 |
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
### Tasks
|
| 283 |
- **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
|
| 284 |
- **Free OCR**: Simple text extraction without layout
|
|
|
|
|
|
|
| 285 |
- **Locate**: Find and highlight specific text/elements in image (grounding ✅)
|
| 286 |
- **Describe**: General image description
|
| 287 |
- **Custom**: Your own prompt (add `<|grounding|>` for bounding boxes)
|
|
|
|
| 299 |
task.change(toggle_prompt, [task], [prompt])
|
| 300 |
task.change(select_boxes, [task], [tabs])
|
| 301 |
|
| 302 |
+
def run(image, file_path, task, custom_prompt, page_num):
    """Button handler: process the uploaded file if present, else the image.

    Args:
        image: Image from the image input, or ``None``.
        file_path: Path from the file input; takes priority when truthy.
        task: Selected task name, forwarded unchanged.
        custom_prompt: Prompt text, forwarded unchanged.
        page_num: Page selector value; may be ``None`` (e.g. an emptied
            number field), in which case page 1 is used.

    Returns:
        The result of ``process_file`` / ``process_image``, or a 5-item error
        tuple ``(message, "", "", None, [])`` when neither input is provided.
    """
    if file_path:
        # int(None) raises TypeError; fall back to the selector's default
        # value of 1 instead of crashing the request.
        page = 1 if page_num is None else int(page_num)
        return process_file(file_path, task, custom_prompt, page)
    if image is not None:
        return process_image(image, task, custom_prompt)
    return "Error: Upload a file or image", "", "", None, []
|
| 308 |
|
| 309 |
+
submit_event = btn.click(run, [input_img, file_in, task, prompt, page_selector],
|
| 310 |
[text_out, md_out, raw_out, img_out, gallery])
|
| 311 |
submit_event.then(select_boxes, [task], [tabs])
|
| 312 |
|