merterbak committed
Commit a92bce3 · verified · Parent: d0ad91a

Upload folder using huggingface_hub

Files changed (1):
app.py +15 -34
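The commit message above refers to huggingface_hub's folder-upload API. A minimal sketch of how such an upload is typically issued; the repo id below is a placeholder, not taken from this commit, and the token is assumed to come from huggingface-cli login or the HF_TOKEN environment variable:

from huggingface_hub import HfApi

# Sketch only (not code from this repo): push a local folder to a Space.
api = HfApi()  # picks up the token from huggingface-cli login or HF_TOKEN
api.upload_folder(
    folder_path=".",                      # local folder containing app.py
    repo_id="<username>/<space-name>",    # placeholder Space id
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)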
app.py CHANGED
@@ -20,19 +20,11 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
 model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
 model = model.eval().cuda()
 
-MODEL_CONFIGS = {
-    "Default": {"base_size": 1024, "image_size": 768, "crop_mode": True},
-    "Quality": {"base_size": 1280, "image_size": 960, "crop_mode": True},
-    "Fast": {"base_size": 1024, "image_size": 640, "crop_mode": True},
-    "No Crop": {"base_size": 1024, "image_size": 768, "crop_mode": False},
-    "Small": {"base_size": 768, "image_size": 512, "crop_mode": False},
-}
+DEFAULT_CONFIG = {"base_size": 1024, "image_size": 768, "crop_mode": True}
 
 TASK_PROMPTS = {
     "📋 Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
     "📝 Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
-    "🖼️ OCR Image": {"prompt": "<image>\n<|grounding|>OCR this image.", "has_grounding": True},
-    "📊 Parse Figure": {"prompt": "<image>\nParse the figure.", "has_grounding": False},
     "📍 Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
     "🔍 Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
     "✏️ Custom": {"prompt": "", "has_grounding": False}
@@ -114,7 +106,7 @@ def embed_images(markdown, crops):
     return markdown
 
 @spaces.GPU(duration=90)
-def process_image(image, mode, task, custom_prompt):
+def process_image(image, task, custom_prompt):
     if image is None:
         return "Error: Upload an image", "", "", None, []
     if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
@@ -124,7 +116,7 @@ def process_image(image, mode, task, custom_prompt):
     image = image.convert('RGB')
     image = ImageOps.exif_transpose(image)
 
-    config = MODEL_CONFIGS[mode]
+    config = DEFAULT_CONFIG
 
     if task == "✏️ Custom":
         prompt = f"<image>\n{custom_prompt.strip()}"
@@ -181,7 +173,7 @@ def process_image(image, mode, task, custom_prompt):
     return cleaned, markdown, result, img_out, crops
 
 @spaces.GPU(duration=90)
-def process_pdf(path, mode, task, custom_prompt, page_num):
+def process_pdf(path, task, custom_prompt, page_num):
     doc = fitz.open(path)
     total_pages = len(doc)
     if page_num < 1 or page_num > total_pages:
@@ -192,15 +184,15 @@ def process_pdf(path, mode, task, custom_prompt, page_num):
     img = Image.open(BytesIO(pix.tobytes("png")))
     doc.close()
 
-    return process_image(img, mode, task, custom_prompt)
+    return process_image(img, task, custom_prompt)
 
-def process_file(path, mode, task, custom_prompt, page_num):
+def process_file(path, task, custom_prompt, page_num):
     if not path:
         return "Error: Upload a file", "", "", None, []
     if path.lower().endswith('.pdf'):
-        return process_pdf(path, mode, task, custom_prompt, page_num)
+        return process_pdf(path, task, custom_prompt, page_num)
     else:
-        return process_image(Image.open(path), mode, task, custom_prompt)
+        return process_image(Image.open(path), task, custom_prompt)
 
 def toggle_prompt(task):
     if task == "✏️ Custom":
@@ -249,7 +241,6 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
     gr.Markdown("""
     # 🚀 DeepSeek-OCR-2 Demo
     **Convert documents to markdown, extract text, parse figures, and locate specific content with bounding boxes.**
-
     **It's powered by DeepEncoder v2. It achieves 91.09% on OmniDocBench (+3.73% over v1).**
 
     **Hope this tool was helpful! If so, a quick like ❤️ would mean a lot :)**
@@ -260,7 +251,6 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
             file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
             input_img = gr.Image(label="Input Image", type="pil", height=300)
             page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
-            mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Default", label="Mode")
             task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
             prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
             btn = gr.Button("Extract", variant="primary", size="lg")
@@ -280,27 +270,18 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
 
     gr.Examples(
         examples=[
-            ["examples/ocr.jpg", "Default", "📋 Markdown", ""],
-            ["examples/reachy-mini.jpg", "Default", "📍 Locate", "Robot"]
+            ["examples/ocr.jpg", "📋 Markdown", ""],
+            ["examples/reachy-mini.jpg", "📍 Locate", "Robot"]
         ],
-        inputs=[input_img, mode, task, prompt],
+        inputs=[input_img, task, prompt],
         cache_examples=False
     )
 
     with gr.Accordion("ℹ️ Info", open=False):
        gr.Markdown("""
-        ### Modes
-        - **Default**: 1024 base + 768 tiles with cropping - Recommended for most use cases
-        - **Quality**: 1280 base + 960 tiles with cropping - Higher quality, slower
-        - **Fast**: 1024 base + 640 tiles with cropping - Faster processing
-        - **No Crop**: 1024 base + 768 tiles without cropping - Single image processing
-        - **Small**: 768 base + 512 tiles without cropping - Fastest, lower quality
-
        ### Tasks
        - **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
        - **Free OCR**: Simple text extraction without layout
-        - **OCR Image**: OCR for general images with grounding (grounding ✅)
-        - **Parse Figure**: Parse figures and charts in documents
        - **Locate**: Find and highlight specific text/elements in image (grounding ✅)
        - **Describe**: General image description
        - **Custom**: Your own prompt (add `<|grounding|>` for bounding boxes)
@@ -318,14 +299,14 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
     task.change(toggle_prompt, [task], [prompt])
     task.change(select_boxes, [task], [tabs])
 
-    def run(image, file_path, mode, task, custom_prompt, page_num):
+    def run(image, file_path, task, custom_prompt, page_num):
         if file_path:
-            return process_file(file_path, mode, task, custom_prompt, int(page_num))
+            return process_file(file_path, task, custom_prompt, int(page_num))
         if image is not None:
-            return process_image(image, mode, task, custom_prompt)
+            return process_image(image, task, custom_prompt)
         return "Error: Upload a file or image", "", "", None, []
 
-    submit_event = btn.click(run, [input_img, file_in, mode, task, prompt, page_selector],
+    submit_event = btn.click(run, [input_img, file_in, task, prompt, page_selector],
                              [text_out, md_out, raw_out, img_out, gallery])
     submit_event.then(select_boxes, [task], [tabs])
 
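For context on the process_pdf path shown in the diff (fitz.open, pix.tobytes("png"), Image.open): below is a standalone sketch of rendering one PDF page to a PIL image with PyMuPDF. The helper name and the 2x zoom factor are assumptions for illustration, not values taken from app.py:

import fitz  # PyMuPDF
from io import BytesIO
from PIL import Image

def render_pdf_page(path, page_num):
    # Illustrative helper, not code from app.py: render a 1-indexed page to a PIL image.
    doc = fitz.open(path)
    try:
        page = doc[page_num - 1]                          # PyMuPDF pages are 0-indexed
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))   # 2x zoom; assumed rendering scale
        return Image.open(BytesIO(pix.tobytes("png")))    # same bytes-to-PIL step as in the diff
    finally:
        doc.close()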