Upload modeling_got.py
modeling_got.py  +149 -108  CHANGED
@@ -408,46 +408,71 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
         setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
 
-            qs = str(bbox) + ' ' + qs
-
-        # Process OCR color if provided
-        if ocr_color:
-            qs = '[' + ocr_color + '] ' + qs
-
-        # Image token embedding
-        if use_im_start_end:
-            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len + DEFAULT_IM_END_TOKEN + '\n' + qs
-        else:
-            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+    # Move the generate_output function outside the chat method
+    def generate_output(input_ids, image_tensor, model, tokenizer, stopping_criteria, stream_flag):
+        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+        if stream_flag:
+            with torch.autocast("cpu", dtype=torch.bfloat16):
+                output_ids = model.generate(
+                    input_ids,
+                    images=[image_tensor.unsqueeze(0).half().cpu()],
+                    do_sample=False,
+                    num_beams=1,
+                    no_repeat_ngram_size=20,
+                    streamer=streamer,
+                    max_new_tokens=4096,
+                    stopping_criteria=[stopping_criteria]
+                )
+        else:
+            with torch.autocast("cpu", dtype=torch.bfloat16):
+                output_ids = model.generate(
+                    input_ids,
+                    images=[image_tensor.unsqueeze(0).half().cpu()],
+                    do_sample=False,
+                    num_beams=1,
+                    no_repeat_ngram_size=20,
+                    max_new_tokens=4096,
+                    stopping_criteria=[stopping_criteria]
+                )
+        return output_ids
+
+    # The chat method optimized for CPU performance with multiprocessing
+    def chat(self, tokenizer, image_file, ocr_type, ocr_box='', ocr_color='', render=False, save_render_file=None,
+             print_prompt=False, gradio_input=False, stream_flag=False, num_workers=1):
+
+        self.disable_torch_init()
+
+        image_processor_high = GOTImageEvalProcessor(image_size=1024)
+        image_token_len = 256
+
+        if gradio_input:
+            image = image_file.copy()
+        else:
+            image = self.load_image(image_file)
+
+        w, h = image.size
+        qs = 'OCR with format: ' if ocr_type == 'format' else 'OCR: '
+
+        if ocr_box:
+            bbox = eval(ocr_box)
+            if len(bbox) == 2:
+                bbox[0] = int(bbox[0]/w*1000)
+                bbox[1] = int(bbox[1]/h*1000)
+            if len(bbox) == 4:
+                bbox[0] = int(bbox[0]/w*1000)
+                bbox[1] = int(bbox[1]/h*1000)
+                bbox[2] = int(bbox[2]/w*1000)
+                bbox[3] = int(bbox[3]/h*1000)
+            qs = str(bbox) + ' ' + qs
+
+        if ocr_color:
+            qs = f"[{ocr_color}] " + qs
+
+        qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len + DEFAULT_IM_END_TOKEN + '\n' + qs
+
+        # Setup conversation prompt
+        conv_mpt = Conversation(
             system="""<|im_start|>system
 You should follow the instructions carefully and explain your answers in detail.""",
             roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
@@ -456,90 +481,106 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             offset=0,
             sep_style=SeparatorStyle.MPT,
             sep="<|im_end|>",
-        # Tokenize input
-        inputs = tokenizer([prompt])
-        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-                num_beams=1,
-                no_repeat_ngram_size=20,
-                streamer=streamer if stream_flag else None,
-                max_new_tokens=4096,
-                stopping_criteria=[stopping_criteria]
-            )
-        #
-            if '**kern' in outputs:
-                import verovio
-                tk = verovio.toolkit()
-                tk.loadData(outputs)
-                tk.setOptions({
-                    "pageWidth": 2100,
-                    "footer": 'none',
-                    'barLineWidth': 0.5,
-                    'beamMaxSlope': 15,
-                    'staffLineWidth': 0.2,
-                    'spacingStaff': 6
-                })
-                svg = tk.renderToSVG()
-                svg = svg.replace("overflow=\"inherit\"", "overflow=\"visible\"")
-                svg_to_html(svg, save_render_file)
+        )
+
+        conv = conv_mpt.copy()
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        if print_prompt:
+            print(prompt)
+
+        inputs = tokenizer([prompt])
+        image_tensor_1 = image_processor_high(image)
+        input_ids = torch.as_tensor(inputs.input_ids).cpu()
+
+        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+        keywords = [stop_str]
+        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+
+        # Multiprocessing setup
+        with Pool(num_workers) as pool:
+            results = pool.starmap(
+                generate_output,
+                [(input_ids, image_tensor_1, self, tokenizer, stopping_criteria, stream_flag)] * num_workers
+            )
+
+        output_ids = results[0]  # Take the first result (or aggregate depending on task)
+        outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
+
+        if outputs.endswith(stop_str):
+            outputs = outputs[:-len(stop_str)]
+        outputs = outputs.strip()
+
+        response_str = outputs
+
+        # Rendering logic
+        if render:
+            print('==============rendering===============')
+            from .render_tools import svg_to_html, content_mmd_to_html, tik_html, translation_table
+
+            if '**kern' in outputs:
+                import verovio
+                tk = verovio.toolkit()
+                tk.loadData(outputs)
+                tk.setOptions({
+                    "pageWidth": 2100, "footer": 'none',
+                    'barLineWidth': 0.5, 'beamMaxSlope': 15,
+                    'staffLineWidth': 0.2, 'spacingStaff': 6
+                })
+                tk.getPageCount()
+                svg = tk.renderToSVG()
+                svg = svg.replace("overflow=\"inherit\"", "overflow=\"visible\"")
+                svg_to_html(svg, save_render_file)
+
+            if ocr_type == 'format' and '**kern' not in outputs:
+                if '\\begin{tikzpicture}' not in outputs:
+                    html_path_2 = save_render_file
+                    right_num = outputs.count('\\right')
+                    left_num = outputs.count('\\left')
+
+                    if right_num != left_num:
+                        outputs = outputs.replace('\left(', '(').replace('\\right)', ')').replace('\left[', '[').replace('\\right]', ']').replace('\left{', '{').replace('\\right}', '}').replace('\left|', '|').replace('\\right|', '|').replace('\left.', '.').replace('\\right.', '.')
+
+                    outputs = outputs.replace('"', '``').replace('$', '')
+                    outputs_list = outputs.split('\n')
+                    gt = ''
+                    for out in outputs_list:
+                        gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
+                    gt = gt[:-2]
+
+                    lines = content_mmd_to_html
+                    lines = lines.split("const text =")
+                    new_web = lines[0] + 'const text =' + gt + lines[1]
                 else:
-                # If 'format' OCR is being used without '**kern'
                     html_path_2 = save_render_file
-                outputs = outputs.translate(translation_table)
+                    outputs = outputs.translate(translation_table)
+                    outputs_list = outputs.split('\n')
+                    gt = ''
+                    for out in outputs_list:
+                        if out:
+                            if '\\begin{tikzpicture}' not in out and '\\end{tikzpicture}' not in out:
+                                while out[-1] == ' ':
+                                    out = out[:-1]
+                                    if out is None:
+                                        break
+                                if out:
+                                    if out[-1] != ';':
+                                        gt += out[:-1] + ';\n'
+                                    else:
+                                        gt += out + '\n'
+                            else:
+                                gt += out + '\n'
+
+                    lines = tik_html
                     lines = lines.split("const text =")
+                    new_web = lines[0] + gt + lines[1]
 
-            web_f_new.write(new_web)
+                with open(html_path_2, 'w') as web_f_new:
+                    web_f_new.write(new_web)
 
+        return response_str
 
     def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
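Below, for orientation only, is a minimal usage sketch of the updated chat() entry point. The checkpoint id, image path, and loading flags are assumptions following the standard GOT-OCR trust_remote_code recipe; only chat()'s signature comes from this commit.

# Usage sketch -- 'ucaslcl/GOT-OCR2_0' and 'page.jpg' are illustrative
# placeholders, not part of this commit.
from transformers import AutoModel, AutoTokenizer

model_id = 'ucaslcl/GOT-OCR2_0'  # assumed checkpoint hosting this modeling_got.py
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_id, trust_remote_code=True, low_cpu_mem_usage=True,
    use_safetensors=True, pad_token_id=tokenizer.eos_token_id
).eval()  # CPU inference; generate_output autocasts to bfloat16 internally

# Plain OCR; num_workers is the new multiprocessing knob added here.
text = model.chat(tokenizer, 'page.jpg', ocr_type='ocr', num_workers=1)

# Formatted OCR, rendering the result to an HTML file.
text = model.chat(tokenizer, 'page.jpg', ocr_type='format',
                  render=True, save_render_file='./demo.html')
print(text)

Since generation runs with do_sample=False, the Pool dispatches num_workers identical deterministic jobs and only results[0] is consumed, so num_workers=1 is the economical default unless generate_output is changed to shard or aggregate work.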