Add sanity check script and example screenshot with LFS tracking

Browse files

Files changed (5) hide show

.gitattributes +1 -0
README.md +81 -41
requirements.txt +4 -1
sanity.py +77 -40
screenshot.png +3 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -186,14 +186,19 @@ pip install -r requirements.txt
 Note that this is only a sanity test for ensuring the model is working properly.
 For replicating the evaluation result or using the model for your own project, please refer to our code repository on [GitHub](?????(repository)).
-The following code snippet is also available in the attached sanity.py
 ```{python}
 import base64
 import torch
-from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
 from PIL import Image
 def encode_image(image_path: str) -> str:
     """Encode image to base64 string for model input."""
     with open(image_path, "rb") as f:
@@ -202,15 +207,13 @@ def encode_image(image_path: str) -> str:
 def load_model(
     model_path: str,
-) -> tuple[AutoModel, AutoTokenizer, AutoImageProcessor]:
     """Load OpenCUA model, tokenizer, and image processor."""
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    model = AutoModel.from_pretrained(
         model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
     )
-    image_processor = AutoImageProcessor.from_pretrained(
-        model_path, trust_remote_code=True
-    )
     return model, tokenizer, image_processor
@@ -227,11 +230,15 @@ def create_grounding_messages(image_path: str, instruction: str) -> list[dict]:
         {
             "role": "user",
             "content": [
                 {
                     "type": "image",
                     "image": f"data:image/png;base64,{encode_image(image_path)}",
                 },
-                {"type": "text", "text": instruction},
             ],
         },
     ]
@@ -239,60 +246,93 @@ def create_grounding_messages(image_path: str, instruction: str) -> list[dict]:
 def run_inference(
-    model: AutoModel,
     tokenizer: AutoTokenizer,
-    image_processor: AutoImageProcessor,
     messages: list[dict],
     image_path: str,
 ) -> str:
     """Run inference on the model."""
-    # Prepare text input
-    input_ids = tokenizer.apply_chat_template(
-        messages, tokenize=True, add_generation_prompt=True
     )
-    input_ids = torch.tensor([input_ids]).to(model.device)
-    # Prepare image input
     image = Image.open(image_path).convert("RGB")
-    image_info = image_processor.preprocess(images=[image])
-    pixel_values = torch.tensor(image_info["pixel_values"]).to(
-        dtype=torch.bfloat16, device=model.device
     )
-    grid_thws = torch.tensor(image_info["image_grid_thw"])
     # Generate response
     with torch.no_grad():
         generated_ids = model.generate(
-            input_ids,
-            pixel_values=pixel_values,
-            grid_thws=grid_thws,
             max_new_tokens=2048,
-            temperature=0,
         )
-    # Decode output
-    prompt_len = input_ids.shape[1]
-    generated_ids = generated_ids[:, prompt_len:]
-    output_text = tokenizer.batch_decode(
-        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]
     return output_text
-# Example usage
-model_path = "Uniphore/actio-ui-7b-sft"  # or other model variants
-image_path = "screenshot.png"
-instruction = "Click on the submit button"
-# Load model
-model, tokenizer, image_processor = load_model(model_path)
-# Create messages and run inference
-messages = create_grounding_messages(image_path, instruction)
-result = run_inference(model, tokenizer, image_processor, messages, image_path)
-print("Model output:", result)
 ```

 Note that this is only a sanity test for ensuring the model is working properly.
 For replicating the evaluation result or using the model for your own project, please refer to our code repository on [GitHub](?????(repository)).
+The following code snippet is also available in the attached `sanity.py`, so you can run it directly with:
+```{bash}
+python sanity.py
+```
 ```{python}
 import base64
+import sys
 import torch
+from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
 from PIL import Image
 def encode_image(image_path: str) -> str:
     """Encode image to base64 string for model input."""
     with open(image_path, "rb") as f:
 def load_model(
     model_path: str,
+) -> tuple[AutoModelForVision2Seq, AutoTokenizer, AutoProcessor]:
     """Load OpenCUA model, tokenizer, and image processor."""
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    model = AutoModelForVision2Seq.from_pretrained(
         model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
     )
+    image_processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
     return model, tokenizer, image_processor
         {
             "role": "user",
             "content": [
+                {
+                    "type": "text",
+                    "text": "Please perform the following task by providing the action and the coordinates: "
+                    + instruction,
+                },
                 {
                     "type": "image",
                     "image": f"data:image/png;base64,{encode_image(image_path)}",
                 },
             ],
         },
     ]
 def run_inference(
+    model: AutoModelForVision2Seq,
     tokenizer: AutoTokenizer,
+    image_processor: AutoProcessor,
     messages: list[dict],
     image_path: str,
 ) -> str:
     """Run inference on the model."""
+    # Prepare text from messages
+    text = image_processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
     )
+    # Open image
     image = Image.open(image_path).convert("RGB")
+    # Process inputs using the processor
+    inputs = image_processor(
+        text=[text], images=[image], padding=True, return_tensors="pt"
     )
+    # Move inputs to model device
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
     # Generate response
     with torch.no_grad():
         generated_ids = model.generate(
+            **inputs,
             max_new_tokens=2048,
+            do_sample=False,
         )
+    # Decode output (skip the input tokens)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :]
+        for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
+    ]
+    output_text = image_processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
     )[0]
     return output_text
+def main():
+    """Main function to run the sanity check."""
+    # Configuration
+    model_path = "Uniphore/actio-ui-7b-sft"  # or other model variants
+    image_path = "screenshot.png"
+    instruction = "Click on the submit button"
+    # Check if custom instruction provided
+    if len(sys.argv) > 1:
+        instruction = " ".join(sys.argv[1:])
+    print(f"Loading model from: {model_path}")
+    try:
+        model, tokenizer, image_processor = load_model(model_path)
+        print("✓ Model loaded successfully")
+    except Exception as e:
+        print(f"✗ Error loading model: {e}")
+        return 1
+    print(f"Processing image: {image_path}")
+    print(f"Instruction: {instruction}")
+    try:
+        messages = create_grounding_messages(image_path, instruction)
+        result = run_inference(model, tokenizer, image_processor, messages, image_path)
+        print("\n" + "=" * 60)
+        print("MODEL OUTPUT:")
+        print("=" * 60)
+        print(result)
+        print("=" * 60)
+        return 0
+    except Exception as e:
+        print(f"✗ Error during inference: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+if __name__ == "__main__":
+    sys.exit(main())
 ```

requirements.txt CHANGED Viewed

@@ -1,4 +1,7 @@
 Pillow
 torch
 transformers
-vllm

+--extra-index-url https://download.pytorch.org/whl/cu121
+accelerate
 Pillow
 torch
+torchvision
 transformers

sanity.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import base64
 import torch
-from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
 from PIL import Image
 def encode_image(image_path: str) -> str:
     """Encode image to base64 string for model input."""
     with open(image_path, "rb") as f:
@@ -11,15 +13,13 @@ def encode_image(image_path: str) -> str:
 def load_model(
     model_path: str,
-) -> tuple[AutoModel, AutoTokenizer, AutoImageProcessor]:
     """Load OpenCUA model, tokenizer, and image processor."""
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    model = AutoModel.from_pretrained(
         model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
     )
-    image_processor = AutoImageProcessor.from_pretrained(
-        model_path, trust_remote_code=True
-    )
     return model, tokenizer, image_processor
@@ -36,11 +36,15 @@ def create_grounding_messages(image_path: str, instruction: str) -> list[dict]:
         {
             "role": "user",
             "content": [
                 {
                     "type": "image",
                     "image": f"data:image/png;base64,{encode_image(image_path)}",
                 },
-                {"type": "text", "text": instruction},
             ],
         },
     ]
@@ -48,57 +52,90 @@ def create_grounding_messages(image_path: str, instruction: str) -> list[dict]:
 def run_inference(
-    model: AutoModel,
     tokenizer: AutoTokenizer,
-    image_processor: AutoImageProcessor,
     messages: list[dict],
     image_path: str,
 ) -> str:
     """Run inference on the model."""
-    # Prepare text input
-    input_ids = tokenizer.apply_chat_template(
-        messages, tokenize=True, add_generation_prompt=True
     )
-    input_ids = torch.tensor([input_ids]).to(model.device)
-    # Prepare image input
     image = Image.open(image_path).convert("RGB")
-    image_info = image_processor.preprocess(images=[image])
-    pixel_values = torch.tensor(image_info["pixel_values"]).to(
-        dtype=torch.bfloat16, device=model.device
     )
-    grid_thws = torch.tensor(image_info["image_grid_thw"])
     # Generate response
     with torch.no_grad():
         generated_ids = model.generate(
-            input_ids,
-            pixel_values=pixel_values,
-            grid_thws=grid_thws,
             max_new_tokens=2048,
-            temperature=0,
         )
-    # Decode output
-    prompt_len = input_ids.shape[1]
-    generated_ids = generated_ids[:, prompt_len:]
-    output_text = tokenizer.batch_decode(
-        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]
     return output_text
-# Example usage
-model_path = "Uniphore/actio-ui-7b-sft"  # or other model variants
-image_path = "screenshot.png"
-instruction = "Click on the submit button"
-# Load model
-model, tokenizer, image_processor = load_model(model_path)
-# Create messages and run inference
-messages = create_grounding_messages(image_path, instruction)
-result = run_inference(model, tokenizer, image_processor, messages, image_path)
-print("Model output:", result)

 import base64
+import sys
 import torch
+from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
 from PIL import Image
 def encode_image(image_path: str) -> str:
     """Encode image to base64 string for model input."""
     with open(image_path, "rb") as f:
 def load_model(
     model_path: str,
+) -> tuple[AutoModelForVision2Seq, AutoTokenizer, AutoProcessor]:
     """Load OpenCUA model, tokenizer, and image processor."""
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    model = AutoModelForVision2Seq.from_pretrained(
         model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
     )
+    image_processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
     return model, tokenizer, image_processor
         {
             "role": "user",
             "content": [
+                {
+                    "type": "text",
+                    "text": "Please perform the following task by providing the action and the coordinates: "
+                    + instruction,
+                },
                 {
                     "type": "image",
                     "image": f"data:image/png;base64,{encode_image(image_path)}",
                 },
             ],
         },
     ]
 def run_inference(
+    model: AutoModelForVision2Seq,
     tokenizer: AutoTokenizer,
+    image_processor: AutoProcessor,
     messages: list[dict],
     image_path: str,
 ) -> str:
     """Run inference on the model."""
+    # Prepare text from messages
+    text = image_processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
     )
+    # Open image
     image = Image.open(image_path).convert("RGB")
+    # Process inputs using the processor
+    inputs = image_processor(
+        text=[text], images=[image], padding=True, return_tensors="pt"
     )
+    # Move inputs to model device
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
     # Generate response
     with torch.no_grad():
         generated_ids = model.generate(
+            **inputs,
             max_new_tokens=2048,
+            do_sample=False,
         )
+    # Decode output (skip the input tokens)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :]
+        for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
+    ]
+    output_text = image_processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
     )[0]
     return output_text
+def main():
+    """Main function to run the sanity check."""
+    # Configuration
+    model_path = "Uniphore/actio-ui-7b-sft"  # or other model variants
+    image_path = "screenshot.png"
+    instruction = "Click on the submit button"
+    # Check if custom instruction provided
+    if len(sys.argv) > 1:
+        instruction = " ".join(sys.argv[1:])
+    print(f"Loading model from: {model_path}")
+    try:
+        model, tokenizer, image_processor = load_model(model_path)
+        print("✓ Model loaded successfully")
+    except Exception as e:
+        print(f"✗ Error loading model: {e}")
+        return 1
+    print(f"Processing image: {image_path}")
+    print(f"Instruction: {instruction}")
+    try:
+        messages = create_grounding_messages(image_path, instruction)
+        result = run_inference(model, tokenizer, image_processor, messages, image_path)
+        print("\n" + "=" * 60)
+        print("MODEL OUTPUT:")
+        print("=" * 60)
+        print(result)
+        print("=" * 60)
+        return 0
+    except Exception as e:
+        print(f"✗ Error during inference: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+if __name__ == "__main__":
+    sys.exit(main())

screenshot.png ADDED Viewed

Git LFS Details

SHA256: a88befff865f5276d82d4b05c47a96117e35c06979a49f36c67b0f3f1e0f751a
Pointer size: 131 Bytes
Size of remote file: 475 kB