chc012 committed
Commit 85ca5ad · 1 Parent(s): e1d539f

Add sanity check script and example screenshot with LFS tracking

Files changed (5):
  1. .gitattributes +1 -0
  2. README.md +81 -41
  3. requirements.txt +4 -1
  4. sanity.py +77 -40
  5. screenshot.png +3 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
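The new rule is the same line that `git lfs track` would write; a minimal sketch of the equivalent workflow, assuming Git LFS is installed locally:

```{bash}
# Register PNGs with LFS; this appends the same filter rule to .gitattributes
git lfs track "*.png"

# Stage the attributes file together with the newly tracked binary
git add .gitattributes screenshot.png

# Confirm screenshot.png is stored as an LFS pointer rather than a raw blob
git lfs ls-files
```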
README.md CHANGED
@@ -186,14 +186,19 @@ pip install -r requirements.txt
 Note that this is only a sanity test for ensuring the model is working properly.
 For replicating the evaluation results or using the model in your own project, please refer to our code repository on [GitHub](?????(repository)).
 
-The following code snippet is also available in the attached sanity.py
+The following code snippet is also available in the attached sanity.py, so you can simply run it with
+```{bash}
+python sanity.py
+```
 
 ```{python}
 import base64
+import sys
 import torch
-from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
+from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
 from PIL import Image
 
+
 def encode_image(image_path: str) -> str:
     """Encode image to base64 string for model input."""
     with open(image_path, "rb") as f:
@@ -202,15 +207,13 @@ def encode_image(image_path: str) -> str:
 
 def load_model(
     model_path: str,
-) -> tuple[AutoModel, AutoTokenizer, AutoImageProcessor]:
+) -> tuple[AutoModelForVision2Seq, AutoTokenizer, AutoProcessor]:
     """Load OpenCUA model, tokenizer, and image processor."""
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    model = AutoModel.from_pretrained(
+    model = AutoModelForVision2Seq.from_pretrained(
         model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
     )
-    image_processor = AutoImageProcessor.from_pretrained(
-        model_path, trust_remote_code=True
-    )
+    image_processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 
     return model, tokenizer, image_processor
 
@@ -227,11 +230,15 @@ def create_grounding_messages(image_path: str, instruction: str) -> list[dict]:
     {
         "role": "user",
         "content": [
+            {
+                "type": "text",
+                "text": "Please perform the following task by providing the action and the coordinates: "
+                + instruction,
+            },
             {
                 "type": "image",
                 "image": f"data:image/png;base64,{encode_image(image_path)}",
             },
-            {"type": "text", "text": instruction},
         ],
     },
 ]
@@ -239,60 +246,93 @@ def create_grounding_messages(image_path: str, instruction: str) -> list[dict]:
 
 
 def run_inference(
-    model: AutoModel,
+    model: AutoModelForVision2Seq,
     tokenizer: AutoTokenizer,
-    image_processor: AutoImageProcessor,
+    image_processor: AutoProcessor,
     messages: list[dict],
     image_path: str,
 ) -> str:
     """Run inference on the model."""
-    # Prepare text input
-    input_ids = tokenizer.apply_chat_template(
-        messages, tokenize=True, add_generation_prompt=True
+    # Prepare text from messages
+    text = image_processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
     )
-    input_ids = torch.tensor([input_ids]).to(model.device)
 
-    # Prepare image input
+    # Open image
     image = Image.open(image_path).convert("RGB")
-    image_info = image_processor.preprocess(images=[image])
-    pixel_values = torch.tensor(image_info["pixel_values"]).to(
-        dtype=torch.bfloat16, device=model.device
+
+    # Process inputs using the processor
+    inputs = image_processor(
+        text=[text], images=[image], padding=True, return_tensors="pt"
     )
-    grid_thws = torch.tensor(image_info["image_grid_thw"])
+
+    # Move inputs to model device
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
     # Generate response
     with torch.no_grad():
         generated_ids = model.generate(
-            input_ids,
-            pixel_values=pixel_values,
-            grid_thws=grid_thws,
+            **inputs,
             max_new_tokens=2048,
-            temperature=0,
+            do_sample=False,
         )
 
-    # Decode output
-    prompt_len = input_ids.shape[1]
-    generated_ids = generated_ids[:, prompt_len:]
-    output_text = tokenizer.batch_decode(
-        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    # Decode output (skip the input tokens)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :]
+        for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
+    ]
+    output_text = image_processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
     )[0]
 
     return output_text
 
 
-# Example usage
-model_path = "Uniphore/actio-ui-7b-sft"  # or other model variants
-image_path = "screenshot.png"
-instruction = "Click on the submit button"
-
-# Load model
-model, tokenizer, image_processor = load_model(model_path)
-
-# Create messages and run inference
-messages = create_grounding_messages(image_path, instruction)
-result = run_inference(model, tokenizer, image_processor, messages, image_path)
-
-print("Model output:", result)
+def main():
+    """Main function to run the sanity check."""
+    # Configuration
+    model_path = "Uniphore/actio-ui-7b-sft"  # or other model variants
+    image_path = "screenshot.png"
+    instruction = "Click on the submit button"
+
+    # Check if custom instruction provided
+    if len(sys.argv) > 1:
+        instruction = " ".join(sys.argv[1:])
+
+    print(f"Loading model from: {model_path}")
+    try:
+        model, tokenizer, image_processor = load_model(model_path)
+        print("✓ Model loaded successfully")
+    except Exception as e:
+        print(f"✗ Error loading model: {e}")
+        return 1
+
+    print(f"Processing image: {image_path}")
+    print(f"Instruction: {instruction}")
+
+    try:
+        messages = create_grounding_messages(image_path, instruction)
+        result = run_inference(model, tokenizer, image_processor, messages, image_path)
+
+        print("\n" + "=" * 60)
+        print("MODEL OUTPUT:")
+        print("=" * 60)
+        print(result)
+        print("=" * 60)
+        return 0
+    except Exception as e:
+        print(f"✗ Error during inference: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
 ```
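Since `main()` joins any extra command-line arguments into the instruction, the attached script can be retargeted without editing the file. A small usage sketch (the instruction text below is illustrative, not from the repository):

```{bash}
# Default instruction ("Click on the submit button")
python sanity.py

# Everything after the script name is joined into a custom instruction
python sanity.py "Click on the search icon in the toolbar"
```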
requirements.txt CHANGED
@@ -1,4 +1,7 @@
+--extra-index-url https://download.pytorch.org/whl/cu121
+
+accelerate
 Pillow
 torch
+torchvision
 transformers
-vllm
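Because the `--extra-index-url` line lives inside the requirements file itself, a plain `pip install` picks up the CUDA 12.1 wheels without extra flags; a minimal install sketch (the virtual-environment name is arbitrary):

```{bash}
# Isolated environment (name is illustrative)
python -m venv .venv
source .venv/bin/activate

# pip reads the --extra-index-url line from the file, so no extra flags are needed
pip install -r requirements.txt
```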
 
sanity.py CHANGED
@@ -1,8 +1,10 @@
 import base64
+import sys
 import torch
-from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
+from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
 from PIL import Image
 
+
 def encode_image(image_path: str) -> str:
     """Encode image to base64 string for model input."""
     with open(image_path, "rb") as f:
@@ -11,15 +13,13 @@ def encode_image(image_path: str) -> str:
 
 def load_model(
     model_path: str,
-) -> tuple[AutoModel, AutoTokenizer, AutoImageProcessor]:
+) -> tuple[AutoModelForVision2Seq, AutoTokenizer, AutoProcessor]:
     """Load OpenCUA model, tokenizer, and image processor."""
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    model = AutoModel.from_pretrained(
+    model = AutoModelForVision2Seq.from_pretrained(
         model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
     )
-    image_processor = AutoImageProcessor.from_pretrained(
-        model_path, trust_remote_code=True
-    )
+    image_processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 
     return model, tokenizer, image_processor
 
@@ -36,11 +36,15 @@ def create_grounding_messages(image_path: str, instruction: str) -> list[dict]:
     {
         "role": "user",
         "content": [
+            {
+                "type": "text",
+                "text": "Please perform the following task by providing the action and the coordinates: "
+                + instruction,
+            },
             {
                 "type": "image",
                 "image": f"data:image/png;base64,{encode_image(image_path)}",
             },
-            {"type": "text", "text": instruction},
         ],
     },
 ]
@@ -48,57 +52,90 @@ def create_grounding_messages(image_path: str, instruction: str) -> list[dict]:
 
 
 def run_inference(
-    model: AutoModel,
+    model: AutoModelForVision2Seq,
     tokenizer: AutoTokenizer,
-    image_processor: AutoImageProcessor,
+    image_processor: AutoProcessor,
     messages: list[dict],
     image_path: str,
 ) -> str:
     """Run inference on the model."""
-    # Prepare text input
-    input_ids = tokenizer.apply_chat_template(
-        messages, tokenize=True, add_generation_prompt=True
+    # Prepare text from messages
+    text = image_processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
    )
-    input_ids = torch.tensor([input_ids]).to(model.device)
 
-    # Prepare image input
+    # Open image
     image = Image.open(image_path).convert("RGB")
-    image_info = image_processor.preprocess(images=[image])
-    pixel_values = torch.tensor(image_info["pixel_values"]).to(
-        dtype=torch.bfloat16, device=model.device
+
+    # Process inputs using the processor
+    inputs = image_processor(
+        text=[text], images=[image], padding=True, return_tensors="pt"
     )
-    grid_thws = torch.tensor(image_info["image_grid_thw"])
+
+    # Move inputs to model device
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
     # Generate response
     with torch.no_grad():
         generated_ids = model.generate(
-            input_ids,
-            pixel_values=pixel_values,
-            grid_thws=grid_thws,
+            **inputs,
             max_new_tokens=2048,
-            temperature=0,
+            do_sample=False,
         )
 
-    # Decode output
-    prompt_len = input_ids.shape[1]
-    generated_ids = generated_ids[:, prompt_len:]
-    output_text = tokenizer.batch_decode(
-        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    # Decode output (skip the input tokens)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :]
+        for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
+    ]
+    output_text = image_processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
     )[0]
 
     return output_text
 
 
-# Example usage
-model_path = "Uniphore/actio-ui-7b-sft"  # or other model variants
-image_path = "screenshot.png"
-instruction = "Click on the submit button"
-
-# Load model
-model, tokenizer, image_processor = load_model(model_path)
-
-# Create messages and run inference
-messages = create_grounding_messages(image_path, instruction)
-result = run_inference(model, tokenizer, image_processor, messages, image_path)
-
-print("Model output:", result)
+def main():
+    """Main function to run the sanity check."""
+    # Configuration
+    model_path = "Uniphore/actio-ui-7b-sft"  # or other model variants
+    image_path = "screenshot.png"
+    instruction = "Click on the submit button"
+
+    # Check if custom instruction provided
+    if len(sys.argv) > 1:
+        instruction = " ".join(sys.argv[1:])
+
+    print(f"Loading model from: {model_path}")
+    try:
+        model, tokenizer, image_processor = load_model(model_path)
+        print("✓ Model loaded successfully")
+    except Exception as e:
+        print(f"✗ Error loading model: {e}")
+        return 1
+
+    print(f"Processing image: {image_path}")
+    print(f"Instruction: {instruction}")
+
+    try:
+        messages = create_grounding_messages(image_path, instruction)
+        result = run_inference(model, tokenizer, image_processor, messages, image_path)
+
+        print("\n" + "=" * 60)
+        print("MODEL OUTPUT:")
+        print("=" * 60)
+        print(result)
+        print("=" * 60)
+        return 0
+    except Exception as e:
+        print(f"✗ Error during inference: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
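`load_model` relies on `device_map="auto"` (hence the new accelerate dependency), so it is worth confirming that a GPU is visible before the 7B weights start loading; a quick check (if no CUDA device is found, accelerate should fall back to CPU, which will be slow):

```{bash}
# Confirm torch can see a CUDA device before running the full sanity check
python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"
```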
screenshot.png ADDED

Git LFS Details

  • SHA256: a88befff865f5276d82d4b05c47a96117e35c06979a49f36c67b0f3f1e0f751a
  • Pointer size: 131 Bytes
  • Size of remote file: 475 kB
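One caveat: `sanity.py` opens `screenshot.png` directly, so after cloning the repository the LFS object must be materialized, or PIL will fail on the 131-byte pointer file. A minimal sketch:

```{bash}
# Fetch and check out the actual LFS content (a no-op if already present)
git lfs pull

# The file should be ~475 kB, not a ~131-byte pointer
ls -l screenshot.png
```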