wli1995 committed on
Commit 7eab1cc · verified · 1 Parent(s): 564fa66

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -65,3 +65,4 @@ fastvlm_ax650_context_1k_prefill_640/llava_qwen2_p128_l9_together.axmodel filter
  fastvlm_ax650_context_1k_prefill_640/llava_qwen2_post.axmodel filter=lfs diff=lfs merge=lfs -text
  images/image_1.jpg filter=lfs diff=lfs merge=lfs -text
  images/ssd_horse.jpg filter=lfs diff=lfs merge=lfs -text
+ fastvlm_ax650_context_1k_prefill_640/image_encoder_512x512.axmodel filter=lfs diff=lfs merge=lfs -text
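The added `.gitattributes` line uses git-LFS's standard filter triple. As a minimal sketch (the helper `lfs_rule` is hypothetical, not part of this repo), the same rule can be generated programmatically:

```python
# Sketch: emit a git-LFS tracking rule like the one added in this commit.
# The path comes from the diff above; the attribute string is the standard
# git-lfs "filter/diff/merge/-text" triple.
def lfs_rule(path: str) -> str:
    return f"{path} filter=lfs diff=lfs merge=lfs -text"

rule = lfs_rule("fastvlm_ax650_context_1k_prefill_640/image_encoder_512x512.axmodel")
print(rule)
```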
README.md CHANGED
@@ -1,15 +1,3 @@
- ---
- license: apache-2.0
- language:
- - en
- base_model:
- - apple/FastVLM-1.5B
- pipeline_tag: image-to-text
- tags:
- - vlm
- - fastvlm
- - en
- ---
  # FastVLM-1.5B
  
  This version of FastVLM-1.5B has been converted to run on the Axera NPU using **w8a16** quantization.
@@ -37,7 +25,7 @@ How to Convert LLM from Huggingface to axmodel[TODO]
  
  |Chips|image encoder 1024|ttft(291tokens)|w8a16|
  |--|--|--|--|
- |AX650| 216.257 ms | 861.213 ms | 11.90 tokens/sec|
+ |AX650| 216.257 ms | 861.213 ms | 13.88 tokens/sec|
  
  
  ## How to use
@@ -86,7 +74,106 @@ Init InferenceSession: 4%|████
  [INFO] VNPU type: VNPUType.DISABLED
  [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
  Init InferenceSession: 7%|████████▏ | 2/28 [00:01<00:21, 1.20it/s][INFO] Using provider: AXCLRTExecutionProvider
- ...
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 11%|████████████▏ | 3/28 [00:02<00:19, 1.30it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 14%|████████████████▎ | 4/28 [00:03<00:17, 1.36it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 18%|████████████████████▎ | 5/28 [00:03<00:16, 1.40it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 21%|████████████████████████▍ | 6/28 [00:04<00:15, 1.42it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 25%|████████████████████████████▌ | 7/28 [00:05<00:14, 1.43it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 29%|████████████████████████████████▌ | 8/28 [00:05<00:13, 1.44it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 32%|████████████████████████████████████▋ | 9/28 [00:06<00:13, 1.44it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 36%|████████████████████████████████████████▎ | 10/28 [00:07<00:12, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 39%|████████████████████████████████████████████▍ | 11/28 [00:07<00:11, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 43%|████████████████████████████████████████████████▍ | 12/28 [00:08<00:11, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 46%|████████████████████████████████████████████████████▍ | 13/28 [00:09<00:10, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 50%|████████████████████████████████████████████████████████▌ | 14/28 [00:09<00:09, 1.46it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 54%|████████████████████████████████████████████████████████████▌ | 15/28 [00:10<00:08, 1.46it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 57%|████████████████████████████████████████████████████████████████▌ | 16/28 [00:11<00:08, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 61%|████████████████████████████████████████████████████████████████████▌ | 17/28 [00:12<00:07, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 64%|████████████████████████████████████████████████████████████████████████▋ | 18/28 [00:12<00:06, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 68%|████████████████████████████████████████████████████████████████████████████▋ | 19/28 [00:13<00:06, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 71%|████████████████████████████████████████████████████████████████████████████████▋ | 20/28 [00:14<00:05, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 75%|████████████████████████████████████████████████████████████████████████████████████▊ | 21/28 [00:14<00:04, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 79%|████████████████████████████████████████████████████████████████████████████████████████▊ | 22/28 [00:15<00:04, 1.46it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 82%|████████████████████████████████████████████████████████████████████████████████████████████▊ | 23/28 [00:16<00:03, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 86%|████████████████████████████████████████████████████████████████████████████████████████████████▊ | 24/28 [00:16<00:02, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 89%|████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 25/28 [00:17<00:02, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 26/28 [00:18<00:01, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
+ [INFO] SOC Name: AX650N
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
+ Init InferenceSession: 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 27/28 [00:18<00:00, 1.45it/s][INFO] Using provider: AXCLRTExecutionProvider
  [INFO] SOC Name: AX650N
  [INFO] VNPU type: VNPUType.DISABLED
  [INFO] Compiler version: 5.1-patch1-dirty 140e8d4a-dirty
@@ -128,4 +215,4 @@ prompt<<q
  [INFO]: 对话结束,再见。
  ```
  ![ssd_horse.jpg](./images/ssd_horse.jpg)
- ![iamge_1.jpg](./images/image_1.jpg)
+ ![iamge_1.jpg](./images/image_1.jpg)
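The updated README table reports 216.257 ms image encode, 861.213 ms TTFT for a 291-token prefill, and decode throughput raised from 11.90 to 13.88 tokens/sec. Under the assumption that decode runs at that steady-state rate after the first token, a rough end-to-end latency for an `n`-token answer works out as:

```python
# Figures taken from the README table in this commit.
TTFT_MS = 861.213    # time to first token, 291-token prefill
DECODE_TPS = 13.88   # w8a16 decode throughput on AX650 after this commit

def total_latency_ms(n_tokens: int) -> float:
    # One prefill, then (n_tokens - 1) further tokens at the decode rate.
    # This is a back-of-the-envelope estimate, not a measured number.
    return TTFT_MS + (n_tokens - 1) / DECODE_TPS * 1000.0

print(round(total_latency_ms(100), 1))  # ~8 seconds for a 100-token answer
```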
fastvlm_ax650_context_1k_prefill_640/image_encoder_512x512.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86a9657370520266913b4fcd7a9725216cdb2bae273bc082c13050ada4e9a69c
+ size 170758109
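The added `.axmodel` is stored as a git-LFS pointer; the three lines above are the entire checked-in blob. A minimal parser for this pointer format (spec v1 uses plain `key value` lines):

```python
def parse_lfs_pointer(text: str) -> dict:
    # git-LFS pointer files are "key value" lines; spec v1 carries
    # exactly version, oid, and size.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    fields["size"] = int(fields["size"])
    return fields

# The pointer contents of image_encoder_512x512.axmodel from this commit.
pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:86a9657370520266913b4fcd7a9725216cdb2bae273bc082c13050ada4e9a69c
size 170758109"""

info = parse_lfs_pointer(pointer)
print(info["size"])  # 170758109 (~171 MB of encoder weights)
```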
infer_axmodel.py CHANGED
@@ -24,10 +24,10 @@ def load_model_and_tokenizer(model_path):
  
      return config, tokenizer
  
- def vision_encoder(image_path, ax_session):
+ def vision_encoder(image_path, ax_session, args):
  
-     image_processor = CLIPImageProcessor(size={"shortest_edge": 1024}, # CLIP 支持 336x336
-                                          crop_size={"height": 1024, "width": 1024},
+     image_processor = CLIPImageProcessor(size={"shortest_edge": int(args.input_size)}, # CLIP 支持 336x336
+                                          crop_size={"height": int(args.input_size), "width": int(args.input_size)},
                                           image_mean=[0, 0, 0],
                                           image_std=[1/255, 1/255, 1/255]
                                           )
@@ -43,7 +43,7 @@ def vision_encoder(image_path, ax_session):
  
      return vit_output
  
- def llm_infer(image_features, llm_path, config, tokenizer, imer, get_input):
+ def llm_infer(image_features, llm_path, config, tokenizer, imer, get_input, token_length):
  
      embeds = np.load(os.path.join(llm_path, "model.embed_tokens.weight.npy"))
  
@@ -53,7 +53,7 @@ def llm_infer(image_features, llm_path, config, tokenizer, imer, get_input):
  
      if image_features is not None:
          # # for idx in range(len(image_features)):
-         prompt += "\n<img>" + "<image>"*256 + "</img>\n"
+         prompt += "\n<img>" + "<image>"*token_length + "</img>\n"
          prompt += "<|im_end|>\n<|im_start|>assistant\n"
  
      token_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX)
@@ -66,7 +66,7 @@ def llm_infer(image_features, llm_path, config, tokenizer, imer, get_input):
      if image_features is not None:
          image_start_index = np.where(np.array(token_ids) == -200)[0][0] # <image> tag 151646
          image_insert_index = image_start_index + 1
-         prefill_data[image_insert_index : image_insert_index + 256] = image_features[0, :, :]
+         prefill_data[image_insert_index : image_insert_index + token_length] = image_features[0, :, :]
  
      eos_token_id = None
      if isinstance(config.eos_token_id, list) and len(config.eos_token_id) > 1:
@@ -88,11 +88,19 @@ if __name__ == "__main__":
      args.add_argument("--vision_model", "-v", type=str, default="./fastvlm_ax650_context_1k_prefill_640/image_encoder_1024x1024.axmodel", help="Path to the vision axmodel.")
      args.add_argument("--model_path", "-m", type=str, default="./fastvlm_ax650_context_1k_prefill_640", help="Path to the llm axmodel.")
      args.add_argument("--tokenizer_path", "-t", type=str, default="./fastvlm_tokenizer", help="Path to the tokenizer.")
-     # args.add_argument("--images", type=str, default=None, help="Paths to the input images.")
+     args.add_argument("--input_size", "-i", type=str, default="1024", help="Input size of the vision encoder model.")
      # args.add_argument("--question", type=str, default="介绍一下你自己", help="The question to ask the model.")
  
      args = args.parse_args()
  
+     token_len_map = {"2048": 1280,
+                      "1024": 256,
+                      "768": 144,
+                      "512": 64,
+                      "256": 16}
+ 
+     token_length = token_len_map[args.input_size]
+ 
      print("Loading config, tokenizer and init model.")
      config, tokenizer = load_model_and_tokenizer(model_path=args.tokenizer_path)
  
@@ -115,9 +123,9 @@ if __name__ == "__main__":
          if not os.path.isfile(get_input):
              print("[INFO]: 输入错误,请检查图片输入路径。")
              continue
-         image_features = vision_encoder(get_input, ax_session)
+         image_features = vision_encoder(get_input, ax_session, args)
          get_input = "Describe the image in detail."
-         llm_infer(image_features, args.model_path, config, tokenizer, imer, get_input)
+         llm_infer(image_features, args.model_path, config, tokenizer, imer, get_input, token_length)
      else:
          image_features = None
-         llm_infer(image_features, args.model_path, config, tokenizer, imer, get_input)
+         llm_infer(image_features, args.model_path, config, tokenizer, imer, get_input, token_length)
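The new `token_len_map` hard-codes how many image tokens each encoder resolution produces. For the 256–1024 entries this matches a 64-pixel patch grid, i.e. `(size // 64) ** 2`; the 2048 entry (1280 tokens) does not fit that formula, so the map, not the formula, should be treated as authoritative. A small check of that observation:

```python
# Values copied verbatim from the token_len_map added in this commit.
token_len_map = {"2048": 1280, "1024": 256, "768": 144, "512": 64, "256": 16}

# For 256..1024 the count equals a 64-px patch grid: (size // 64) ** 2.
for size in ("256", "512", "768", "1024"):
    assert token_len_map[size] == (int(size) // 64) ** 2

# 2048 is the exception: (2048 // 64) ** 2 would be 1024, not 1280,
# so don't derive the 2048 count from the grid formula.
assert token_len_map["2048"] != (2048 // 64) ** 2
print(token_len_map["512"])  # 64 tokens for the new 512x512 encoder
```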
utils/__pycache__/infer_func.cpython-313.pyc CHANGED
Binary files a/utils/__pycache__/infer_func.cpython-313.pyc and b/utils/__pycache__/infer_func.cpython-313.pyc differ
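The core of the `llm_infer` change above is splicing `token_length` rows of image features into the prefill embedding, right after the `<image>` placeholder (token id -200). A minimal numpy sketch of that splice, with toy shapes (hidden size 4 and 4 image tokens chosen purely for illustration):

```python
import numpy as np

IMAGE_TOKEN_INDEX = -200   # placeholder id used by tokenizer_image_token
token_length = 4           # toy value; e.g. 64 for the new 512x512 encoder
hidden = 4                 # toy hidden size for illustration

# Toy sequence: a text token, the <image> marker, token_length reserved
# slots, then another text token.
token_ids = [1, IMAGE_TOKEN_INDEX, 0, 0, 0, 0, 2]
prefill_data = np.zeros((len(token_ids), hidden), dtype=np.float32)
image_features = np.ones((1, token_length, hidden), dtype=np.float32)

# Mirror of the diff: overwrite the rows right after the marker position.
image_start_index = np.where(np.array(token_ids) == IMAGE_TOKEN_INDEX)[0][0]
image_insert_index = image_start_index + 1
prefill_data[image_insert_index : image_insert_index + token_length] = image_features[0, :, :]

print(prefill_data.sum())  # 16.0: token_length * hidden entries were filled
```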