Update scripts/qwen3_vl_embedding.py
#3
by zyznull - opened

scripts/qwen3_vl_embedding.py  CHANGED  +65 -54
@@ -114,33 +114,59 @@ class Qwen3VLForEmbedding(Qwen3VLPreTrainedModel):
             attention_mask=attention_mask,
         )
 
+def sample_frames(frames: List[Union[str, Image.Image]], num_segments: int, max_segments: int) -> List[str]:
+    duration = len(frames)
+    frame_id_array = np.linspace(0, duration - 1, num_segments, dtype=int)
+    frame_id_list = frame_id_array.tolist()
+    last_frame_id = frame_id_list[-1]
+
+    # Create a list of sampled frames
+    sampled_frames = []
+    for frame_idx in frame_id_list:
+        try:
+            sampled_frames.append(frames[frame_idx])
+        except:
+            break
+    # Ensure the sampled list meets the required segment count
+    while len(sampled_frames) < num_segments:
+        sampled_frames.append(frames[last_frame_id])
+    return sampled_frames[:max_segments]
+
 # Define embedder class for processing inputs and generating embeddings
 class Qwen3VLEmbedder():
-    def __init__(
-
+    def __init__(
+        self,
+        model_name_or_path: str,
+        max_length: int = MAX_LENGTH,
+        min_pixels: int = MIN_PIXELS,
+        max_pixels: int = MAX_PIXELS,
+        total_pixels: int = MAX_TOTAL_PIXELS,
+        fps: float = FPS,
+        num_frames: int = MAX_FRAMES,
+        max_frames: int = MAX_FRAMES,
+        default_instruction: str = "Represent the user's input.",
+        **kwargs
+    ):
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
         self.max_length = max_length
-        self.
-
-        self.
-        self.
-        self.
-        self.
-
-        self.
-
-        # Initialize model and processor
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.total_pixels = total_pixels
+        self.fps = fps
+        self.num_frames = num_frames
+        self.max_frames = max_frames
+
+        self.default_instruction = default_instruction
+
         self.model = Qwen3VLForEmbedding.from_pretrained(
             model_name_or_path, trust_remote_code=True, **kwargs
         ).to(device)
         self.processor = Qwen3VLProcessor.from_pretrained(
             model_name_or_path, padding_side='right'
         )
+        self.model.eval()
 
-        # Define padding token id
-        self.model.eval()  # Set model to evaluation mode
-
-        # Forward pass for the embedder model
     @torch.no_grad()
     def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]:
         outputs = self.model(**inputs)
@@ -149,25 +175,6 @@ class Qwen3VLEmbedder():
             'attention_mask': inputs.get('attention_mask')
         }
 
-    # Sample frames from video files
-    def _sample_frames(self, frames: List[str], num_segments: int, max_segments: int) -> List[str]:
-        duration = len(frames)
-        frame_id_array = np.linspace(0, duration - 1, num_segments, dtype=int)
-        frame_id_list = frame_id_array.tolist()
-        last_frame_id = frame_id_list[-1]
-
-        # Create a list of sampled frames
-        sampled_frames = []
-        for frame_idx in frame_id_list:
-            try:
-                sampled_frames.append(frames[frame_idx])
-            except:
-                break
-        # Ensure the sampled list meets the required segment count
-        while len(sampled_frames) < num_segments:
-            sampled_frames.append(frames[last_frame_id])
-        return sampled_frames[:max_segments]
-
     # Truncate token sequence to a specified max length
     def _truncate_tokens(self, token_ids: List[int], max_length: int) -> List[int]:
         if len(token_ids) <= max_length:
@@ -189,12 +196,14 @@ class Qwen3VLEmbedder():
         return final_token_ids
 
     # Format input based on provided text, image, video, and instruction
-    def format_model_input(
-
-
-
-
-
+    def format_model_input(
+        self, text: Optional[str] = None,
+        image: Optional[Union[str, Image.Image]] = None,
+        video: Optional[Union[str, List[Union[str, Image.Image]]]] = None,
+        instruction: Optional[str] = None,
+        fps: Optional[float] = None,
+        max_frames: Optional[int] = None
+    ) -> List[Dict]:
 
         # Ensure instruction ends with punctuation
         if instruction:
@@ -205,35 +214,37 @@ class Qwen3VLEmbedder():
         # Initialize conversation with system prompts
        content = []
        conversation = [
-            {"role": "system", "content": [{"type": "text", "text": instruction or self.
+            {"role": "system", "content": [{"type": "text", "text": instruction or self.default_instruction}]},
             {"role": "user", "content": content}
         ]
 
         # Add text, image, or video content to conversation
         if not text and not image and not video:
-            content.append({'type': 'text', 'text': ""})
+            content.append({'type': 'text', 'text': "NULL"})
             return conversation
 
         if video:
             video_content = None
+            video_kwargs = { 'total_pixels': self.total_pixels }
             if isinstance(video, list):
                 video_content = video
                 if self.num_frames is not None or self.max_frames is not None:
-                    video_content =
-                video_content = [
+                    video_content = sample_frames(video_content, self.num_frames, self.max_frames)
+                video_content = [
+                    ('file://' + ele if isinstance(ele, str) else ele)
+                    for ele in video_content
+                ]
             elif isinstance(video, str):
-                video_content = video if video.startswith(('http', '
+                video_content = video if video.startswith(('http://', 'https://')) else 'file://' + video
+                video_kwargs = {'fps': fps or self.fps, 'max_frames': max_frames or self.max_frames,}
             else:
-
+                raise TypeError(f"Unrecognized video type: {type(video)}")
 
             # Add video input details to content
             if video_content:
                 content.append({
                     'type': 'video', 'video': video_content,
-
-                    'max_frames': max_frames or self.max_frames,
-                    'fps': fps or self.fps,
-                    'sample_fps': fps or self.fps,
+                    **video_kwargs
                 })
 
         if image:
@@ -243,7 +254,7 @@ class Qwen3VLEmbedder():
             elif isinstance(image, str):
                 image_content = image if image.startswith(('http', 'oss')) else 'file://' + image
             else:
-
+                raise TypeError(f"Unrecognized image type: {type(image)}")
 
             # Add image input details to content
             if image_content:
@@ -270,7 +281,7 @@ class Qwen3VLEmbedder():
                 return_video_metadata=True, return_video_kwargs=True
             )
         except Exception as e:
-            logger.
+            logger.error(f"Error in processing vision info: {e}")
             images = None
             video_inputs = None
             video_kwargs = {'do_sample_frames': False}
@@ -323,4 +334,4 @@ class Qwen3VLEmbedder():
         if normalize:
             embeddings = F.normalize(embeddings, p=2, dim=-1)
 
-        return embeddings
+        return embeddings
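For reviewers, a small standalone sketch of the index selection that the new module-level sample_frames performs. The frame names and counts below are made up for illustration; only numpy is required, and the logic mirrors the np.linspace and padding steps in the first hunk above.

import numpy as np

# Hypothetical stand-ins for decoded video frames or frame paths.
frames = [f"frame_{i:03d}.jpg" for i in range(10)]
num_segments = 4   # frames we want to keep
max_segments = 8   # hard cap applied at the end

# Evenly spaced indices across the clip, as in the PR's np.linspace call.
frame_ids = np.linspace(0, len(frames) - 1, num_segments, dtype=int).tolist()
sampled = [frames[i] for i in frame_ids]

# Pad with the last selected frame if short, then cap the length.
while len(sampled) < num_segments:
    sampled.append(frames[frame_ids[-1]])
print(sampled[:max_segments])  # ['frame_000.jpg', 'frame_003.jpg', 'frame_006.jpg', 'frame_009.jpg']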
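As a sanity check on the new video_kwargs handling, here is a rough sketch (not part of the PR) of the conversation that a call like format_model_input(video='clip.mp4', fps=2.0) would build after this change. The values are assumed; only the dictionary shapes are meant to match the diff, where the string-video branch supplies fps/max_frames and list inputs keep total_pixels instead.

# Illustrative only: mirrors the structure format_model_input builds after this PR.
default_instruction = "Represent the user's input."
fps, max_frames = 2.0, 64           # assumed values for the sketch

video = "clip.mp4"                  # a local path, so it gets the file:// prefix
video_content = video if video.startswith(('http://', 'https://')) else 'file://' + video
video_kwargs = {'fps': fps, 'max_frames': max_frames}   # string-input branch of the diff

content = [{'type': 'video', 'video': video_content, **video_kwargs}]
conversation = [
    {"role": "system", "content": [{"type": "text", "text": default_instruction}]},
    {"role": "user", "content": content},
]
print(conversation)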