import torch
import torch.nn as nn
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode
from transformers import LlamaTokenizer

from .configuration_intern_vit import InternVisionConfig
from .configuration_internvl import InternVLConfig
from .modeling_intern_vit import InternVisionModel
from .modeling_internvl import InternVL_C, InternVL_G, InternVLModel

__all__ = ['InternVisionConfig', 'InternVisionModel', 'InternVLConfig',
           'InternVLModel', 'InternVL_C', 'InternVL_G']


class InternVLTokenizer(nn.Module):
    def __init__(self, model_path):
        super(InternVLTokenizer, self).__init__()
        self.tokenizer = LlamaTokenizer.from_pretrained(model_path)
        self.tokenizer.pad_token = ' '  # pad with a plain space token
        self.tokenizer.add_eos_token = True

    def forward(self, text, prefix='summarize:'):
        # Prepend the task prefix to a single string or to every item in a batch.
        if isinstance(text, str):
            text = prefix + text
        elif isinstance(text, list):
            text = [prefix + item for item in text]
        text = self.tokenizer(text, return_tensors='pt', max_length=80,
                              truncation=True, padding='max_length').input_ids
        return text

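# Illustrative usage sketch (the checkpoint path is a placeholder): every
# prompt is padded/truncated to max_length=80 with the 'summarize:' prefix
# prepended, so the output shape is fixed regardless of input length.
#
#   tokenizer = InternVLTokenizer('path/to/internvl_checkpoint')
#   input_ids = tokenizer(['a photo of a cat', 'a photo of a dog'])
#   input_ids.shape  # torch.Size([2, 80])
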
def build_transform(task, image_size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
    if task == 'retrieval':
        # Retrieval: resize straight to image_size x image_size (may change aspect ratio).
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=mean, std=std)])
    else:
        # Other tasks: resize the short side to image_size, then center-crop.
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
            T.CenterCrop(image_size),
            T.ToTensor(),
            T.Normalize(mean=mean, std=std)])
    return transform

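# Sketch of how the transform is typically applied (PIL image in, batched
# tensor out; the defaults above are the standard ImageNet statistics):
#
#   transform = build_transform('retrieval', image_size=224)
#   pixel_values = transform(pil_image).unsqueeze(0)  # (1, 3, 224, 224)
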
def load_internvl_c_huggingface(ckpt_path, device, task):
    model = InternVL_C.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
    # If LoRA adapters were used in training, merge them into the base weights
    # and unwrap the PEFT wrapper so inference runs on plain modules.
    if model.config.use_backbone_lora:
        model.vision_model.merge_and_unload()
        model.vision_model = model.vision_model.model
    if model.config.use_qllama_lora:
        model.qllama.merge_and_unload()
        model.qllama = model.qllama.model
    # Prefer an explicitly forced input resolution over the backbone default.
    if model.config.force_image_size is not None:
        image_size = model.config.force_image_size
    else:
        image_size = model.config.vision_config.image_size
    transform = build_transform(task, image_size)
    tokenizer = InternVLTokenizer(ckpt_path)
    return model, transform, tokenizer

def load_internvl_g_huggingface(ckpt_path, device, task):
    # Same loading logic as load_internvl_c_huggingface, but for the InternVL_G head.
    model = InternVL_G.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
    if model.config.use_backbone_lora:
        model.vision_model.merge_and_unload()
        model.vision_model = model.vision_model.model
    if model.config.use_qllama_lora:
        model.qllama.merge_and_unload()
        model.qllama = model.qllama.model
    if model.config.force_image_size is not None:
        image_size = model.config.force_image_size
    else:
        image_size = model.config.vision_config.image_size
    transform = build_transform(task, image_size)
    tokenizer = InternVLTokenizer(ckpt_path)
    return model, transform, tokenizer
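
# Minimal end-to-end sketch (checkpoint path and device are placeholders, not
# values from this repo):
#
#   model, transform, tokenizer = load_internvl_c_huggingface(
#       'path/to/internvl_c_checkpoint', torch.device('cuda'), 'retrieval')
#   pixel_values = transform(pil_image).unsqueeze(0).to(torch.float16).to('cuda')
#   input_ids = tokenizer('a photo of a cat').to('cuda')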