upload CondRef-AR model
- .gitattributes +5 -0
- CondRefAR/models/dinov2_adapter.py +36 -0
- CondRefAR/models/generate.py +205 -0
- CondRefAR/models/gpt_t2i.py +588 -0
- CondRefAR/pipeline.py +112 -0
- CondRefAR/tokenizer/vq_model.py +425 -0
- CondRefAR/utils/drop_path.py +36 -0
- README.md +75 -0
- app.py +100 -0
- assets/aerial_img.gif +3 -0
- assets/control_img.gif +3 -0
- assets/evolution.png +3 -0
- assets/examples/example1.jpg +0 -0
- assets/examples/example2.jpg +0 -0
- assets/method.jpg +3 -0
- assets/samples.png +3 -0
- configs/gpt_config.json +12 -0
- configs/vq_config.json +9 -0
- requirements.txt +7 -0
- sample.py +26 -0
- weights/sketch-gpt-xl.safetensors +3 -0
- weights/vq-16.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/aerial_img.gif filter=lfs diff=lfs merge=lfs -text
+assets/control_img.gif filter=lfs diff=lfs merge=lfs -text
+assets/evolution.png filter=lfs diff=lfs merge=lfs -text
+assets/method.jpg filter=lfs diff=lfs merge=lfs -text
+assets/samples.png filter=lfs diff=lfs merge=lfs -text
CondRefAR/models/dinov2_adapter.py ADDED
@@ -0,0 +1,36 @@
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import requests
import torch
import torch.nn as nn


class Dinov2_Adapter(nn.Module):
    def __init__(self, input_dim=1, output_dim=768, attention=False, pool=False, nheads=8, dropout=0.1, adapter_size='small', condition_type='canny'):
        super(Dinov2_Adapter, self).__init__()
        print(f"Choose adapter size: {adapter_size}")
        print(f"condition type: {condition_type}")
        self.model = AutoModel.from_pretrained('D:\\Alps\\Aerial\\Code\\ControlRAR\\checkpoints\\dinov2\\')
        self.condition_type = condition_type

    def to_patch14(self, input):
        H, W = input.shape[2:]
        new_H = (H // 16) * 14
        new_W = (W // 16) * 14
        if self.condition_type in ['canny', 'seg']:
            output = torch.nn.functional.interpolate(input, size=(new_H, new_W), mode='nearest')  # canny, seg
        else:
            output = torch.nn.functional.interpolate(input, size=(new_H, new_W), mode='bicubic', align_corners=True)  # depth, lineart, hed
        return output

    def forward(self, x):
        x = self.to_patch14(x)
        x = self.model(x)
        return x.last_hidden_state[:, 1:]


if __name__ == '__main__':
    model = Dinov2_Adapter().cuda()
    inputs = torch.randn(4, 3, 512, 512).cuda()
    outputs = model(inputs)
    print(outputs.shape)
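A minimal sketch (not part of the uploaded file) of the patch-size remapping that to_patch14 performs: DINOv2 uses 14-pixel patches while the VQ tokenizer works on 16-pixel strides, so a condition image of size H x W is resized to (H//16)*14 x (W//16)*14 so that both models see the same token grid. The 512x512 input size below is assumed for illustration.

# Hypothetical check of the 16 -> 14 patch remapping in Dinov2_Adapter.to_patch14.
# A 512x512 condition image covers a 32x32 token grid at stride 16; resizing it to
# 448x448 keeps the same 32x32 grid under DINOv2's 14-pixel patches (448 / 14 = 32).
H = W = 512
new_H, new_W = (H // 16) * 14, (W // 16) * 14
assert (new_H, new_W) == (448, 448)
assert new_H // 14 == H // 16 == 32  # identical token grid before and after resizing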
CondRefAR/models/generate.py ADDED
@@ -0,0 +1,205 @@
# Modified from:
#   gpt-fast: https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py
#   DiT: https://github.com/facebookresearch/DiT/blob/main/models.py
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch._dynamo.config
import torch._inductor.config
import copy
import time
import pdb
# torch._inductor.config.coordinate_descent_tuning = True
# torch._inductor.config.triton.unique_kernel_names = True
# torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future


### from https://huggingface.co/transformers/v3.2.0/_modules/transformers/generation_utils.html
def top_k_top_p_filtering(
    logits,
    top_k: int = 0,
    top_p: float = 1.0,
    filter_value: float = -float("Inf"),
    min_tokens_to_keep: int = 1,
):
    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
    Args:
        logits: logits distribution shape (batch size, vocabulary size)
        if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
        if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
            Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
    Make sure we keep at least min_tokens_to_keep per batch example in the output
    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    if top_k > 0:
        top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))  # Safety check
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
        sorted_indices_to_remove = cumulative_probs > top_p
        if min_tokens_to_keep > 1:
            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
            sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # scatter sorted tensors to original indexing
        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits


def sample(logits, temperature: float = 1.0, top_k: int = 2000, top_p: float = 1.0, sample_logits=True):
    logits = logits[:, -1, :] / max(temperature, 1e-5)
    if top_k > 0 or top_p < 1.0:
        logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
    probs = F.softmax(logits, dim=-1)
    if sample_logits:
        idx = torch.multinomial(probs, num_samples=1)
    else:
        _, idx = torch.topk(probs, k=1, dim=-1)
    return idx, probs


def logits_to_probs(logits, temperature: float = 1.0, top_p: float = 1.0, top_k: int = None, **kwargs):
    logits = logits / max(temperature, 1e-5)
    if top_k > 0 or top_p < 1.0:
        logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    return probs


def prefill(model, cond_idx: torch.Tensor, input_pos: torch.Tensor, cfg_scale: float, condition: torch.Tensor, control_strength: float = 1, **sampling_kwargs):
    if cfg_scale > 1.0:
        logits, _ = model(None, cond_idx, input_pos, condition=condition, control_strength=control_strength)
        cond_logits, uncond_logits = torch.split(logits, len(logits) // 2, dim=0)
        logits = uncond_logits + (cond_logits - uncond_logits) * cfg_scale
    else:
        logits, _ = model(None, cond_idx, input_pos, condition=condition)

    return sample(logits, **sampling_kwargs)[0]


def decode_one_token(model, x: torch.Tensor, input_pos: torch.Tensor, cfg_scale: float, cfg_flag: bool, condition: torch.Tensor, **sampling_kwargs):
    assert input_pos.shape[-1] == 1
    if cfg_scale > 1.0:
        x_combined = torch.cat([x, x])
        logits, _ = model(x_combined, cond_idx=None, input_pos=input_pos, condition=condition)
        cond_logits, uncond_logits = torch.split(logits, len(logits) // 2, dim=0)
        if cfg_flag:
            logits = uncond_logits + (cond_logits - uncond_logits) * cfg_scale
        else:
            logits = cond_logits
    else:
        logits, _ = model(x, cond_idx=None, input_pos=input_pos, condition=None)
    return sample(logits, **sampling_kwargs)


def decode_n_tokens(
    model, cur_token: torch.Tensor, input_pos: torch.Tensor, num_new_tokens: int,
    cfg_scale: float, cfg_interval: int, condition: torch.Tensor,
    **sampling_kwargs):
    new_tokens, new_probs = [], []
    cfg_flag = True
    for i in range(num_new_tokens):
        with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):  # Actually better for Inductor to codegen attention here
            if cfg_interval > -1 and i > cfg_interval:
                cfg_flag = False
            next_token, next_prob = decode_one_token(
                model, cur_token, input_pos, cfg_scale, cfg_flag, condition=condition, **sampling_kwargs
            )
            input_pos += 1
            new_tokens.append(next_token.clone())
            new_probs.append(next_prob.clone())
            cur_token = next_token.view(-1, 1)

    return new_tokens, new_probs


@torch.no_grad()
def generate(model, cond, max_new_tokens, emb_masks=None, cfg_scale=1.0, cfg_interval=-1, condition=None, condition_null=None, condition_token_nums=0, control_strength=1, **sampling_kwargs):
    if condition is not None:
        condition = model.adapter(condition)
        condition = model.adapter_mlp(condition)
    if model.model_type == 'c2i':
        if cfg_scale > 1.0:
            cond_null = torch.ones_like(cond) * model.num_classes
            cond_combined = torch.cat([cond, cond_null])
            if condition is not None:
                condition_null = torch.zeros_like(condition)
                condition_combined = torch.cat((condition, condition_null), dim=0)
            else:
                condition_combined = None
        else:
            cond_combined = cond
            if condition is not None:
                condition_combined = condition
            else:
                condition_combined = None
        T = 1 + condition_token_nums
    elif model.model_type == 't2i':
        if cfg_scale > 1.0:
            cond_null = torch.zeros_like(cond) + model.cls_embedding.uncond_embedding
            cond_combined = torch.cat([cond, cond_null])

            if condition is not None:
                condition_null = torch.zeros_like(condition)
                condition_combined = torch.cat((condition, condition_null), dim=0)
            else:
                condition_combined = None
        else:
            cond_combined = cond
            if condition is not None:
                condition_combined = condition
            else:
                condition_combined = None
        T = cond.shape[1]
    else:
        raise Exception("please check model type")

    T_new = T + max_new_tokens
    max_seq_length = T_new
    max_batch_size = cond.shape[0]

    device = cond.device
    with torch.device(device):
        max_batch_size_cfg = max_batch_size * 2 if cfg_scale > 1.0 else max_batch_size
        model.setup_caches(max_batch_size=max_batch_size_cfg, max_seq_length=max_seq_length, dtype=model.tok_embeddings.weight.dtype)

    if emb_masks is not None:
        assert emb_masks.shape[0] == max_batch_size
        assert emb_masks.shape[-1] == T
        if cfg_scale > 1.0:
            model.causal_mask[:, :, :T] = model.causal_mask[:, :, :T] * torch.cat([emb_masks, emb_masks]).unsqueeze(1)
        else:
            model.causal_mask[:, :, :T] = model.causal_mask[:, :, :T] * emb_masks.unsqueeze(1)

        eye_matrix = torch.eye(model.causal_mask.size(1), model.causal_mask.size(2), device=device)
        model.causal_mask[:] = model.causal_mask * (1 - eye_matrix) + eye_matrix

    # create an empty tensor of the expected final shape and fill in the current tokens
    seq = torch.empty((max_batch_size, T_new), dtype=torch.int, device=device)
    input_pos = torch.arange(0, T, device=device)
    next_token = prefill(model, cond_combined, input_pos, cfg_scale, condition_combined, control_strength, **sampling_kwargs)
    seq[:, T:T+1] = next_token

    input_pos = torch.tensor([T], device=device, dtype=torch.int)
    generated_tokens, _ = decode_n_tokens(model, next_token, input_pos, max_new_tokens-1, cfg_scale, cfg_interval, condition=condition_combined, **sampling_kwargs)
    seq[:, T+1:] = torch.cat(generated_tokens, dim=1)
    return seq[:, T:]
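A small self-contained sketch (not part of the uploaded file; tiny vocabulary and batch size are assumed for illustration) of what top_k_top_p_filtering does to the logits before sample() draws the next token: everything outside the top-k set is pushed to negative infinity so it gets zero probability after the softmax.

import torch
from torch.nn import functional as F

# Illustrative only: batch-of-one logits over a 6-token vocabulary.
logits = torch.tensor([[2.0, 1.0, 0.5, 0.1, -1.0, -2.0]])

# top_k=2 keeps only the two highest logits; everything else becomes -inf,
# which is the same masking rule used inside top_k_top_p_filtering.
top_k = 2
kth_value = torch.topk(logits, top_k)[0][..., -1, None]
filtered = logits.masked_fill(logits < kth_value, float("-inf"))

probs = F.softmax(filtered, dim=-1)
assert torch.count_nonzero(probs) == top_k        # only 2 tokens can be sampled
next_token = torch.multinomial(probs, num_samples=1)  # same draw as sample(..., sample_logits=True)
print(next_token.shape)  # torch.Size([1, 1])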
CondRefAR/models/gpt_t2i.py ADDED
@@ -0,0 +1,588 @@
# Modified from:
#   VQGAN: https://github.com/CompVis/taming-transformers/blob/master/taming/modules/transformer/mingpt.py
#   DiT: https://github.com/facebookresearch/DiT/blob/main/models.py
#   nanoGPT: https://github.com/karpathy/nanoGPT/blob/master/model.py
#   llama: https://github.com/facebookresearch/llama/blob/main/llama/model.py
#   gpt-fast: https://github.com/pytorch-labs/gpt-fast/blob/main/model.py
#   PixArt: https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
from dataclasses import dataclass
from typing import Optional, List

import torch
import torch.nn as nn
from torch.nn import functional as F
from ..utils.drop_path import DropPath
from .dinov2_adapter import Dinov2_Adapter


def get_causal_mask(seq_length):
    mask = torch.triu(torch.ones(seq_length, seq_length), diagonal=1).type(torch.bool)
    mask = mask.masked_fill(mask, float('-inf'))
    mask = mask.masked_fill(~mask, float(0.0))
    return mask


def find_multiple(n: int, k: int):
    if n % k == 0:
        return n
    return n + k - (n % k)


@dataclass
class ModelArgs:
    dim: int = 4096
    n_layer: int = 32
    n_head: int = 32
    n_kv_head: Optional[int] = None
    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
    ffn_dim_multiplier: Optional[float] = None
    rope_base: float = 10000
    norm_eps: float = 1e-5
    initializer_range: float = 0.02

    token_dropout_p: float = 0.1
    attn_dropout_p: float = 0.0
    resid_dropout_p: float = 0.1
    ffn_dropout_p: float = 0.1
    drop_path_rate: float = 0.0

    num_classes: int = 1000
    caption_dim: int = 2048
    class_dropout_prob: float = 0.1
    model_type: str = 'c2i'

    vocab_size: int = 16384
    cls_token_num: int = 1
    block_size: int = 256
    max_batch_size: int = 32
    max_seq_len: int = 2048
    adapter_size: str = 'small'
    condition_type: str = 'canny'


#################################################################################
#                      Embedding Layers for Class Labels                        #
#################################################################################
class LabelEmbedder(nn.Module):
    """
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
    """
    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.
        """
        if force_drop_ids is None:
            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        else:
            drop_ids = force_drop_ids == 1
        labels = torch.where(drop_ids, self.num_classes, labels)
        return labels, drop_ids

    def forward(self, labels, train, force_drop_ids=None):
        use_dropout = self.dropout_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            labels, drop_ids = self.token_drop(labels, force_drop_ids)
        embeddings = self.embedding_table(labels).unsqueeze(1)
        if (train and use_dropout) or (force_drop_ids is not None):
            return embeddings, drop_ids
        else:
            return embeddings


class ConditionEmbedder(nn.Module):
    """
    Embeds Condition into vector representations. Also handles label dropout for classifier-free guidance.
    """
    def __init__(self, in_channels, hidden_size, uncond_prob, token_num=120, vocab_size=16384):
        super().__init__()
        self.cap_proj = MLP(in_features=hidden_size, hidden_features=hidden_size, out_features=hidden_size)
        self.register_buffer("uncond_embedding", torch.zeros(token_num, hidden_size) / hidden_size ** 0.5)
        self.uncond_prob = uncond_prob

    def token_drop(self, caption, force_drop_ids=None, drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.
        """
        if force_drop_ids is None:
            if drop_ids is None:
                drop_ids = torch.rand(caption.shape[0], device=caption.device) < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1

        caption = torch.where(drop_ids[:, None, None], self.uncond_embedding[:caption.shape[1]], caption)
        return caption

    def forward(self, caption, train, force_drop_ids=None, drop_ids=None):
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            caption = self.token_drop(caption, force_drop_ids, drop_ids)
        embeddings = self.cap_proj(caption)
        return embeddings


#################################################################################
#                      Embedding Layers for Text Feature                        #
#################################################################################
class CaptionEmbedder(nn.Module):
    """
    Embeds text caption into vector representations. Also handles label dropout for classifier-free guidance.
    """
    def __init__(self, in_channels, hidden_size, uncond_prob, token_num=120):
        super().__init__()
        self.cap_proj = MLP(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size)
        self.register_buffer("uncond_embedding", nn.Parameter(torch.randn(token_num, in_channels) / in_channels ** 0.5))
        self.uncond_prob = uncond_prob

    def token_drop(self, caption, force_drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.
        """
        if force_drop_ids is None:
            drop_ids = torch.rand(caption.shape[0], device=caption.device) < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        caption = torch.where(drop_ids[:, None, None], self.uncond_embedding, caption)
        return caption, drop_ids

    def forward(self, caption, train, force_drop_ids=None):
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            caption, drop_ids = self.token_drop(caption, force_drop_ids)
        embeddings = self.cap_proj(caption)
        if (train and use_dropout) or (force_drop_ids is not None):
            return embeddings, drop_ids
        else:
            return embeddings


class MLP(nn.Module):
    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=False)
        self.act = nn.GELU(approximate='tanh')
        self.fc2 = nn.Linear(hidden_features, out_features, bias=False)

        nn.init.zeros_(self.fc1.weight)
        nn.init.zeros_(self.fc2.weight)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        return x


#################################################################################
#                                 GPT Model                                     #
#################################################################################
class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight


class FeedForward(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        hidden_dim = 4 * config.dim
        hidden_dim = int(2 * hidden_dim / 3)
        # custom dim factor multiplier
        if config.ffn_dim_multiplier is not None:
            hidden_dim = int(config.ffn_dim_multiplier * hidden_dim)
        hidden_dim = find_multiple(hidden_dim, config.multiple_of)

        self.w1 = nn.Linear(config.dim, hidden_dim, bias=False)
        self.w3 = nn.Linear(config.dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, config.dim, bias=False)
        self.ffn_dropout = nn.Dropout(config.ffn_dropout_p)

    def forward(self, x):
        return self.ffn_dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))


class KVCache(nn.Module):
    def __init__(self, max_batch_size, max_seq_length, n_head, head_dim, dtype):
        super().__init__()
        cache_shape = (max_batch_size, n_head, max_seq_length, head_dim)
        self.register_buffer('k_cache', torch.zeros(cache_shape, dtype=dtype))
        self.register_buffer('v_cache', torch.zeros(cache_shape, dtype=dtype))

    def update(self, input_pos, k_val, v_val):
        # input_pos: [S], k_val: [B, H, S, D]
        assert input_pos.shape[0] == k_val.shape[2]
        k_out = self.k_cache
        v_out = self.v_cache
        k_out[:, :, input_pos] = k_val
        v_out[:, :, input_pos] = v_val

        return k_out, v_out


class Attention(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        assert config.dim % config.n_head == 0
        self.dim = config.dim
        self.head_dim = config.dim // config.n_head
        self.n_head = config.n_head
        self.n_kv_head = config.n_kv_head if config.n_kv_head is not None else config.n_head
        total_kv_dim = (self.n_head + 2 * self.n_kv_head) * self.head_dim

        # key, query, value projections for all heads, but in a batch
        self.wqkv = nn.Linear(config.dim, total_kv_dim, bias=False)
        self.wo = nn.Linear(config.dim, config.dim, bias=False)
        self.kv_cache = None

        # regularization
        self.attn_dropout_p = config.attn_dropout_p
        self.resid_dropout = nn.Dropout(config.resid_dropout_p)

    def forward(
        self, x: torch.Tensor, freqs_cis: torch.Tensor = None,
        input_pos: Optional[torch.Tensor] = None,
        mask: Optional[torch.Tensor] = None
    ):
        bsz, seqlen, _ = x.shape
        kv_size = self.n_kv_head * self.head_dim
        xq, xk, xv = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1)

        xq = xq.view(bsz, seqlen, self.n_head, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_kv_head, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_kv_head, self.head_dim)

        xq = apply_rotary_emb(xq, freqs_cis)
        xk = apply_rotary_emb(xk, freqs_cis)

        xq, xk, xv = map(lambda x: x.transpose(1, 2), (xq, xk, xv))

        if self.kv_cache is not None:
            keys, values = self.kv_cache.update(input_pos, xk, xv)
        else:
            keys, values = xk, xv
        keys = keys.repeat_interleave(self.n_head // self.n_kv_head, dim=1)
        values = values.repeat_interleave(self.n_head // self.n_kv_head, dim=1)

        output = F.scaled_dot_product_attention(
            xq, keys, values,
            attn_mask=mask,
            is_causal=True if mask is None else False,  # is_causal=False is for KV cache
            dropout_p=self.attn_dropout_p if self.training else 0)

        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)

        output = self.resid_dropout(self.wo(output))
        return output


class TransformerBlock(nn.Module):
    def __init__(self, config: ModelArgs, drop_path: float):
        super().__init__()
        self.attention = Attention(config)
        self.feed_forward = FeedForward(config)
        self.attention_norm = RMSNorm(config.dim, eps=config.norm_eps)
        self.ffn_norm = RMSNorm(config.dim, eps=config.norm_eps)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(
        self, x: torch.Tensor, freqs_cis: torch.Tensor, start_pos: int, mask: Optional[torch.Tensor] = None):
        h = x + self.drop_path(self.attention(self.attention_norm(x), freqs_cis, start_pos, mask))
        out = h + self.drop_path(self.feed_forward(self.ffn_norm(h)))
        return out


class Transformer(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        self.config = config
        self.vocab_size = config.vocab_size
        self.n_layer = config.n_layer
        self.block_size = config.block_size
        self.num_classes = config.num_classes
        self.model_type = config.model_type
        self.cls_token_num = config.cls_token_num
        self.layer_internal = config.n_layer // 3
        # self.adapter = Adapter(output_dim=768)
        # self.adapter = ViT_Adapter()
        # self.adapter = DeiT_Adapter()
        self.adapter = Dinov2_Adapter(adapter_size=config.adapter_size, condition_type=config.condition_type)
        # self.adapter = EVA_Adapter()
        if config.adapter_size == "small":
            self.adapter_mlp = MLP(384, config.dim, config.dim)
        elif config.adapter_size == 'base':
            self.adapter_mlp = MLP(768, config.dim, config.dim)

        if self.model_type == 'c2i':
            self.cls_embedding = LabelEmbedder(config.num_classes, config.dim, config.class_dropout_prob)
        elif self.model_type == 't2i':
            self.cls_embedding = CaptionEmbedder(config.caption_dim, config.dim, config.class_dropout_prob)
        else:
            raise Exception("please check model type")
        self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim)
        self.tok_dropout = nn.Dropout(config.token_dropout_p)

        self.condition_embeddings = nn.Embedding(config.vocab_size, config.dim)
        self.condition_mlp = ConditionEmbedder(self.block_size, config.dim, config.class_dropout_prob, self.block_size, config.vocab_size)
        self.condition_layers = torch.nn.ModuleList()
        for layer_id in range(3):
            self.condition_layers.append(MLP(config.dim, config.dim, config.dim))

        # transformer blocks
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.n_layer)]
        self.layers = torch.nn.ModuleList()
        for layer_id in range(config.n_layer):
            self.layers.append(TransformerBlock(config, dpr[layer_id]))

        # output layer
        self.norm = RMSNorm(config.dim, eps=config.norm_eps)
        self.output = nn.Linear(config.dim, config.vocab_size, bias=False)

        # 2d rotary pos embedding
        grid_size = int(self.block_size ** 0.5)
        assert grid_size * grid_size == self.block_size
        self.freqs_cis = precompute_freqs_cis_2d(grid_size, self.config.dim // self.config.n_head, self.config.rope_base, self.cls_token_num)

        # KVCache
        self.max_batch_size = -1
        self.max_seq_length = -1

        self.initialize_weights()
        self.condition_token = None
        self.mask = get_causal_mask(256)
        self.global_token = None

        self.control_strength = 1

    def initialize_weights(self):
        # Initialize nn.Linear and nn.Embedding
        self.apply(self._init_weights)

        # Zero-out output layers:
        nn.init.constant_(self.output.weight, 0)

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)

    def setup_caches(self, max_batch_size, max_seq_length, dtype):
        # if self.max_seq_length >= max_seq_length and self.max_batch_size >= max_batch_size:
        #     return
        head_dim = self.config.dim // self.config.n_head
        max_seq_length = find_multiple(max_seq_length, 8)
        self.max_seq_length = max_seq_length
        self.max_batch_size = max_batch_size
        for b in self.layers:
            b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_head, head_dim, dtype)

        causal_mask = torch.tril(torch.ones(self.max_seq_length, self.max_seq_length, dtype=torch.bool))
        self.causal_mask = causal_mask.unsqueeze(0).repeat(self.max_batch_size, 1, 1)
        grid_size = int(self.config.block_size ** 0.5)
        assert grid_size * grid_size == self.block_size
        self.freqs_cis = precompute_freqs_cis_2d(grid_size, self.config.dim // self.config.n_head, self.config.rope_base, self.cls_token_num)

    def forward(
        self,
        idx: torch.Tensor,
        cond_idx: torch.Tensor,  # cond_idx_or_embed
        input_pos: Optional[torch.Tensor] = None,
        targets: Optional[torch.Tensor] = None,
        mask: Optional[torch.Tensor] = None,
        valid: Optional[torch.Tensor] = None,
        condition: Optional[torch.Tensor] = None,
        control_strength: Optional[int] = 1
    ):
        if idx is not None and cond_idx is not None:  # training or naive inference
            cond_embeddings, drop_ids = self.cls_embedding(cond_idx, train=self.training)
            cond_embeddings = cond_embeddings[:, :self.cls_token_num]
            token_embeddings = self.tok_embeddings(idx)
            if condition is not None:
                condition_embeddings = self.adapter(condition)
                condition_embeddings = self.adapter_mlp(condition_embeddings)
                self.condition_token = self.condition_mlp(condition_embeddings, train=self.training, drop_ids=drop_ids)
            token_embeddings = torch.cat((cond_embeddings, token_embeddings), dim=1)

            h = self.tok_dropout(token_embeddings)
            self.freqs_cis = self.freqs_cis.to(h.device)
        else:
            if cond_idx is not None:  # prefill in inference
                self.control_strength = control_strength
                token_embeddings = self.cls_embedding(cond_idx, train=self.training)
                token_embeddings = token_embeddings[:, :self.cls_token_num]
                if condition is not None:
                    condition_embeddings = self.condition_mlp(condition, train=self.training)
                    self.condition_token = condition_embeddings
                    self.condition_token = [self.condition_layers[0](self.condition_token),
                                            self.condition_layers[1](self.condition_token),
                                            self.condition_layers[2](self.condition_token)]

            else:  # decode_n_tokens(kv cache) in inference
                token_embeddings = self.tok_embeddings(idx)
            bs = token_embeddings.shape[0]
            mask = self.causal_mask[:bs, None, input_pos]
            h = self.tok_dropout(token_embeddings)
            self.freqs_cis = self.freqs_cis

        if self.training:
            freqs_cis = self.freqs_cis[:token_embeddings.shape[1]]
        else:
            freqs_cis = self.freqs_cis[input_pos]
        # transformer blocks
        for i, layer in enumerate(self.layers):
            if i % self.layer_internal == 0:
                if self.training:
                    h[:, self.cls_token_num-1:] = h[:, self.cls_token_num-1:] + self.condition_layers[i//self.layer_internal](self.condition_token)
                else:
                    if len(input_pos) > 1:
                        # h[:, -1:] = h[:, -1:] + self.condition_layers[i//self.layer_internal](self.condition_token[:, 0:1])
                        h[:, -1:] = h[:, -1:] + self.control_strength * self.condition_token[i//self.layer_internal][:, 0:1]
                    else:
                        # h = h + self.condition_layers[i//self.layer_internal](self.condition_token[:, input_pos-self.cls_token_num+1])
                        h = h + self.control_strength * self.condition_token[i//self.layer_internal][:, input_pos-self.cls_token_num+1]
            h = layer(h, freqs_cis, input_pos, mask)
        # output layers
        h = self.norm(h)
        logits = self.output(h).float()

        if self.training:
            logits = logits[:, self.cls_token_num - 1:].contiguous()
        # if we are given some desired targets also calculate the loss
        loss = None
        if valid is not None:
            loss_all = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), reduction='none')
            valid_all = valid[:, None].repeat(1, targets.shape[1]).view(-1)
            loss = (loss_all * valid_all).sum() / max(valid_all.sum(), 1)
        elif targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    def get_fsdp_wrap_module_list(self) -> List[nn.Module]:
        return list(self.layers)


#################################################################################
#                      Rotary Positional Embedding Functions                    #
#################################################################################
# https://github.com/pytorch-labs/gpt-fast/blob/main/model.py
def precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 10000, cls_token_num=120):
    freqs = 1.0 / (base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem))
    t = torch.arange(seq_len, device=freqs.device)
    freqs = torch.outer(t, freqs)  # (seq_len, head_dim // 2)
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
    cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)  # (seq_len, head_dim // 2, 2)
    cond_cache = torch.cat([torch.zeros(cls_token_num, n_elem // 2, 2), cache])  # (cls_token_num+seq_len, head_dim // 2, 2)
    return cond_cache


def precompute_freqs_cis_2d(grid_size: int, n_elem: int, base: int = 10000, cls_token_num=120):
    # split the dimension into half, one for x and one for y
    half_dim = n_elem // 2
    freqs = 1.0 / (base ** (torch.arange(0, half_dim, 2)[: (half_dim // 2)].float() / half_dim))
    t = torch.arange(grid_size, device=freqs.device)
    freqs = torch.outer(t, freqs)  # (grid_size, head_dim // 2)
    freqs_grid = torch.concat([
        freqs[:, None, :].expand(-1, grid_size, -1),
        freqs[None, :, :].expand(grid_size, -1, -1),
    ], dim=-1)  # (grid_size, grid_size, head_dim // 2)
    cache_grid = torch.stack([torch.cos(freqs_grid), torch.sin(freqs_grid)], dim=-1)  # (grid_size, grid_size, head_dim // 2, 2)
    cache = cache_grid.flatten(0, 1)
    cond_cache = torch.cat([torch.zeros(cls_token_num, n_elem // 2, 2), cache])  # (cls_token_num+grid_size**2, head_dim // 2, 2)
    return cond_cache


def precompute_freqs_cis_2d_new(grid_size: int, n_elem: int, base: int = 10000, cls_token_num=120, spe_token_num=3, ar_token_num=4):
    # split the dimension into half, one for x and one for y
    half_dim = n_elem // 2
    freqs = 1.0 / (base ** (torch.arange(0, half_dim, 2)[: (half_dim // 2)].float() / half_dim))
    t = torch.arange(grid_size, device=freqs.device)
    freqs = torch.outer(t, freqs)  # (grid_size, head_dim // 2)
    freqs_grid = torch.concat([
        freqs[:, None, :].expand(-1, grid_size, -1),
        freqs[None, :, :].expand(grid_size, -1, -1),
    ], dim=-1)  # (grid_size, grid_size, head_dim // 2)
    cache_grid = torch.stack([torch.cos(freqs_grid), torch.sin(freqs_grid)], dim=-1)  # (grid_size, grid_size, head_dim // 2, 2)
    sub_num = int(ar_token_num**0.5)

    cache_grid = cache_grid.reshape(sub_num, grid_size//sub_num, sub_num, grid_size//sub_num, half_dim, 2)
    cache_grid = cache_grid.permute(1, 3, 0, 2, 4, 5)
    cache = cache_grid.flatten(0, 3)
    cache_one, cache_two = cache[:ar_token_num], cache[ar_token_num:]
    sep_cache = torch.zeros(spe_token_num, n_elem // 2, 2)
    cond_cache = torch.cat([torch.zeros(cls_token_num, n_elem // 2, 2), cache_one, sep_cache, cache_two])
    # cond_cache = torch.cat([torch.zeros(cls_token_num, n_elem // 2, 2), cache])  # (cls_token_num+grid_size**2, head_dim // 2, 2)
    return cond_cache


def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor):
    # x: (bs, seq_len, n_head, head_dim)
    # freqs_cis: (seq_len, head_dim // 2, 2)
    xshaped = x.float().reshape(*x.shape[:-1], -1, 2)  # (bs, seq_len, n_head, head_dim//2, 2)
    freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)  # (1, seq_len, 1, head_dim//2, 2)
    x_out2 = torch.stack([
        xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
        xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
    ], dim=-1)
    x_out2 = x_out2.flatten(3)
    return x_out2.type_as(x)


#################################################################################
#                                GPT Configs                                    #
#################################################################################
### text-conditional
def GPT_7B(**kwargs):
    return Transformer(ModelArgs(n_layer=32, n_head=32, dim=4096, **kwargs))  # 6.6B

def GPT_3B(**kwargs):
    return Transformer(ModelArgs(n_layer=24, n_head=32, dim=3200, **kwargs))  # 3.1B

def GPT_1B(**kwargs):
    return Transformer(ModelArgs(n_layer=22, n_head=32, dim=2048, **kwargs))  # 1.2B

### class-conditional
def GPT_XXXL(**kwargs):
    return Transformer(ModelArgs(n_layer=48, n_head=40, dim=2560, **kwargs))  # 3.9B

def GPT_XXL(**kwargs):
    return Transformer(ModelArgs(n_layer=48, n_head=24, dim=1536, **kwargs))  # 1.4B

def GPT_XL(**kwargs):
    return Transformer(ModelArgs(n_layer=36, n_head=20, dim=1280, **kwargs))  # 775M

def GPT_L(**kwargs):
    return Transformer(ModelArgs(n_layer=24, n_head=16, dim=1024, **kwargs))  # 343M

def GPT_B(**kwargs):
    return Transformer(ModelArgs(n_layer=12, n_head=12, dim=768, **kwargs))  # 111M


GPT_models = {
    'GPT-B': GPT_B, 'GPT-L': GPT_L, 'GPT-XL': GPT_XL, 'GPT-XXL': GPT_XXL, 'GPT-XXXL': GPT_XXXL,
    'GPT-1B': GPT_1B, 'GPT-3B': GPT_3B, 'GPT-7B': GPT_7B,
}
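A brief shape check (a sketch under assumed sizes, not part of the uploaded file) of the 2D rotary cache built by precompute_freqs_cis_2d and consumed by apply_rotary_emb: the cache has cls_token_num + grid_size**2 rows, each holding head_dim//2 (cos, sin) pairs, and rotating queries/keys leaves their shape unchanged. The grid size, head dimension, and token count below are assumptions for illustration.

import torch
from CondRefAR.models.gpt_t2i import precompute_freqs_cis_2d, apply_rotary_emb

# Assumed sizes: 16x16 latent grid, 64-dim heads, 120 caption tokens.
grid_size, head_dim, cls_token_num = 16, 64, 120

freqs_cis = precompute_freqs_cis_2d(grid_size, head_dim, base=10000, cls_token_num=cls_token_num)
assert freqs_cis.shape == (cls_token_num + grid_size * grid_size, head_dim // 2, 2)

# apply_rotary_emb rotates (bs, seq_len, n_head, head_dim) tensors with a matching cache slice.
x = torch.randn(2, 8, 4, head_dim)          # e.g. 8 positions of a 4-head query
rotated = apply_rotary_emb(x, freqs_cis[:8])
assert rotated.shape == x.shape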
CondRefAR/pipeline.py ADDED
@@ -0,0 +1,112 @@
import torch
import numpy as np
from PIL import Image
from safetensors.torch import load_file
from .models.gpt_t2i import GPT_models
from .models.generate import generate
from .tokenizer.vq_model import VQ_models


class CondRefARPipeline:
    def __init__(self, device=None, torch_dtype=torch.bfloat16):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.dtype = torch_dtype
        self.gpt = None
        self.vq = None
        self.image_size = None
        self.downsample = None
        self.n_q = 8

    @classmethod
    def from_pretrained(cls, repo_or_path, gpt_config, vq_config, gpt_weights="weights/sketch-gpt-xl.safetensors", vq_weights="weights/vq-16.safetensors", device=None, torch_dtype=torch.bfloat16):
        pipe = cls(device=device, torch_dtype=torch_dtype)

        # 1) VQ tokenizer
        pipe.downsample = int(vq_config["downsample_size"])
        codebook_size = int(vq_config["codebook_size"])
        codebook_embed_dim = int(vq_config["codebook_embed_dim"])
        pipe.vq = VQ_models[vq_config.get("model_name", "VQ-16")](codebook_size=codebook_size, codebook_embed_dim=codebook_embed_dim)
        vq_state = load_file(f"{repo_or_path}/{vq_weights}")
        pipe.vq.load_state_dict(vq_state, strict=True)
        pipe.vq.to(pipe.device)
        pipe.vq.eval()

        # 2) GPT
        pipe.image_size = int(gpt_config["image_size"])
        vocab_size = int(gpt_config["vocab_size"])
        latent_size = pipe.image_size // pipe.downsample
        block_size = latent_size ** 2
        num_classes = int(gpt_config.get("num_classes", 1000))
        cls_token_num = int(gpt_config.get("cls_token_num", 120))
        model_type = gpt_config.get("model_type", "t2i")
        adapter_size = gpt_config.get("adapter_size", "small")
        condition_type = gpt_config.get("condition_type", "sketch")

        pipe.gpt = GPT_models[gpt_config.get("gpt_name", "GPT-XL")](
            vocab_size=vocab_size,
            block_size=block_size,
            num_classes=num_classes,
            cls_token_num=cls_token_num,
            model_type=model_type,
            adapter_size=adapter_size,
            condition_type=condition_type
        ).to(device=pipe.device, dtype=pipe.dtype)
        gpt_state = load_file(f"{repo_or_path}/{gpt_weights}")
        pipe.gpt.load_state_dict(gpt_state, strict=False)
        pipe.gpt.eval()

        return pipe

    @torch.inference_mode()
    def __call__(self, prompt_emb, control_image, cfg_scale=4, cfg_interval=-1, temperature=1.0, top_k=2000, top_p=1.0):
        """
        prompt_emb: torch.Tensor [B, T_txt, D]
        control_image: np.ndarray or PIL.Image
        Returns: list of PIL.Image
        """
        # Preprocess the control image
        if isinstance(control_image, Image.Image):
            control_image = np.array(control_image.convert("RGB"))
        if isinstance(control_image, np.ndarray):
            # [H,W,C] uint8 -> [-1,1]
            control_image = torch.from_numpy(control_image).permute(2, 0, 1).unsqueeze(0).float()
            if control_image.max() > 1.0:
                control_image = control_image / 255.0
            control_image = 2.0 * (control_image - 0.5)
        control = control_image.to(self.device, dtype=self.dtype)
        # Text embedding
        c_indices = prompt_emb.to(self.device, dtype=self.dtype)
        # If an emb_mask is needed it can be built externally and passed in; for this minimal example it is None
        c_emb_masks = None

        Hq = self.image_size // self.downsample
        Wq = Hq
        seq_len = Hq * Wq
        # Sample the codebook index sequence (generate returns [B, n_q*Hq*Wq], or [B, seq_len] when generated per codebook)
        index_sample = generate(
            self.gpt, c_indices, seq_len, c_emb_masks,
            condition=control, cfg_scale=cfg_scale, cfg_interval=cfg_interval,
            temperature=temperature, top_k=top_k, top_p=top_p, sample_logits=True
        )
        # Rearrange to [B, n_q, Hq, Wq]
        if index_sample.dim() == 2 and index_sample.shape[1] == self.n_q * Hq * Wq:
            tokens = index_sample.view(index_sample.size(0), self.n_q, Hq, Wq).long()
        elif index_sample.dim() == 2 and index_sample.shape[1] == Hq * Wq:
            tokens = index_sample.view(index_sample.size(0), 1, Hq, Wq).long()
        else:
            # Try to infer n_q automatically
            n_q = max(1, index_sample.shape[1] // (Hq * Wq))
            tokens = index_sample[:, : n_q * Hq * Wq].view(index_sample.size(0), n_q, Hq, Wq).long()
        tokens = tokens.to(self.device)
        qzshape = [tokens.size(0), 8, Hq, Wq]
        samples = self.vq.decode_code(tokens, qzshape).detach().float().cpu()
        # [-1,1] -> [0,1]
        if samples.min() < -0.9:
            samples = (samples + 1.0) / 2.0
        samples = samples.clamp(0, 1)

        imgs = []
        arr = (samples * 255).to(torch.uint8).permute(0, 2, 3, 1).numpy()
        for i in range(arr.shape[0]):
            imgs.append(Image.fromarray(arr[i]))
        return imgs
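A hypothetical end-to-end call of the pipeline (a sketch, not part of the upload): the config dicts mirror what from_pretrained reads from configs/gpt_config.json and configs/vq_config.json, and prompt_emb is a placeholder for a caption embedding of shape [B, T_txt, caption_dim] produced by whatever text encoder was used at training time (not included in this upload); the exact repo path, config keys, and embedding source are assumptions.

import json
import torch
from PIL import Image
from CondRefAR.pipeline import CondRefARPipeline

# Assumed usage; the config keys follow the ones read in from_pretrained.
gpt_config = json.load(open("configs/gpt_config.json"))
vq_config = json.load(open("configs/vq_config.json"))

pipe = CondRefARPipeline.from_pretrained(".", gpt_config, vq_config)

# Placeholder caption embedding [B, T_txt, caption_dim]; in practice this comes from
# the external text encoder, which this sketch does not reproduce.
prompt_emb = torch.randn(1, gpt_config.get("cls_token_num", 120), gpt_config.get("caption_dim", 2048))
control = Image.open("assets/examples/example1.jpg")

images = pipe(prompt_emb, control, cfg_scale=4)
images[0].save("sample.png")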
CondRefAR/tokenizer/vq_model.py ADDED
@@ -0,0 +1,425 @@
# Modified from:
#   taming-transformers: https://github.com/CompVis/taming-transformers
#   maskgit: https://github.com/google-research/maskgit
from dataclasses import dataclass, field
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F


@dataclass
class ModelArgs:
    codebook_size: int = 16384
    codebook_embed_dim: int = 8
    codebook_l2_norm: bool = True
    codebook_show_usage: bool = True
    commit_loss_beta: float = 0.25
    entropy_loss_ratio: float = 0.0

    encoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
    decoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
    z_channels: int = 256
    dropout_p: float = 0.0


class VQModel(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        self.config = config
        self.encoder = Encoder(ch_mult=config.encoder_ch_mult, z_channels=config.z_channels, dropout=config.dropout_p)
        self.decoder = Decoder(ch_mult=config.decoder_ch_mult, z_channels=config.z_channels, dropout=config.dropout_p)

        self.quantize = VectorQuantizer(config.codebook_size, config.codebook_embed_dim,
                                        config.commit_loss_beta, config.entropy_loss_ratio,
                                        config.codebook_l2_norm, config.codebook_show_usage)
        self.quant_conv = nn.Conv2d(config.z_channels, config.codebook_embed_dim, 1)
        self.post_quant_conv = nn.Conv2d(config.codebook_embed_dim, config.z_channels, 1)

    def encode(self, x):
        h = self.encoder(x)
        h = self.quant_conv(h)
        quant, emb_loss, info = self.quantize(h)
        return quant, emb_loss, info

    def decode(self, quant):
        quant = self.post_quant_conv(quant)
        dec = self.decoder(quant)
        return dec

    def decode_code(self, code_b, shape=None, channel_first=True):
        quant_b = self.quantize.get_codebook_entry(code_b, shape, channel_first)
        dec = self.decode(quant_b)
        return dec

    def forward(self, input):
        quant, diff, _ = self.encode(input)
        dec = self.decode(quant)
        return dec, diff


class Encoder(nn.Module):
    def __init__(self, in_channels=3, ch=128, ch_mult=(1, 1, 2, 2, 4), num_res_blocks=2,
                 norm_type='group', dropout=0.0, resamp_with_conv=True, z_channels=256):
        super().__init__()
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.conv_in = nn.Conv2d(in_channels, ch, kernel_size=3, stride=1, padding=1)

        # downsampling
        in_ch_mult = (1,) + tuple(ch_mult)
        self.conv_blocks = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            conv_block = nn.Module()
            # res & attn
            res_block = nn.ModuleList()
            attn_block = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks):
                res_block.append(ResnetBlock(block_in, block_out, dropout=dropout, norm_type=norm_type))
                block_in = block_out
            if i_level == self.num_resolutions - 1:
                attn_block.append(AttnBlock(block_in, norm_type))
            conv_block.res = res_block
            conv_block.attn = attn_block
|
| 90 |
+
# downsample
|
| 91 |
+
if i_level != self.num_resolutions-1:
|
| 92 |
+
conv_block.downsample = Downsample(block_in, resamp_with_conv)
|
| 93 |
+
self.conv_blocks.append(conv_block)
|
| 94 |
+
|
| 95 |
+
# middle
|
| 96 |
+
self.mid = nn.ModuleList()
|
| 97 |
+
self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))
|
| 98 |
+
self.mid.append(AttnBlock(block_in, norm_type=norm_type))
|
| 99 |
+
self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))
|
| 100 |
+
|
| 101 |
+
# end
|
| 102 |
+
self.norm_out = Normalize(block_in, norm_type)
|
| 103 |
+
self.conv_out = nn.Conv2d(block_in, z_channels, kernel_size=3, stride=1, padding=1)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def forward(self, x):
|
| 107 |
+
h = self.conv_in(x)
|
| 108 |
+
# downsampling
|
| 109 |
+
for i_level, block in enumerate(self.conv_blocks):
|
| 110 |
+
for i_block in range(self.num_res_blocks):
|
| 111 |
+
h = block.res[i_block](h)
|
| 112 |
+
if len(block.attn) > 0:
|
| 113 |
+
h = block.attn[i_block](h)
|
| 114 |
+
if i_level != self.num_resolutions - 1:
|
| 115 |
+
h = block.downsample(h)
|
| 116 |
+
|
| 117 |
+
# middle
|
| 118 |
+
for mid_block in self.mid:
|
| 119 |
+
h = mid_block(h)
|
| 120 |
+
|
| 121 |
+
# end
|
| 122 |
+
h = self.norm_out(h)
|
| 123 |
+
h = nonlinearity(h)
|
| 124 |
+
h = self.conv_out(h)
|
| 125 |
+
return h
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class Decoder(nn.Module):
|
| 130 |
+
def __init__(self, z_channels=256, ch=128, ch_mult=(1,1,2,2,4), num_res_blocks=2, norm_type="group",
|
| 131 |
+
dropout=0.0, resamp_with_conv=True, out_channels=3):
|
| 132 |
+
super().__init__()
|
| 133 |
+
self.num_resolutions = len(ch_mult)
|
| 134 |
+
self.num_res_blocks = num_res_blocks
|
| 135 |
+
|
| 136 |
+
block_in = ch*ch_mult[self.num_resolutions-1]
|
| 137 |
+
# z to block_in
|
| 138 |
+
self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
|
| 139 |
+
|
| 140 |
+
# middle
|
| 141 |
+
self.mid = nn.ModuleList()
|
| 142 |
+
self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))
|
| 143 |
+
self.mid.append(AttnBlock(block_in, norm_type=norm_type))
|
| 144 |
+
self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))
|
| 145 |
+
|
| 146 |
+
# upsampling
|
| 147 |
+
self.conv_blocks = nn.ModuleList()
|
| 148 |
+
for i_level in reversed(range(self.num_resolutions)):
|
| 149 |
+
conv_block = nn.Module()
|
| 150 |
+
# res & attn
|
| 151 |
+
res_block = nn.ModuleList()
|
| 152 |
+
attn_block = nn.ModuleList()
|
| 153 |
+
block_out = ch*ch_mult[i_level]
|
| 154 |
+
for _ in range(self.num_res_blocks + 1):
|
| 155 |
+
res_block.append(ResnetBlock(block_in, block_out, dropout=dropout, norm_type=norm_type))
|
| 156 |
+
block_in = block_out
|
| 157 |
+
if i_level == self.num_resolutions - 1:
|
| 158 |
+
attn_block.append(AttnBlock(block_in, norm_type))
|
| 159 |
+
conv_block.res = res_block
|
| 160 |
+
conv_block.attn = attn_block
|
| 161 |
+
# upsample
|
| 162 |
+
if i_level != 0:
|
| 163 |
+
conv_block.upsample = Upsample(block_in, resamp_with_conv)
|
| 164 |
+
self.conv_blocks.append(conv_block)
|
| 165 |
+
|
| 166 |
+
# end
|
| 167 |
+
self.norm_out = Normalize(block_in, norm_type)
|
| 168 |
+
self.conv_out = nn.Conv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1)
|
| 169 |
+
|
| 170 |
+
@property
|
| 171 |
+
def last_layer(self):
|
| 172 |
+
return self.conv_out.weight
|
| 173 |
+
|
| 174 |
+
def forward(self, z):
|
| 175 |
+
# z to block_in
|
| 176 |
+
h = self.conv_in(z)
|
| 177 |
+
|
| 178 |
+
# middle
|
| 179 |
+
for mid_block in self.mid:
|
| 180 |
+
h = mid_block(h)
|
| 181 |
+
|
| 182 |
+
# upsampling
|
| 183 |
+
for i_level, block in enumerate(self.conv_blocks):
|
| 184 |
+
for i_block in range(self.num_res_blocks + 1):
|
| 185 |
+
h = block.res[i_block](h)
|
| 186 |
+
if len(block.attn) > 0:
|
| 187 |
+
h = block.attn[i_block](h)
|
| 188 |
+
if i_level != self.num_resolutions - 1:
|
| 189 |
+
h = block.upsample(h)
|
| 190 |
+
|
| 191 |
+
# end
|
| 192 |
+
h = self.norm_out(h)
|
| 193 |
+
h = nonlinearity(h)
|
| 194 |
+
h = self.conv_out(h)
|
| 195 |
+
return h
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
class VectorQuantizer(nn.Module):
|
| 199 |
+
def __init__(self, n_e, e_dim, beta, entropy_loss_ratio, l2_norm, show_usage):
|
| 200 |
+
super().__init__()
|
| 201 |
+
self.n_e = n_e
|
| 202 |
+
self.e_dim = e_dim
|
| 203 |
+
self.beta = beta
|
| 204 |
+
self.entropy_loss_ratio = entropy_loss_ratio
|
| 205 |
+
self.l2_norm = l2_norm
|
| 206 |
+
self.show_usage = show_usage
|
| 207 |
+
|
| 208 |
+
self.embedding = nn.Embedding(self.n_e, self.e_dim)
|
| 209 |
+
self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
|
| 210 |
+
if self.l2_norm:
|
| 211 |
+
self.embedding.weight.data = F.normalize(self.embedding.weight.data, p=2, dim=-1)
|
| 212 |
+
if self.show_usage:
|
| 213 |
+
self.register_buffer("codebook_used", nn.Parameter(torch.zeros(65536)))
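# rolling window of the most recently selected code indices; only used to report codebook usage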
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def forward(self, z):
|
| 217 |
+
# reshape z -> (batch, height, width, channel) and flatten
|
| 218 |
+
z = torch.einsum('b c h w -> b h w c', z).contiguous()
|
| 219 |
+
z_flattened = z.view(-1, self.e_dim)
|
| 220 |
+
# distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
|
| 221 |
+
|
| 222 |
+
if self.l2_norm:
|
| 223 |
+
z = F.normalize(z, p=2, dim=-1)
|
| 224 |
+
z_flattened = F.normalize(z_flattened, p=2, dim=-1)
|
| 225 |
+
embedding = F.normalize(self.embedding.weight, p=2, dim=-1)
|
| 226 |
+
else:
|
| 227 |
+
embedding = self.embedding.weight
|
| 228 |
+
|
| 229 |
+
d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
|
| 230 |
+
torch.sum(embedding**2, dim=1) - 2 * \
|
| 231 |
+
torch.einsum('bd,dn->bn', z_flattened, torch.einsum('n d -> d n', embedding))
|
| 232 |
+
|
| 233 |
+
min_encoding_indices = torch.argmin(d, dim=1)
|
| 234 |
+
z_q = embedding[min_encoding_indices].view(z.shape)
|
| 235 |
+
perplexity = None
|
| 236 |
+
min_encodings = None
|
| 237 |
+
vq_loss = None
|
| 238 |
+
commit_loss = None
|
| 239 |
+
entropy_loss = None
|
| 240 |
+
codebook_usage = 0
|
| 241 |
+
|
| 242 |
+
if self.show_usage and self.training:
|
| 243 |
+
cur_len = min_encoding_indices.shape[0]
|
| 244 |
+
self.codebook_used[:-cur_len] = self.codebook_used[cur_len:].clone()
|
| 245 |
+
self.codebook_used[-cur_len:] = min_encoding_indices
|
| 246 |
+
codebook_usage = len(torch.unique(self.codebook_used)) / self.n_e
|
| 247 |
+
|
| 248 |
+
# compute loss for embedding
|
| 249 |
+
if self.training:
|
| 250 |
+
vq_loss = torch.mean((z_q - z.detach()) ** 2)
|
| 251 |
+
commit_loss = self.beta * torch.mean((z_q.detach() - z) ** 2)
|
| 252 |
+
entropy_loss = self.entropy_loss_ratio * compute_entropy_loss(-d)
|
| 253 |
+
|
| 254 |
+
# preserve gradients
|
| 255 |
+
z_q = z + (z_q - z).detach()
|
| 256 |
+
|
| 257 |
+
# reshape back to match original input shape
|
| 258 |
+
z_q = torch.einsum('b h w c -> b c h w', z_q)
|
| 259 |
+
|
| 260 |
+
return z_q, (vq_loss, commit_loss, entropy_loss, codebook_usage), (perplexity, min_encodings, min_encoding_indices)
|
| 261 |
+
|
| 262 |
+
def get_codebook_entry(self, indices, shape=None, channel_first=True):
|
| 263 |
+
# shape = (batch, channel, height, width) if channel_first else (batch, height, width, channel)
|
| 264 |
+
if self.l2_norm:
|
| 265 |
+
embedding = F.normalize(self.embedding.weight, p=2, dim=-1)
|
| 266 |
+
else:
|
| 267 |
+
embedding = self.embedding.weight
|
| 268 |
+
z_q = embedding[indices] # (b*h*w, c)
|
| 269 |
+
|
| 270 |
+
if shape is not None:
|
| 271 |
+
if channel_first:
|
| 272 |
+
z_q = z_q.reshape(shape[0], shape[2], shape[3], shape[1])
|
| 273 |
+
# reshape back to match original input shape
|
| 274 |
+
z_q = z_q.permute(0, 3, 1, 2).contiguous()
|
| 275 |
+
else:
|
| 276 |
+
z_q = z_q.view(shape)
|
| 277 |
+
return z_q
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
class ResnetBlock(nn.Module):
|
| 281 |
+
def __init__(self, in_channels, out_channels=None, conv_shortcut=False, dropout=0.0, norm_type='group'):
|
| 282 |
+
super().__init__()
|
| 283 |
+
self.in_channels = in_channels
|
| 284 |
+
out_channels = in_channels if out_channels is None else out_channels
|
| 285 |
+
self.out_channels = out_channels
|
| 286 |
+
self.use_conv_shortcut = conv_shortcut
|
| 287 |
+
|
| 288 |
+
self.norm1 = Normalize(in_channels, norm_type)
|
| 289 |
+
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
|
| 290 |
+
self.norm2 = Normalize(out_channels, norm_type)
|
| 291 |
+
self.dropout = nn.Dropout(dropout)
|
| 292 |
+
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
|
| 293 |
+
|
| 294 |
+
if self.in_channels != self.out_channels:
|
| 295 |
+
if self.use_conv_shortcut:
|
| 296 |
+
self.conv_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
|
| 297 |
+
else:
|
| 298 |
+
self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
|
| 299 |
+
|
| 300 |
+
def forward(self, x):
|
| 301 |
+
h = x
|
| 302 |
+
h = self.norm1(h)
|
| 303 |
+
h = nonlinearity(h)
|
| 304 |
+
h = self.conv1(h)
|
| 305 |
+
h = self.norm2(h)
|
| 306 |
+
h = nonlinearity(h)
|
| 307 |
+
h = self.dropout(h)
|
| 308 |
+
h = self.conv2(h)
|
| 309 |
+
|
| 310 |
+
if self.in_channels != self.out_channels:
|
| 311 |
+
if self.use_conv_shortcut:
|
| 312 |
+
x = self.conv_shortcut(x)
|
| 313 |
+
else:
|
| 314 |
+
x = self.nin_shortcut(x)
|
| 315 |
+
return x+h
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
class AttnBlock(nn.Module):
|
| 319 |
+
def __init__(self, in_channels, norm_type='group'):
|
| 320 |
+
super().__init__()
|
| 321 |
+
self.norm = Normalize(in_channels, norm_type)
|
| 322 |
+
self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
|
| 323 |
+
self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
|
| 324 |
+
self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
|
| 325 |
+
self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def forward(self, x):
|
| 329 |
+
h_ = x
|
| 330 |
+
h_ = self.norm(h_)
|
| 331 |
+
q = self.q(h_)
|
| 332 |
+
k = self.k(h_)
|
| 333 |
+
v = self.v(h_)
|
| 334 |
+
|
| 335 |
+
# compute attention
|
| 336 |
+
b,c,h,w = q.shape
|
| 337 |
+
q = q.reshape(b,c,h*w)
|
| 338 |
+
q = q.permute(0,2,1) # b,hw,c
|
| 339 |
+
k = k.reshape(b,c,h*w) # b,c,hw
|
| 340 |
+
w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
|
| 341 |
+
w_ = w_ * (int(c)**(-0.5))
|
| 342 |
+
w_ = F.softmax(w_, dim=2)
|
| 343 |
+
|
| 344 |
+
# attend to values
|
| 345 |
+
v = v.reshape(b,c,h*w)
|
| 346 |
+
w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q)
|
| 347 |
+
h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
|
| 348 |
+
h_ = h_.reshape(b,c,h,w)
|
| 349 |
+
|
| 350 |
+
h_ = self.proj_out(h_)
|
| 351 |
+
|
| 352 |
+
return x+h_
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def nonlinearity(x):
|
| 356 |
+
# swish
|
| 357 |
+
return x*torch.sigmoid(x)
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
def Normalize(in_channels, norm_type='group'):
|
| 361 |
+
assert norm_type in ['group', 'batch']
|
| 362 |
+
if norm_type == 'group':
|
| 363 |
+
return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
|
| 364 |
+
elif norm_type == 'batch':
|
| 365 |
+
return nn.SyncBatchNorm(in_channels)
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
class Upsample(nn.Module):
|
| 369 |
+
def __init__(self, in_channels, with_conv):
|
| 370 |
+
super().__init__()
|
| 371 |
+
self.with_conv = with_conv
|
| 372 |
+
if self.with_conv:
|
| 373 |
+
self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
|
| 374 |
+
|
| 375 |
+
def forward(self, x):
|
| 376 |
+
x = F.interpolate(x, scale_factor=2.0, mode="nearest")
|
| 377 |
+
if self.with_conv:
|
| 378 |
+
x = self.conv(x)
|
| 379 |
+
return x
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
class Downsample(nn.Module):
|
| 383 |
+
def __init__(self, in_channels, with_conv):
|
| 384 |
+
super().__init__()
|
| 385 |
+
self.with_conv = with_conv
|
| 386 |
+
if self.with_conv:
|
| 387 |
+
# no asymmetric padding in torch conv, must do it ourselves
|
| 388 |
+
self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
|
| 389 |
+
|
| 390 |
+
def forward(self, x):
|
| 391 |
+
if self.with_conv:
|
| 392 |
+
pad = (0,1,0,1)
|
| 393 |
+
x = F.pad(x, pad, mode="constant", value=0)
|
| 394 |
+
x = self.conv(x)
|
| 395 |
+
else:
|
| 396 |
+
x = F.avg_pool2d(x, kernel_size=2, stride=2)
|
| 397 |
+
return x
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
def compute_entropy_loss(affinity, loss_type="softmax", temperature=0.01):
|
| 401 |
+
flat_affinity = affinity.reshape(-1, affinity.shape[-1])
|
| 402 |
+
flat_affinity /= temperature
|
| 403 |
+
probs = F.softmax(flat_affinity, dim=-1)
|
| 404 |
+
log_probs = F.log_softmax(flat_affinity + 1e-5, dim=-1)
|
| 405 |
+
if loss_type == "softmax":
|
| 406 |
+
target_probs = probs
|
| 407 |
+
else:
|
| 408 |
+
raise ValueError("Entropy loss {} not supported".format(loss_type))
|
| 409 |
+
avg_probs = torch.mean(target_probs, dim=0)
|
| 410 |
+
avg_entropy = - torch.sum(avg_probs * torch.log(avg_probs + 1e-5))
|
| 411 |
+
sample_entropy = - torch.mean(torch.sum(target_probs * log_probs, dim=-1))
|
| 412 |
+
loss = sample_entropy - avg_entropy
|
| 413 |
+
return loss
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
#################################################################################
|
| 417 |
+
# VQ Model Configs #
|
| 418 |
+
#################################################################################
|
| 419 |
+
def VQ_8(**kwargs):
|
| 420 |
+
return VQModel(ModelArgs(encoder_ch_mult=[1, 2, 2, 4], decoder_ch_mult=[1, 2, 2, 4], **kwargs))
|
| 421 |
+
|
| 422 |
+
def VQ_16(**kwargs):
|
| 423 |
+
return VQModel(ModelArgs(encoder_ch_mult=[1, 1, 2, 2, 4], decoder_ch_mult=[1, 1, 2, 2, 4], **kwargs))
|
| 424 |
+
|
| 425 |
+
VQ_models = {'VQ-16': VQ_16, 'VQ-8': VQ_8}
|
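A minimal round-trip sketch of the tokenizer defined above, assuming the default `ModelArgs` values (which match `configs/vq_config.json`); the randomly initialised model is only used to show the expected tensor shapes, no pretrained weights are loaded here.

```python
import torch
from CondRefAR.tokenizer.vq_model import VQ_16

# 16x-downsampling tokenizer with the default 16384-entry, 8-dim codebook.
vq = VQ_16(codebook_size=16384, codebook_embed_dim=8).eval()

x = torch.randn(1, 3, 512, 512)  # dummy RGB image in roughly [-1, 1]
with torch.no_grad():
    quant, _, (_, _, indices) = vq.encode(x)    # quant: [1, 8, 32, 32], indices: [1024]
    recon_from_latents = vq.decode(quant)       # [1, 3, 512, 512]
    recon_from_codes = vq.decode_code(indices, shape=[1, 8, 32, 32])
print(recon_from_latents.shape, recon_from_codes.shape)
```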
CondRefAR/utils/drop_path.py
ADDED
|
@@ -0,0 +1,36 @@
|
| 1 |
+
# from timm.models.layers import DropPath
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
|
| 5 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
| 6 |
+
|
| 7 |
+
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
|
| 8 |
+
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
|
| 9 |
+
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
|
| 10 |
+
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
|
| 11 |
+
'survival rate' as the argument.
|
| 12 |
+
|
| 13 |
+
"""
|
| 14 |
+
if drop_prob == 0. or not training:
|
| 15 |
+
return x
|
| 16 |
+
keep_prob = 1 - drop_prob
|
| 17 |
+
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
|
| 18 |
+
random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
|
| 19 |
+
if keep_prob > 0.0 and scale_by_keep:
|
| 20 |
+
random_tensor.div_(keep_prob)
|
| 21 |
+
return x * random_tensor
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class DropPath(torch.nn.Module):
|
| 25 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
| 26 |
+
"""
|
| 27 |
+
def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
|
| 28 |
+
super(DropPath, self).__init__()
|
| 29 |
+
self.drop_prob = drop_prob
|
| 30 |
+
self.scale_by_keep = scale_by_keep
|
| 31 |
+
|
| 32 |
+
def forward(self, x):
|
| 33 |
+
return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
|
| 34 |
+
|
| 35 |
+
def extra_repr(self):
|
| 36 |
+
return f'drop_prob={round(self.drop_prob,3):0.3f}'
|
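A minimal sketch of how `DropPath` is typically applied: the skip connection is kept while the main branch is zeroed per sample during training. The residual block below is illustrative and not taken from this repository.

```python
import torch
import torch.nn as nn
from CondRefAR.utils.drop_path import DropPath

class ResidualMLP(nn.Module):
    # Illustrative block: the identity path stays, the MLP path is stochastically dropped.
    def __init__(self, dim: int, drop_prob: float = 0.1):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
        self.drop_path = DropPath(drop_prob)

    def forward(self, x):
        return x + self.drop_path(self.mlp(x))

block = ResidualMLP(64).train()      # drop_path is a no-op in eval mode
y = block(torch.randn(8, 16, 64))    # each of the 8 samples is dropped with prob 0.1
print(y.shape)
```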
README.md
CHANGED
|
@@ -1,3 +1,78 @@
|
|
| 1 |
---
|
| 2 |
license: apache-2.0
|
| 3 |
---
|
| 1 |
---
|
| 2 |
+
library_name: pytorch
|
| 3 |
+
tags:
|
| 4 |
+
- autoregressive
|
| 5 |
+
- image-generation
|
| 6 |
+
- aerial
|
| 7 |
+
- controllable-generation
|
| 8 |
license: apache-2.0
|
| 9 |
+
pipeline_tag: image-to-image
|
| 10 |
---
|
| 11 |
+
|
| 12 |
+
# CondRef-AR: Condition-as-a-Reference Randomized Autoregressive Modelling for Controllable Aerial Image Generation
|
| 13 |
+
|
| 14 |
+
This repository contains the code and pretrained models for **CondRef-AR**, a controllable aerial image generation model using condition-as-a-reference randomized autoregressive modeling. The model generates high-quality aerial images based on input conditions such as sketches or segmentation maps.
|
| 15 |
+
|
| 16 |
+

|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Quickstart
|
| 20 |
+
|
| 21 |
+
```python
|
| 22 |
+
import json, torch
|
| 23 |
+
from CondRefAR.pipeline import CondRefARPipeline
|
| 24 |
+
from transformers import AutoTokenizer, T5EncoderModel
|
| 25 |
+
|
| 26 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 27 |
+
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
| 28 |
+
|
| 29 |
+
gpt_cfg = json.load(open("configs/gpt_config.json"))
|
| 30 |
+
vq_cfg = json.load(open("configs/vq_config.json"))
|
| 31 |
+
pipe = CondRefARPipeline.from_pretrained(".", gpt_cfg, vq_cfg, device=device, torch_dtype=dtype)
|
| 32 |
+
|
| 33 |
+
tok = AutoTokenizer.from_pretrained("google/flan-t5-xl")
|
| 34 |
+
enc = T5EncoderModel.from_pretrained("google/flan-t5-xl", torch_dtype=dtype).to(device).eval()
|
| 35 |
+
|
| 36 |
+
prompt = "Aaerial view of a forested area with a river running through it. On the right side of the image, there is a small town or village with a red-roofed building."
|
| 37 |
+
control = "assets/examples/example2.jpg"
|
| 38 |
+
|
| 39 |
+
from PIL import Image
|
| 40 |
+
control_img = Image.open(control).convert("RGB")
|
| 41 |
+
|
| 42 |
+
inputs = tok([prompt], return_tensors="pt", padding="max_length", truncation=True, max_length=120)
|
| 43 |
+
with torch.no_grad():
|
| 44 |
+
emb = enc(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device)).last_hidden_state
|
| 45 |
+
|
| 46 |
+
imgs = pipe(emb, control_img, cfg_scale=4, temperature=1.0, top_k=2000, top_p=1.0)
|
| 47 |
+
imgs[0].save("sample.png")
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## Sample Results
|
| 51 |
+
By varying the input conditions and prompts, CondRef-AR can generate diverse aerial images:
|
| 52 |
+

|
| 53 |
+
|
| 54 |
+
CondRef-AR can generate continuous, plausible, and high-resolution sequences of land-use change images from a series of temporal semantic condition maps. As shown below, the model simulates the entire process, from a pristine forest gradually transforming into a modern residential urban area:
|
| 55 |
+
|
| 56 |
+

|
| 57 |
+
<div align="center">
|
| 58 |
+
|
| 59 |
+
| Control image | Aerial image |
|
| 60 |
+
|---|---|
|
| 61 |
+
| <img src="assets/control_img.gif" alt="control animation" width="100%"/> | <img src="assets/aerial_img.gif" alt="aerial animation" width="100%"/> |
|
| 62 |
+
|
| 63 |
+
</div>
|
| 64 |
+
|
| 65 |
+
## Files
|
| 66 |
+
- `weights/sketch-gpt-xl.safetensors`, `weights/vq-16.safetensors`: pretrained weights.
|
| 67 |
+
- `configs/*.json`: model hyperparameters.
|
| 68 |
+
- `CondRefAR/*`: inference code and pipeline.
|
| 69 |
+
- `assets/examples`: example images.
|
| 70 |
+
- `app.py`: Gradio demo.
|
| 71 |
+
|
| 72 |
+
## Notes
|
| 73 |
+
- Requires a GPU with bfloat16 support for best speed; CPU inference works but is slow.
|
| 74 |
+
- Sampling parameters: `cfg_scale`, `temperature`, `top_k`, and `top_p` trade off adherence to the prompt and condition against sample diversity (a small parameter sweep is sketched below).
|
| 75 |
+
- If you have any questions, please open an issue, or contact [email protected].
|
| 76 |
+
|
| 77 |
+
## License
|
| 78 |
+
Apache-2.0.
|
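A small, illustrative sweep over the sampling parameters mentioned in the Notes; it reuses `pipe`, `emb`, and `control_img` exactly as built in the Quickstart, and the specific values are examples rather than recommended settings.

```python
# Higher cfg_scale follows the prompt/condition more closely; smaller top_k samples
# more conservatively. Reuses `pipe`, `emb`, `control_img` from the Quickstart above.
for cfg_scale in (2.0, 4.0, 6.0):
    for top_k in (1000, 2000):
        imgs = pipe(emb, control_img, cfg_scale=cfg_scale, temperature=1.0,
                    top_k=top_k, top_p=1.0)
        imgs[0].save(f"sample_cfg{cfg_scale}_k{top_k}.png")
```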
app.py
ADDED
|
@@ -0,0 +1,100 @@
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import torch
|
| 3 |
+
import json
|
| 4 |
+
from CondRefAR.pipeline import CondRefARPipeline
|
| 5 |
+
from transformers import AutoTokenizer, T5EncoderModel
|
| 6 |
+
|
| 7 |
+
# For simplicity, text embeddings are extracted directly with transformers' flan-t5-xl
|
| 8 |
+
def build_t5(device, dtype):
|
| 9 |
+
tok = AutoTokenizer.from_pretrained("google/flan-t5-xl")
|
| 10 |
+
enc = T5EncoderModel.from_pretrained("google/flan-t5-xl", torch_dtype=dtype)
|
| 11 |
+
enc = enc.to(device)
|
| 12 |
+
enc.eval()
|
| 13 |
+
return tok, enc
|
| 14 |
+
|
| 15 |
+
def text_to_emb(prompt, tok, enc, device, dtype):
|
| 16 |
+
inputs = tok([prompt], return_tensors="pt", padding='max_length', truncation=True, return_attention_mask=True, add_special_tokens=True, max_length=120)
|
| 17 |
+
with torch.no_grad():
|
| 18 |
+
out = enc(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device))
|
| 19 |
+
emb = out['last_hidden_state'].detach() # [B, T, D]
|
| 20 |
+
return emb.to(dtype)
|
| 21 |
+
|
| 22 |
+
def build_pipeline():
|
| 23 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 24 |
+
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
| 25 |
+
with open("configs/gpt_config.json","r") as f:
|
| 26 |
+
gpt_cfg = json.load(f)
|
| 27 |
+
with open("configs/vq_config.json","r") as f:
|
| 28 |
+
vq_cfg = json.load(f)
|
| 29 |
+
pipe = CondRefARPipeline.from_pretrained(".", gpt_cfg, vq_cfg, device=device, torch_dtype=dtype)
|
| 30 |
+
tok, enc = build_t5(device, dtype)
|
| 31 |
+
return pipe, tok, enc
|
| 32 |
+
|
| 33 |
+
pipe, tok, enc = build_pipeline()
|
| 34 |
+
|
| 35 |
+
def infer(prompt, control_image, cfg_scale, temperature, top_k, top_p):
|
| 36 |
+
emb = text_to_emb(prompt, tok, enc, pipe.device, pipe.dtype)
|
| 37 |
+
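# gr.ImageEditor(type="numpy") returns a dict; use the composited canvas and drop any alpha channel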
imgs = pipe(emb, control_image['composite'][:, :, :3], cfg_scale=cfg_scale, temperature=temperature, top_k=top_k, top_p=top_p)
|
| 38 |
+
return imgs[0]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
EXAMPLES = [
|
| 42 |
+
[
|
| 43 |
+
"Aerial view of a large industrial area with multiple buildings and roads. There are several roads and highways visible in the image, and there are several parking lots scattered throughout the area.",
|
| 44 |
+
"assets/examples/example1.jpg",
|
| 45 |
+
4.0, 1.0, 2000, 1.0,
|
| 46 |
+
],
|
| 47 |
+
[
|
| 48 |
+
"Aaerial view of a forested area with a river running through it. On the right side of the image, there is a small town or village with a red-roofed building. ",
|
| 49 |
+
"assets/examples/example2.jpg",
|
| 50 |
+
5.0, 0.95, 2500, 0.95,
|
| 51 |
+
],
|
| 52 |
+
]
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
with gr.Blocks(title="CondRef-AR", theme=gr.themes.Soft()) as demo:
|
| 56 |
+
gr.Markdown("## CondRef-AR: Controllable Aerial Image Generation")
|
| 57 |
+
|
| 58 |
+
with gr.Row(equal_height=True):
|
| 59 |
+
# Left column: inputs
|
| 60 |
+
with gr.Column(scale=3):
|
| 61 |
+
prompt = gr.Textbox(label="Prompt", lines=2, placeholder="Describe the city...")
|
| 62 |
+
editor = gr.ImageEditor(
|
| 63 |
+
type="numpy", crop_size="1:1", canvas_size=(512, 512),
|
| 64 |
+
label="Image"
|
| 65 |
+
)
|
| 66 |
+
with gr.Row():
|
| 67 |
+
btn_gen = gr.Button("Generate", variant="primary")
|
| 68 |
+
btn_clear = gr.Button("Clear")
|
| 69 |
+
|
| 70 |
+
# Right column: parameters + output + examples
|
| 71 |
+
with gr.Column(scale=2):
|
| 72 |
+
with gr.Accordion("Advanced settings", open=False):
|
| 73 |
+
cfg_scale = gr.Slider(1, 8, value=4, step=0.5, label="CFG scale")
|
| 74 |
+
temperature = gr.Slider(0.5, 1.5, value=1.0, step=0.05, label="Temperature")
|
| 75 |
+
top_k = gr.Slider(50, 4000, value=2000, step=50, label="top_k")
|
| 76 |
+
top_p = gr.Slider(0.5, 1.0, value=1.0, step=0.01, label="top_p")
|
| 77 |
+
|
| 78 |
+
output = gr.Image(type="pil", label="Result", height=512)
|
| 79 |
+
|
| 80 |
+
# Clickable examples: selecting one auto-fills the inputs and runs inference
|
| 81 |
+
gr.Examples(
|
| 82 |
+
examples=EXAMPLES,
|
| 83 |
+
inputs=[prompt, editor, cfg_scale, temperature, top_k, top_p],
|
| 84 |
+
outputs=output,
|
| 85 |
+
fn=infer,
|
| 86 |
+
cache_examples=False,
|
| 87 |
+
examples_per_page=2,
|
| 88 |
+
label="Examples"
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Button events
|
| 92 |
+
btn_gen.click(
|
| 93 |
+
infer,
|
| 94 |
+
inputs=[prompt, editor, cfg_scale, temperature, top_k, top_p],
|
| 95 |
+
outputs=output
|
| 96 |
+
)
|
| 97 |
+
btn_clear.click(lambda: (None, None), outputs=[editor, output])
|
| 98 |
+
|
| 99 |
+
if __name__ == "__main__":
|
| 100 |
+
demo.launch()
|
assets/aerial_img.gif
ADDED
|
Git LFS Details
|
assets/control_img.gif
ADDED
|
Git LFS Details
|
assets/evolution.png
ADDED
|
Git LFS Details
|
assets/examples/example1.jpg
ADDED
|
assets/examples/example2.jpg
ADDED
|
assets/method.jpg
ADDED
|
Git LFS Details
|
assets/samples.png
ADDED
|
Git LFS Details
|
configs/gpt_config.json
ADDED
|
@@ -0,0 +1,12 @@
|
| 1 |
+
{
|
| 2 |
+
"gpt_name": "GPT-XL",
|
| 3 |
+
"image_size": 512,
|
| 4 |
+
"downsample_size": 16,
|
| 5 |
+
"vocab_size": 16384,
|
| 6 |
+
"num_classes": 1000,
|
| 7 |
+
"cls_token_num": 120,
|
| 8 |
+
"model_type": "t2i",
|
| 9 |
+
"adapter_size": "small",
|
| 10 |
+
"condition_type": "sketch",
|
| 11 |
+
"dtype": "bfloat16"
|
| 12 |
+
}
|
configs/vq_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
| 1 |
+
{
|
| 2 |
+
"model_name": "VQ-16",
|
| 3 |
+
"image_size": 512,
|
| 4 |
+
"downsample_size": 16,
|
| 5 |
+
"n_q": 8,
|
| 6 |
+
"codebook_size": 16384,
|
| 7 |
+
"codebook_embed_dim": 8,
|
| 8 |
+
"latent_channels": 8
|
| 9 |
+
}
|
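A hedged sketch of how this config might be mapped onto the tokenizer constructors exported by `CondRefAR/tokenizer/vq_model.py`; the actual wiring lives in `CondRefAR/pipeline.py` and may differ.

```python
import json
from CondRefAR.tokenizer.vq_model import VQ_models

vq_cfg = json.load(open("configs/vq_config.json"))
# "VQ-16" selects the 16x-downsampling variant; the codebook fields match ModelArgs.
vq = VQ_models[vq_cfg["model_name"]](
    codebook_size=vq_cfg["codebook_size"],
    codebook_embed_dim=vq_cfg["codebook_embed_dim"],
)
tokens_per_side = vq_cfg["image_size"] // vq_cfg["downsample_size"]  # 512 // 16 = 32
print(tokens_per_side)
```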
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
| 1 |
+
gradio
|
| 2 |
+
torch
|
| 3 |
+
transformers
|
| 4 |
+
safetensors
|
| 5 |
+
pillow
|
| 6 |
+
numpy
|
| 7 |
+
xformers
|
sample.py
ADDED
|
@@ -0,0 +1,26 @@
|
| 1 |
+
import json, torch
|
| 2 |
+
from CondRefAR.pipeline import CondRefARPipeline
|
| 3 |
+
from transformers import AutoTokenizer, T5EncoderModel
|
| 4 |
+
|
| 5 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 6 |
+
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
| 7 |
+
|
| 8 |
+
gpt_cfg = json.load(open("configs/gpt_config.json"))
|
| 9 |
+
vq_cfg = json.load(open("configs/vq_config.json"))
|
| 10 |
+
pipe = CondRefARPipeline.from_pretrained(".", gpt_cfg, vq_cfg, device=device, torch_dtype=dtype)
|
| 11 |
+
|
| 12 |
+
tok = AutoTokenizer.from_pretrained("google/flan-t5-xl")
|
| 13 |
+
enc = T5EncoderModel.from_pretrained("google/flan-t5-xl", torch_dtype=dtype).to(device).eval()
|
| 14 |
+
|
| 15 |
+
prompt = "Aaerial view of a forested area with a river running through it. On the right side of the image, there is a small town or village with a red-roofed building."
|
| 16 |
+
control = "assets/examples/example2.jpg"
|
| 17 |
+
|
| 18 |
+
from PIL import Image
|
| 19 |
+
control_img = Image.open(control).convert("RGB")
|
| 20 |
+
|
| 21 |
+
inputs = tok([prompt], return_tensors="pt", padding="max_length", truncation=True, max_length=120)
|
| 22 |
+
with torch.no_grad():
|
| 23 |
+
emb = enc(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device)).last_hidden_state
|
| 24 |
+
|
| 25 |
+
imgs = pipe(emb, control_img, cfg_scale=4, temperature=1.0, top_k=2000, top_p=1.0)
|
| 26 |
+
imgs[0].save("sample.png")
|
weights/sketch-gpt-xl.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:936bf74f9d71ae61ab183d0b6dc133362e2109f73105d3000d371fe6c2d52f3b
|
| 3 |
+
size 3350054432
|
weights/vq-16.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c98b63a8f1da5c325443a7372fdfeb0ca59037d55d31cfccc6b157041fb924e
|
| 3 |
+
size 287832820
|