New Update

- .gitattributes +1 -4
- README-2.md +0 -162
- README.md +8 -5
- app.py +3 -7
- apply_net.py +1 -1
- assets/teaser.png +0 -3
- assets/teaser2.png +0 -3
- environment.yaml +0 -25
- humanparsing/parsing_atr.onnx +0 -3
- humanparsing/parsing_lip.onnx +0 -3
- image_encoder/config.json +0 -23
- image_encoder/model.safetensors +0 -3
- inference.py +0 -425
- inference.sh +0 -34
- inference_dc.py +0 -578
- openpose/ckpts/body_pose_model.pth +0 -3
- requirements.txt +4 -4
- scheduler/scheduler_config.json +0 -19
- text_encoder/config.json +0 -25
- text_encoder/model.safetensors +0 -3
- text_encoder_2/config.json +0 -25
- text_encoder_2/model.safetensors +0 -3
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens_map.json +0 -24
- tokenizer/tokenizer_config.json +0 -33
- tokenizer/vocab.json +0 -0
- tokenizer_2/merges.txt +0 -0
- tokenizer_2/special_tokens_map.json +0 -24
- tokenizer_2/tokenizer_config.json +0 -33
- tokenizer_2/vocab.json +0 -0
- unet/config.json +0 -78
- unet/diffusion_pytorch_model.bin +0 -3
- unet_encoder/config.json +0 -68
- unet_encoder/diffusion_pytorch_model.safetensors +0 -3
- util/common.py +0 -8
- util/image.py +0 -37
- util/pipeline.py +0 -88
- utils_mask.py +1 -1
- vae/config.json +0 -32
- vae/diffusion_pytorch_model.safetensors +0 -3
- vitonhd_test_tagged.json +0 -0
.gitattributes
CHANGED

@@ -32,7 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-ckpt/** filter=lfs diff=lfs merge=lfs -text
-assets/teaser.png filter=lfs diff=lfs merge=lfs -text
-assets/teaser2.png filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README-2.md
DELETED

<div align="center">
<h1>IDM-VTON: Improving Diffusion Models for Authentic Virtual Try-on in the Wild</h1>

<a href='https://idm-vton.github.io'><img src='https://img.shields.io/badge/Project-Page-green'></a>
<a href='https://arxiv.org/abs/2403.05139'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
<a href='https://huggingface.co/spaces/yisol/IDM-VTON'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-blue'></a>
<a href='https://huggingface.co/yisol/IDM-VTON'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
</div>

This is the official implementation of the paper ["Improving Diffusion Models for Authentic Virtual Try-on in the Wild"](https://arxiv.org/abs/2403.05139).

Star ⭐ us if you like it!

---

<!--
 -->

## TODO LIST

- [x] demo model
- [x] inference code
- [ ] training code

## Requirements

```
git clone https://github.com/yisol/IDM-VTON.git
cd IDM-VTON

conda env create -f environment.yaml
conda activate idm
```

## Data preparation

### VITON-HD
You can download the VITON-HD dataset from [VITON-HD](https://github.com/shadow2496/VITON-HD).

After downloading the VITON-HD dataset, move vitonhd_test_tagged.json into the test folder.

The structure of the dataset directory should be as follows.

```
train
|-- ...

test
|-- image
|-- image-densepose
|-- agnostic-mask
|-- cloth
|-- vitonhd_test_tagged.json
```

### DressCode
You can download the DressCode dataset from [DressCode](https://github.com/aimagelab/dress-code).

We provide pre-computed densepose images and captions for garments [here](https://kaistackr-my.sharepoint.com/:u:/g/personal/cpis7_kaist_ac_kr/EaIPRG-aiRRIopz9i002FOwBDa-0-BHUKVZ7Ia5yAVVG3A?e=YxkAip).

We used [detectron2](https://github.com/facebookresearch/detectron2) for obtaining densepose images; refer [here](https://github.com/sangyun884/HR-VITON/issues/45) for more details.

After downloading the DressCode dataset, place the image-densepose directories and caption text files as follows.

```
DressCode
|-- dresses
    |-- images
    |-- image-densepose
    |-- dc_caption.txt
    |-- ...
|-- lower_body
    |-- images
    |-- image-densepose
    |-- dc_caption.txt
    |-- ...
|-- upper_body
    |-- images
    |-- image-densepose
    |-- dc_caption.txt
    |-- ...
```

## Inference

### VITON-HD

Run inference with the python file and arguments,

```
accelerate launch inference.py \
    --width 768 --height 1024 --num_inference_steps 30 \
    --output_dir "result" \
    --unpaired \
    --data_dir "DATA_DIR" \
    --seed 42 \
    --test_batch_size 2 \
    --guidance_scale 2.0
```

or you can simply run it with the script file.

```
sh inference.sh
```

### DressCode

For the DressCode dataset, pass the category you want to generate images for via the category argument,

```
accelerate launch inference_dc.py \
    --width 768 --height 1024 --num_inference_steps 30 \
    --output_dir "result" \
    --unpaired \
    --data_dir "DATA_DIR" \
    --seed 42 \
    --test_batch_size 2 \
    --guidance_scale 2.0 \
    --category "upper_body"
```

or you can simply run it with the script file.

```
sh inference.sh
```

## Acknowledgements

For the [demo](https://huggingface.co/spaces/yisol/IDM-VTON), GPUs are supported by [ZeroGPU](https://huggingface.co/zero-gpu-explorers), and the mask generation code is based on [OOTDiffusion](https://github.com/levihsu/OOTDiffusion) and [DCI-VTON](https://github.com/bcmi/DCI-VTON-Virtual-Try-On).

Parts of our code are based on [IP-Adapter](https://github.com/tencent-ailab/IP-Adapter).

## Citation

```
@article{choi2024improving,
  title={Improving Diffusion Models for Virtual Try-on},
  author={Choi, Yisol and Kwak, Sangkyung and Lee, Kyungmin and Choi, Hyungwon and Shin, Jinwoo},
  journal={arXiv preprint arXiv:2403.05139},
  year={2024}
}
```

## License
The codes and checkpoints in this repository are under the [CC BY-NC-SA 4.0 license](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
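For orientation, the VITON-HD test layout described in the deleted README above is the same one the (also deleted) inference.py dataset loader walks. A minimal sketch of that path logic follows; the helper name `viton_hd_test_paths` is hypothetical and the snippet is a simplification of the loader, not its exact code, but the folder names and the `.jpg` to `_mask.png` renaming match what appears later in this commit.

```python
import os

def viton_hd_test_paths(data_dir, im_name, c_name):
    """Sketch: where each test sample is looked up, given the layout above."""
    test = os.path.join(data_dir, "test")
    return {
        "person":    os.path.join(test, "image", im_name),
        "densepose": os.path.join(test, "image-densepose", im_name),
        "mask":      os.path.join(test, "agnostic-mask", im_name.replace(".jpg", "_mask.png")),
        "garment":   os.path.join(test, "cloth", c_name),
        "tags":      os.path.join(test, "vitonhd_test_tagged.json"),
    }
```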
README.md
CHANGED

@@ -1,11 +1,14 @@
 ---
-title: AILUSION VTON DEMO
-
-
+title: AILUSION VTON DEMO
+emoji: πππ
+colorFrom: yellow
+colorTo: red
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.24.0
 app_file: app.py
 pinned: false
+license: cc-by-nc-sa-4.0
+short_description: High-fidelity Virtual Try-on
 ---
 
-
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED

@@ -40,7 +40,7 @@ def pil_to_binary_mask(pil_image, threshold=0):
     return output_mask
 
 
-base_path = '
+base_path = 'yisol/IDM-VTON'
 example_path = os.path.join(os.path.dirname(__file__), 'example')
 
 unet = UNet2DConditionModel.from_pretrained(
@@ -88,8 +88,6 @@ UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
     base_path,
     subfolder="unet_encoder",
     torch_dtype=torch.float16,
-    load_in_8bit=True,
-    max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB'
 )
 
 parsing_model = Parsing(0)
@@ -122,9 +120,8 @@ pipe = TryonPipeline.from_pretrained(
     torch_dtype=torch.float16,
 )
 pipe.unet_encoder = UNet_Encoder
-pipe.to("cuda")
 
-@spaces.GPU
+@spaces.GPU
 def start_tryon(dict,garm_img,garment_des,is_checked,is_checked_crop,denoise_steps,seed):
     device = "cuda"
 
@@ -263,7 +260,7 @@ for ex_human in human_list_path:
 
 image_blocks = gr.Blocks().queue()
 with image_blocks as demo:
-    gr.Markdown("##
+    gr.Markdown("## IDM-VTON πππ")
     gr.Markdown("Virtual Try-on with your image and garment image. Check out the [source codes](https://github.com/yisol/IDM-VTON) and the [model](https://huggingface.co/yisol/IDM-VTON)")
     with gr.Row():
         with gr.Column():
@@ -313,4 +310,3 @@ with image_blocks as demo:
 
 
 image_blocks.launch()
-
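The net effect of the app.py change, as a rough sketch: weights are now resolved from the yisol/IDM-VTON Hub repo instead of a local path, the garment encoder is loaded in plain fp16 (no load_in_8bit / max_memory), and the module-level pipe.to("cuda") is gone, with device placement presumably handled inside the @spaces.GPU-decorated handler. The class names below follow the imports used elsewhere in this commit (src.unet_hacked_garmnet, src.tryon_pipeline); the simplified from_pretrained call is an illustration, not the exact file contents.

```python
import torch
from src.unet_hacked_garmnet import UNet2DConditionModel as UNet2DConditionModel_ref
from src.tryon_pipeline import StableDiffusionXLInpaintPipeline as TryonPipeline

base_path = 'yisol/IDM-VTON'  # now a Hub repo id rather than a local checkpoint path

# Garment encoder in plain fp16; the 8-bit loading arguments were removed.
unet_encoder = UNet2DConditionModel_ref.from_pretrained(
    base_path, subfolder="unet_encoder", torch_dtype=torch.float16,
)

# Simplified: the real app assembles the pipeline from individually loaded components.
pipe = TryonPipeline.from_pretrained(base_path, torch_dtype=torch.float16)
pipe.unet_encoder = unet_encoder
# No eager pipe.to("cuda") here; on ZeroGPU the @spaces.GPU handler gets the
# GPU only while a request is being served.
```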
apply_net.py
CHANGED

@@ -356,4 +356,4 @@ if __name__ == "__main__":
     main()
 
 
-# python ./apply_net.py show ./configs/densepose_rcnn_R_50_FPN_s1x.yaml https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl /home/alin0222/Dresscode/dresses/humanonly dp_segm -v --opts MODEL.DEVICE cuda
+# python ./apply_net.py show ./configs/densepose_rcnn_R_50_FPN_s1x.yaml https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl /home/alin0222/Dresscode/dresses/humanonly dp_segm -v --opts MODEL.DEVICE cuda
assets/teaser.png
DELETED
Git LFS Details

assets/teaser2.png
DELETED
Git LFS Details
environment.yaml
DELETED

name: idm
channels:
  - pytorch
  - nvidia
  - defaults
dependencies:
  - python=3.10.0=h12debd9_5
  - pytorch=2.0.1=py3.10_cuda11.8_cudnn8.7.0_0
  - pytorch-cuda=11.8=h7e8668a_5
  - torchaudio=2.0.2=py310_cu118
  - torchtriton=2.0.0=py310
  - torchvision=0.15.2=py310_cu118
  - pip=23.3.1=py310h06a4308_0

  - pip:
    - accelerate==0.25.0
    - torchmetrics==1.2.1
    - tqdm==4.66.1
    - transformers==4.36.2
    - diffusers==0.25.0
    - einops==0.7.0
    - bitsandbytes==0.39.0
    - scipy==1.11.1
    - opencv-python
    - spaces
humanparsing/parsing_atr.onnx
DELETED

version https://git-lfs.github.com/spec/v1
oid sha256:04c7d1d070d0e0ae943d86b18cb5aaaea9e278d97462e9cfb270cbbe4cd977f4
size 266859305
humanparsing/parsing_lip.onnx
DELETED

version https://git-lfs.github.com/spec/v1
oid sha256:8436e1dae96e2601c373d1ace29c8f0978b16357d9038c17a8ba756cca376dbc
size 266863411
image_encoder/config.json
DELETED

{
  "_name_or_path": "./image_encoder",
  "architectures": [
    "CLIPVisionModelWithProjection"
  ],
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "gelu",
  "hidden_size": 1280,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 5120,
  "layer_norm_eps": 1e-05,
  "model_type": "clip_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 32,
  "patch_size": 14,
  "projection_dim": 1024,
  "torch_dtype": "float16",
  "transformers_version": "4.28.0.dev0"
}
image_encoder/model.safetensors
DELETED

version https://git-lfs.github.com/spec/v1
oid sha256:6ca9667da1ca9e0b0f75e46bb030f7e011f44f86cbfb8d5a36590fcd7507b030
size 2528373448
inference.py
DELETED

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Literal
from ip_adapter.ip_adapter import Resampler

import argparse
import logging
import os
import torch.utils.data as data
import torchvision
import json
import accelerate
import numpy as np
import torch
from PIL import Image
import torch.nn.functional as F
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
from packaging import version
from torchvision import transforms
import diffusers
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, StableDiffusionXLControlNetInpaintPipeline
from transformers import AutoTokenizer, PretrainedConfig,CLIPImageProcessor, CLIPVisionModelWithProjection,CLIPTextModelWithProjection, CLIPTextModel, CLIPTokenizer

from diffusers.utils.import_utils import is_xformers_available

from src.unet_hacked_tryon import UNet2DConditionModel
from src.unet_hacked_garmnet import UNet2DConditionModel as UNet2DConditionModel_ref
from src.tryon_pipeline import StableDiffusionXLInpaintPipeline as TryonPipeline



logger = get_logger(__name__, log_level="INFO")



def parse_args():
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument("--pretrained_model_name_or_path",type=str,default= "yisol/IDM-VTON",required=False,)
    parser.add_argument("--width",type=int,default=768,)
    parser.add_argument("--height",type=int,default=1024,)
    parser.add_argument("--num_inference_steps",type=int,default=30,)
    parser.add_argument("--output_dir",type=str,default="result",)
    parser.add_argument("--unpaired",action="store_true",)
    parser.add_argument("--data_dir",type=str,default="/home/omnious/workspace/yisol/Dataset/zalando")
    parser.add_argument("--seed", type=int, default=42,)
    parser.add_argument("--test_batch_size", type=int, default=2,)
    parser.add_argument("--guidance_scale",type=float,default=2.0,)
    parser.add_argument("--mixed_precision",type=str,default=None,choices=["no", "fp16", "bf16"],)
    parser.add_argument("--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers.")
    args = parser.parse_args()


    return args

def pil_to_tensor(images):
    images = np.array(images).astype(np.float32) / 255.0
    images = torch.from_numpy(images.transpose(2, 0, 1))
    return images


class VitonHDTestDataset(data.Dataset):
    def __init__(
        self,
        dataroot_path: str,
        phase: Literal["train", "test"],
        order: Literal["paired", "unpaired"] = "paired",
        size: Tuple[int, int] = (512, 384),
    ):
        super(VitonHDTestDataset, self).__init__()
        self.dataroot = dataroot_path
        self.phase = phase
        self.height = size[0]
        self.width = size[1]
        self.size = size
        self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )
        self.toTensor = transforms.ToTensor()

        with open(
            os.path.join(dataroot_path, phase, "vitonhd_" + phase + "_tagged.json"), "r"
        ) as file1:
            data1 = json.load(file1)

        annotation_list = [
            "sleeveLength",
            "neckLine",
            "item",
        ]

        self.annotation_pair = {}
        for k, v in data1.items():
            for elem in v:
                annotation_str = ""
                for template in annotation_list:
                    for tag in elem["tag_info"]:
                        if (
                            tag["tag_name"] == template
                            and tag["tag_category"] is not None
                        ):
                            annotation_str += tag["tag_category"]
                            annotation_str += " "
                self.annotation_pair[elem["file_name"]] = annotation_str

        self.order = order
        self.toTensor = transforms.ToTensor()

        im_names = []
        c_names = []
        dataroot_names = []


        if phase == "train":
            filename = os.path.join(dataroot_path, f"{phase}_pairs.txt")
        else:
            filename = os.path.join(dataroot_path, f"{phase}_pairs.txt")

        with open(filename, "r") as f:
            for line in f.readlines():
                if phase == "train":
                    im_name, _ = line.strip().split()
                    c_name = im_name
                else:
                    if order == "paired":
                        im_name, _ = line.strip().split()
                        c_name = im_name
                    else:
                        im_name, c_name = line.strip().split()

                im_names.append(im_name)
                c_names.append(c_name)
                dataroot_names.append(dataroot_path)

        self.im_names = im_names
        self.c_names = c_names
        self.dataroot_names = dataroot_names
        self.clip_processor = CLIPImageProcessor()

    def __getitem__(self, index):
        c_name = self.c_names[index]
        im_name = self.im_names[index]
        if c_name in self.annotation_pair:
            cloth_annotation = self.annotation_pair[c_name]
        else:
            cloth_annotation = "shirts"
        cloth = Image.open(os.path.join(self.dataroot, self.phase, "cloth", c_name))

        im_pil_big = Image.open(
            os.path.join(self.dataroot, self.phase, "image", im_name)
        ).resize((self.width,self.height))
        image = self.transform(im_pil_big)

        mask = Image.open(os.path.join(self.dataroot, self.phase, "agnostic-mask", im_name.replace('.jpg','_mask.png'))).resize((self.width,self.height))
        mask = self.toTensor(mask)
        mask = mask[:1]
        mask = 1-mask
        im_mask = image * mask

        pose_img = Image.open(
            os.path.join(self.dataroot, self.phase, "image-densepose", im_name)
        )
        pose_img = self.transform(pose_img)  # [-1,1]

        result = {}
        result["c_name"] = c_name
        result["im_name"] = im_name
        result["image"] = image
        result["cloth_pure"] = self.transform(cloth)
        result["cloth"] = self.clip_processor(images=cloth, return_tensors="pt").pixel_values
        result["inpaint_mask"] = 1-mask
        result["im_mask"] = im_mask
        result["caption_cloth"] = "a photo of " + cloth_annotation
        result["caption"] = "model is wearing a " + cloth_annotation
        result["pose_img"] = pose_img

        return result

    def __len__(self):
        # model images + cloth image
        return len(self.im_names)




def main():
    args = parse_args()
    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir)
    accelerator = Accelerator(
        mixed_precision=args.mixed_precision,
        project_config=accelerator_project_config,
    )
    if accelerator.is_local_main_process:
        transformers.utils.logging.set_verbosity_warning()
        diffusers.utils.logging.set_verbosity_info()
    else:
        transformers.utils.logging.set_verbosity_error()
        diffusers.utils.logging.set_verbosity_error()
    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)

    weight_dtype = torch.float16
    # if accelerator.mixed_precision == "fp16":
    #     weight_dtype = torch.float16
    #     args.mixed_precision = accelerator.mixed_precision
    # elif accelerator.mixed_precision == "bf16":
    #     weight_dtype = torch.bfloat16
    #     args.mixed_precision = accelerator.mixed_precision

    # Load scheduler, tokenizer and models.
    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
    vae = AutoencoderKL.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="vae",
        torch_dtype=torch.float16,
    )
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="unet",
        torch_dtype=torch.float16,
    )
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="image_encoder",
        torch_dtype=torch.float16,
    )
    UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="unet_encoder",
        torch_dtype=torch.float16,
    )
    text_encoder_one = CLIPTextModel.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="text_encoder",
        torch_dtype=torch.float16,
    )
    text_encoder_two = CLIPTextModelWithProjection.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="text_encoder_2",
        torch_dtype=torch.float16,
    )
    tokenizer_one = AutoTokenizer.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="tokenizer",
        revision=None,
        use_fast=False,
    )
    tokenizer_two = AutoTokenizer.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="tokenizer_2",
        revision=None,
        use_fast=False,
    )


    # Freeze vae and text_encoder and set unet to trainable
    unet.requires_grad_(False)
    vae.requires_grad_(False)
    image_encoder.requires_grad_(False)
    UNet_Encoder.requires_grad_(False)
    text_encoder_one.requires_grad_(False)
    text_encoder_two.requires_grad_(False)
    UNet_Encoder.to(accelerator.device, weight_dtype)
    unet.eval()
    UNet_Encoder.eval()



    if args.enable_xformers_memory_efficient_attention:
        if is_xformers_available():
            import xformers

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
                logger.warn(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
        else:
            raise ValueError("xformers is not available. Make sure it is installed correctly")

    test_dataset = VitonHDTestDataset(
        dataroot_path=args.data_dir,
        phase="test",
        order="unpaired" if args.unpaired else "paired",
        size=(args.height, args.width),
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=args.test_batch_size,
        num_workers=4,
    )

    pipe = TryonPipeline.from_pretrained(
        args.pretrained_model_name_or_path,
        unet=unet,
        vae=vae,
        feature_extractor= CLIPImageProcessor(),
        text_encoder = text_encoder_one,
        text_encoder_2 = text_encoder_two,
        tokenizer = tokenizer_one,
        tokenizer_2 = tokenizer_two,
        scheduler = noise_scheduler,
        image_encoder=image_encoder,
        torch_dtype=torch.float16,
    ).to(accelerator.device)
    pipe.unet_encoder = UNet_Encoder

    # pipe.enable_sequential_cpu_offload()
    # pipe.enable_model_cpu_offload()
    # pipe.enable_vae_slicing()



    with torch.no_grad():
        # Extract the images
        with torch.cuda.amp.autocast():
            with torch.no_grad():
                for sample in test_dataloader:
                    img_emb_list = []
                    for i in range(sample['cloth'].shape[0]):
                        img_emb_list.append(sample['cloth'][i])

                    prompt = sample["caption"]

                    num_prompts = sample['cloth'].shape[0]
                    negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"

                    if not isinstance(prompt, List):
                        prompt = [prompt] * num_prompts
                    if not isinstance(negative_prompt, List):
                        negative_prompt = [negative_prompt] * num_prompts

                    image_embeds = torch.cat(img_emb_list,dim=0)

                    with torch.inference_mode():
                        (
                            prompt_embeds,
                            negative_prompt_embeds,
                            pooled_prompt_embeds,
                            negative_pooled_prompt_embeds,
                        ) = pipe.encode_prompt(
                            prompt,
                            num_images_per_prompt=1,
                            do_classifier_free_guidance=True,
                            negative_prompt=negative_prompt,
                        )


                    prompt = sample["caption_cloth"]
                    negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"

                    if not isinstance(prompt, List):
                        prompt = [prompt] * num_prompts
                    if not isinstance(negative_prompt, List):
                        negative_prompt = [negative_prompt] * num_prompts


                    with torch.inference_mode():
                        (
                            prompt_embeds_c,
                            _,
                            _,
                            _,
                        ) = pipe.encode_prompt(
                            prompt,
                            num_images_per_prompt=1,
                            do_classifier_free_guidance=False,
                            negative_prompt=negative_prompt,
                        )



                    generator = torch.Generator(pipe.device).manual_seed(args.seed) if args.seed is not None else None
                    images = pipe(
                        prompt_embeds=prompt_embeds,
                        negative_prompt_embeds=negative_prompt_embeds,
                        pooled_prompt_embeds=pooled_prompt_embeds,
                        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
                        num_inference_steps=args.num_inference_steps,
                        generator=generator,
                        strength = 1.0,
                        pose_img = sample['pose_img'],
                        text_embeds_cloth=prompt_embeds_c,
                        cloth = sample["cloth_pure"].to(accelerator.device),
                        mask_image=sample['inpaint_mask'],
                        image=(sample['image']+1.0)/2.0,
                        height=args.height,
                        width=args.width,
                        guidance_scale=args.guidance_scale,
                        ip_adapter_image = image_embeds,
                    )[0]


                    for i in range(len(images)):
                        x_sample = pil_to_tensor(images[i])
                        torchvision.utils.save_image(x_sample,os.path.join(args.output_dir,sample['im_name'][i]))




if __name__ == "__main__":
    main()
inference.sh
DELETED

#VITON-HD
##paired setting
accelerate launch inference.py --pretrained_model_name_or_path "yisol/IDM-VTON" \
    --width 768 --height 1024 --num_inference_steps 30 \
    --output_dir "result" --data_dir "/home/omnious/workspace/yisol/Dataset/zalando" \
    --seed 42 --test_batch_size 2 --guidance_scale 2.0


##unpaired setting
accelerate launch inference.py --pretrained_model_name_or_path "yisol/IDM-VTON" \
    --width 768 --height 1024 --num_inference_steps 30 \
    --output_dir "result" --unpaired --data_dir "/home/omnious/workspace/yisol/Dataset/zalando" \
    --seed 42 --test_batch_size 2 --guidance_scale 2.0



#DressCode
##upper_body
accelerate launch inference_dc.py --pretrained_model_name_or_path "yisol/IDM-VTON" \
    --width 768 --height 1024 --num_inference_steps 30 \
    --output_dir "result" --unpaired --data_dir "/home/omnious/workspace/yisol/DressCode" \
    --seed 42 --test_batch_size 2 --guidance_scale 2.0 --category "upper_body"

##lower_body
accelerate launch inference_dc.py --pretrained_model_name_or_path "yisol/IDM-VTON" \
    --width 768 --height 1024 --num_inference_steps 30 \
    --output_dir "result" --unpaired --data_dir "/home/omnious/workspace/yisol/DressCode" \
    --seed 42 --test_batch_size 2 --guidance_scale 2.0 --category "lower_body"

##dresses
accelerate launch inference_dc.py --pretrained_model_name_or_path "yisol/IDM-VTON" \
    --width 768 --height 1024 --num_inference_steps 30 \
    --output_dir "result" --unpaired --data_dir "/home/omnious/workspace/yisol/DressCode" \
    --seed 42 --test_batch_size 2 --guidance_scale 2.0 --category "dresses"
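One detail of the deleted scripts worth keeping in mind: the --unpaired flag only switches which garment/person pairing the test dataset uses. The snippet below just restates the relevant lines from the deleted inference code as a reminder, it is not new behaviour.

```python
# From the deleted inference.py / inference_dc.py:
order = "unpaired" if args.unpaired else "paired"
# VITON-HD: "paired" re-uses the person's own garment (c_name = im_name),
# "unpaired" takes the garment from the second column of test_pairs.txt.
# DressCode: the order string selects test_pairs_{order}.txt per category.
```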
inference_dc.py
DELETED

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Literal
from ip_adapter.ip_adapter import Resampler

import argparse
import logging
import os
import torch.utils.data as data
import torchvision
import json
import accelerate
import numpy as np
import torch
from PIL import Image, ImageDraw
import torch.nn.functional as F
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
from packaging import version
from torchvision import transforms
import diffusers
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, StableDiffusionXLControlNetInpaintPipeline
from transformers import AutoTokenizer, PretrainedConfig,CLIPImageProcessor, CLIPVisionModelWithProjection,CLIPTextModelWithProjection, CLIPTextModel, CLIPTokenizer
import cv2
from diffusers.utils.import_utils import is_xformers_available
from numpy.linalg import lstsq

from src.unet_hacked_tryon import UNet2DConditionModel
from src.unet_hacked_garmnet import UNet2DConditionModel as UNet2DConditionModel_ref
from src.tryon_pipeline import StableDiffusionXLInpaintPipeline as TryonPipeline



logger = get_logger(__name__, log_level="INFO")

label_map={
    "background": 0,
    "hat": 1,
    "hair": 2,
    "sunglasses": 3,
    "upper_clothes": 4,
    "skirt": 5,
    "pants": 6,
    "dress": 7,
    "belt": 8,
    "left_shoe": 9,
    "right_shoe": 10,
    "head": 11,
    "left_leg": 12,
    "right_leg": 13,
    "left_arm": 14,
    "right_arm": 15,
    "bag": 16,
    "scarf": 17,
}

def parse_args():
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument("--pretrained_model_name_or_path",type=str,default= "yisol/IDM-VTON",required=False,)
    parser.add_argument("--width",type=int,default=768,)
    parser.add_argument("--height",type=int,default=1024,)
    parser.add_argument("--num_inference_steps",type=int,default=30,)
    parser.add_argument("--output_dir",type=str,default="result",)
    parser.add_argument("--category",type=str,default="upper_body",choices=["upper_body", "lower_body", "dresses"])
    parser.add_argument("--unpaired",action="store_true",)
    parser.add_argument("--data_dir",type=str,default="/home/omnious/workspace/yisol/Dataset/zalando")
    parser.add_argument("--seed", type=int, default=42,)
    parser.add_argument("--test_batch_size", type=int, default=2,)
    parser.add_argument("--guidance_scale",type=float,default=2.0,)
    parser.add_argument("--mixed_precision",type=str,default=None,choices=["no", "fp16", "bf16"],)
    parser.add_argument("--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers.")
    args = parser.parse_args()


    return args

def pil_to_tensor(images):
    images = np.array(images).astype(np.float32) / 255.0
    images = torch.from_numpy(images.transpose(2, 0, 1))
    return images


class DresscodeTestDataset(data.Dataset):
    def __init__(
        self,
        dataroot_path: str,
        phase: Literal["train", "test"],
        order: Literal["paired", "unpaired"] = "paired",
        category = "upper_body",
        size: Tuple[int, int] = (512, 384),
    ):
        super(DresscodeTestDataset, self).__init__()
        self.dataroot = os.path.join(dataroot_path,category)
        self.phase = phase
        self.height = size[0]
        self.width = size[1]
        self.size = size
        self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )
        self.toTensor = transforms.ToTensor()
        self.order = order
        self.radius = 5
        self.category = category
        im_names = []
        c_names = []


        if phase == "train":
            filename = os.path.join(dataroot_path,category, f"{phase}_pairs.txt")
        else:
            filename = os.path.join(dataroot_path,category, f"{phase}_pairs_{order}.txt")

        with open(filename, "r") as f:
            for line in f.readlines():
                im_name, c_name = line.strip().split()

                im_names.append(im_name)
                c_names.append(c_name)


        file_path = os.path.join(dataroot_path,category,"dc_caption.txt")

        self.annotation_pair = {}
        with open(file_path, "r") as file:
            for line in file:
                parts = line.strip().split(" ")
                self.annotation_pair[parts[0]] = ' '.join(parts[1:])


        self.im_names = im_names
        self.c_names = c_names
        self.clip_processor = CLIPImageProcessor()

    def __getitem__(self, index):
        c_name = self.c_names[index]
        im_name = self.im_names[index]
        if c_name in self.annotation_pair:
            cloth_annotation = self.annotation_pair[c_name]
        else:
            cloth_annotation = self.category
        cloth = Image.open(os.path.join(self.dataroot, "images", c_name))

        im_pil_big = Image.open(
            os.path.join(self.dataroot, "images", im_name)
        ).resize((self.width,self.height))
        image = self.transform(im_pil_big)




        skeleton = Image.open(os.path.join(self.dataroot, 'skeletons', im_name.replace("_0", "_5")))
        skeleton = skeleton.resize((self.width, self.height))
        skeleton = self.transform(skeleton)

        # Label Map
        parse_name = im_name.replace('_0.jpg', '_4.png')
        im_parse = Image.open(os.path.join(self.dataroot, 'label_maps', parse_name))
        im_parse = im_parse.resize((self.width, self.height), Image.NEAREST)
        parse_array = np.array(im_parse)

        # Load pose points
        pose_name = im_name.replace('_0.jpg', '_2.json')
        with open(os.path.join(self.dataroot, 'keypoints', pose_name), 'r') as f:
            pose_label = json.load(f)
            pose_data = pose_label['keypoints']
            pose_data = np.array(pose_data)
            pose_data = pose_data.reshape((-1, 4))

        point_num = pose_data.shape[0]
        pose_map = torch.zeros(point_num, self.height, self.width)
        r = self.radius * (self.height / 512.0)
        for i in range(point_num):
            one_map = Image.new('L', (self.width, self.height))
            draw = ImageDraw.Draw(one_map)
            point_x = np.multiply(pose_data[i, 0], self.width / 384.0)
            point_y = np.multiply(pose_data[i, 1], self.height / 512.0)
            if point_x > 1 and point_y > 1:
                draw.rectangle((point_x - r, point_y - r, point_x + r, point_y + r), 'white', 'white')
            one_map = self.toTensor(one_map)
            pose_map[i] = one_map[0]

        agnostic_mask = self.get_agnostic(parse_array, pose_data, self.category, (self.width,self.height))
        # agnostic_mask = transforms.functional.resize(agnostic_mask, (self.height, self.width),
        #                                              interpolation=transforms.InterpolationMode.NEAREST)

        mask = 1 - agnostic_mask
        im_mask = image * agnostic_mask

        pose_img = Image.open(
            os.path.join(self.dataroot, "image-densepose", im_name)
        )
        pose_img = self.transform(pose_img)  # [-1,1]

        result = {}
        result["c_name"] = c_name
        result["im_name"] = im_name
        result["image"] = image
        result["cloth_pure"] = self.transform(cloth)
        result["cloth"] = self.clip_processor(images=cloth, return_tensors="pt").pixel_values
        result["inpaint_mask"] = 1-mask
        result["im_mask"] = im_mask
        result["caption_cloth"] = "a photo of " + cloth_annotation
        result["caption"] = "model is wearing a " + cloth_annotation
        result["pose_img"] = pose_img

        return result

    def __len__(self):
        # model images + cloth image
        return len(self.im_names)




    def get_agnostic(self,parse_array, pose_data, category, size):
        parse_shape = (parse_array > 0).astype(np.float32)

        parse_head = (parse_array == 1).astype(np.float32) + \
                     (parse_array == 2).astype(np.float32) + \
                     (parse_array == 3).astype(np.float32) + \
                     (parse_array == 11).astype(np.float32)

        parser_mask_fixed = (parse_array == label_map["hair"]).astype(np.float32) + \
                            (parse_array == label_map["left_shoe"]).astype(np.float32) + \
                            (parse_array == label_map["right_shoe"]).astype(np.float32) + \
                            (parse_array == label_map["hat"]).astype(np.float32) + \
                            (parse_array == label_map["sunglasses"]).astype(np.float32) + \
                            (parse_array == label_map["scarf"]).astype(np.float32) + \
                            (parse_array == label_map["bag"]).astype(np.float32)

        parser_mask_changeable = (parse_array == label_map["background"]).astype(np.float32)

        arms = (parse_array == 14).astype(np.float32) + (parse_array == 15).astype(np.float32)

        if category == 'dresses':
            label_cat = 7
            parse_mask = (parse_array == 7).astype(np.float32) + \
                         (parse_array == 12).astype(np.float32) + \
                         (parse_array == 13).astype(np.float32)
            parser_mask_changeable += np.logical_and(parse_array, np.logical_not(parser_mask_fixed))

        elif category == 'upper_body':
            label_cat = 4
            parse_mask = (parse_array == 4).astype(np.float32)

            parser_mask_fixed += (parse_array == label_map["skirt"]).astype(np.float32) + \
                                 (parse_array == label_map["pants"]).astype(np.float32)

            parser_mask_changeable += np.logical_and(parse_array, np.logical_not(parser_mask_fixed))
        elif category == 'lower_body':
            label_cat = 6
            parse_mask = (parse_array == 6).astype(np.float32) + \
                         (parse_array == 12).astype(np.float32) + \
                         (parse_array == 13).astype(np.float32)

            parser_mask_fixed += (parse_array == label_map["upper_clothes"]).astype(np.float32) + \
                                 (parse_array == 14).astype(np.float32) + \
                                 (parse_array == 15).astype(np.float32)
            parser_mask_changeable += np.logical_and(parse_array, np.logical_not(parser_mask_fixed))

        parse_head = torch.from_numpy(parse_head)  # [0,1]
        parse_mask = torch.from_numpy(parse_mask)  # [0,1]
        parser_mask_fixed = torch.from_numpy(parser_mask_fixed)
        parser_mask_changeable = torch.from_numpy(parser_mask_changeable)

        # dilation
        parse_without_cloth = np.logical_and(parse_shape, np.logical_not(parse_mask))
        parse_mask = parse_mask.cpu().numpy()

        width = size[0]
        height = size[1]

        im_arms = Image.new('L', (width, height))
        arms_draw = ImageDraw.Draw(im_arms)
        if category == 'dresses' or category == 'upper_body':
            shoulder_right = tuple(np.multiply(pose_data[2, :2], height / 512.0))
            shoulder_left = tuple(np.multiply(pose_data[5, :2], height / 512.0))
            elbow_right = tuple(np.multiply(pose_data[3, :2], height / 512.0))
            elbow_left = tuple(np.multiply(pose_data[6, :2], height / 512.0))
            wrist_right = tuple(np.multiply(pose_data[4, :2], height / 512.0))
            wrist_left = tuple(np.multiply(pose_data[7, :2], height / 512.0))
            if wrist_right[0] <= 1. and wrist_right[1] <= 1.:
                if elbow_right[0] <= 1. and elbow_right[1] <= 1.:
                    arms_draw.line([wrist_left, elbow_left, shoulder_left, shoulder_right], 'white', 30, 'curve')
                else:
                    arms_draw.line([wrist_left, elbow_left, shoulder_left, shoulder_right, elbow_right], 'white', 30,
                                   'curve')
            elif wrist_left[0] <= 1. and wrist_left[1] <= 1.:
                if elbow_left[0] <= 1. and elbow_left[1] <= 1.:
                    arms_draw.line([shoulder_left, shoulder_right, elbow_right, wrist_right], 'white', 30, 'curve')
                else:
                    arms_draw.line([elbow_left, shoulder_left, shoulder_right, elbow_right, wrist_right], 'white', 30,
                                   'curve')
            else:
                arms_draw.line([wrist_left, elbow_left, shoulder_left, shoulder_right, elbow_right, wrist_right], 'white',
                               30, 'curve')

            if height > 512:
                im_arms = cv2.dilate(np.float32(im_arms), np.ones((10, 10), np.uint16), iterations=5)
            elif height > 256:
                im_arms = cv2.dilate(np.float32(im_arms), np.ones((5, 5), np.uint16), iterations=5)
            hands = np.logical_and(np.logical_not(im_arms), arms)
            parse_mask += im_arms
            parser_mask_fixed += hands

        # delete neck
        parse_head_2 = torch.clone(parse_head)
        if category == 'dresses' or category == 'upper_body':
            points = []
            points.append(np.multiply(pose_data[2, :2], height / 512.0))
            points.append(np.multiply(pose_data[5, :2], height / 512.0))
            x_coords, y_coords = zip(*points)
            A = np.vstack([x_coords, np.ones(len(x_coords))]).T
            m, c = lstsq(A, y_coords, rcond=None)[0]
            for i in range(parse_array.shape[1]):
                y = i * m + c
                parse_head_2[int(y - 20 * (height / 512.0)):, i] = 0

        parser_mask_fixed = np.logical_or(parser_mask_fixed, np.array(parse_head_2, dtype=np.uint16))
        parse_mask += np.logical_or(parse_mask, np.logical_and(np.array(parse_head, dtype=np.uint16),
                                                               np.logical_not(np.array(parse_head_2, dtype=np.uint16))))

        if height > 512:
            parse_mask = cv2.dilate(parse_mask, np.ones((20, 20), np.uint16), iterations=5)
        elif height > 256:
            parse_mask = cv2.dilate(parse_mask, np.ones((10, 10), np.uint16), iterations=5)
        else:
            parse_mask = cv2.dilate(parse_mask, np.ones((5, 5), np.uint16), iterations=5)
        parse_mask = np.logical_and(parser_mask_changeable, np.logical_not(parse_mask))
        parse_mask_total = np.logical_or(parse_mask, parser_mask_fixed)
        agnostic_mask = parse_mask_total.unsqueeze(0)
        return agnostic_mask




def main():
    args = parse_args()
    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir)
    accelerator = Accelerator(
        mixed_precision=args.mixed_precision,
        project_config=accelerator_project_config,
    )
    if accelerator.is_local_main_process:
        transformers.utils.logging.set_verbosity_warning()
        diffusers.utils.logging.set_verbosity_info()
    else:
        transformers.utils.logging.set_verbosity_error()
        diffusers.utils.logging.set_verbosity_error()
    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)

    weight_dtype = torch.float16
    # if accelerator.mixed_precision == "fp16":
    #     weight_dtype = torch.float16
    #     args.mixed_precision = accelerator.mixed_precision
    # elif accelerator.mixed_precision == "bf16":
    #     weight_dtype = torch.bfloat16
    #     args.mixed_precision = accelerator.mixed_precision

    # Load scheduler, tokenizer and models.
    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
    vae = AutoencoderKL.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="vae",
        torch_dtype=torch.float16,
    )
    unet = UNet2DConditionModel.from_pretrained(
        "yisol/IDM-VTON-DC",
        subfolder="unet",
        torch_dtype=torch.float16,
    )
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="image_encoder",
        torch_dtype=torch.float16,
    )
    UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="unet_encoder",
        torch_dtype=torch.float16,
    )
    text_encoder_one = CLIPTextModel.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="text_encoder",
        torch_dtype=torch.float16,
    )
    text_encoder_two = CLIPTextModelWithProjection.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="text_encoder_2",
        torch_dtype=torch.float16,
    )
    tokenizer_one = AutoTokenizer.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="tokenizer",
        revision=None,
        use_fast=False,
    )
    tokenizer_two = AutoTokenizer.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="tokenizer_2",
        revision=None,
        use_fast=False,
    )


    # Freeze vae and text_encoder and set unet to trainable
    unet.requires_grad_(False)
    vae.requires_grad_(False)
    image_encoder.requires_grad_(False)
    UNet_Encoder.requires_grad_(False)
    text_encoder_one.requires_grad_(False)
    text_encoder_two.requires_grad_(False)
    UNet_Encoder.to(accelerator.device, weight_dtype)
    unet.eval()
    UNet_Encoder.eval()



    if args.enable_xformers_memory_efficient_attention:
        if is_xformers_available():
            import xformers

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
                logger.warn(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
        else:
            raise ValueError("xformers is not available. Make sure it is installed correctly")

    test_dataset = DresscodeTestDataset(
        dataroot_path=args.data_dir,
        phase="test",
        order="unpaired" if args.unpaired else "paired",
        category = args.category,
        size=(args.height, args.width),
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=args.test_batch_size,
        num_workers=4,
    )

    pipe = TryonPipeline.from_pretrained(
        args.pretrained_model_name_or_path,
        unet=unet,
        vae=vae,
        feature_extractor= CLIPImageProcessor(),
        text_encoder = text_encoder_one,
        text_encoder_2 = text_encoder_two,
        tokenizer = tokenizer_one,
        tokenizer_2 = tokenizer_two,
        scheduler = noise_scheduler,
        image_encoder=image_encoder,
        torch_dtype=torch.float16,
    ).to(accelerator.device)
    pipe.unet_encoder = UNet_Encoder

    # pipe.enable_sequential_cpu_offload()
    # pipe.enable_model_cpu_offload()
    # pipe.enable_vae_slicing()



    with torch.no_grad():
        # Extract the images
        with torch.cuda.amp.autocast():
            with torch.no_grad():
                for sample in test_dataloader:
                    img_emb_list = []
                    for i in range(sample['cloth'].shape[0]):
                        img_emb_list.append(sample['cloth'][i])

                    prompt = sample["caption"]

                    num_prompts = sample['cloth'].shape[0]
                    negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"

                    if not isinstance(prompt, List):
                        prompt = [prompt] * num_prompts
                    if not isinstance(negative_prompt, List):
                        negative_prompt = [negative_prompt] * num_prompts

                    image_embeds = torch.cat(img_emb_list,dim=0)

                    with torch.inference_mode():
                        (
                            prompt_embeds,
|
| 514 |
-
negative_prompt_embeds,
|
| 515 |
-
pooled_prompt_embeds,
|
| 516 |
-
negative_pooled_prompt_embeds,
|
| 517 |
-
) = pipe.encode_prompt(
|
| 518 |
-
prompt,
|
| 519 |
-
num_images_per_prompt=1,
|
| 520 |
-
do_classifier_free_guidance=True,
|
| 521 |
-
negative_prompt=negative_prompt,
|
| 522 |
-
)
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
prompt = sample["caption_cloth"]
|
| 526 |
-
negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
|
| 527 |
-
|
| 528 |
-
if not isinstance(prompt, List):
|
| 529 |
-
prompt = [prompt] * num_prompts
|
| 530 |
-
if not isinstance(negative_prompt, List):
|
| 531 |
-
negative_prompt = [negative_prompt] * num_prompts
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
with torch.inference_mode():
|
| 535 |
-
(
|
| 536 |
-
prompt_embeds_c,
|
| 537 |
-
_,
|
| 538 |
-
_,
|
| 539 |
-
_,
|
| 540 |
-
) = pipe.encode_prompt(
|
| 541 |
-
prompt,
|
| 542 |
-
num_images_per_prompt=1,
|
| 543 |
-
do_classifier_free_guidance=False,
|
| 544 |
-
negative_prompt=negative_prompt,
|
| 545 |
-
)
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
generator = torch.Generator(pipe.device).manual_seed(args.seed) if args.seed is not None else None
|
| 550 |
-
images = pipe(
|
| 551 |
-
prompt_embeds=prompt_embeds,
|
| 552 |
-
negative_prompt_embeds=negative_prompt_embeds,
|
| 553 |
-
pooled_prompt_embeds=pooled_prompt_embeds,
|
| 554 |
-
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
|
| 555 |
-
num_inference_steps=args.num_inference_steps,
|
| 556 |
-
generator=generator,
|
| 557 |
-
strength = 1.0,
|
| 558 |
-
pose_img = sample['pose_img'],
|
| 559 |
-
text_embeds_cloth=prompt_embeds_c,
|
| 560 |
-
cloth = sample["cloth_pure"].to(accelerator.device),
|
| 561 |
-
mask_image=sample['inpaint_mask'],
|
| 562 |
-
image=(sample['image']+1.0)/2.0,
|
| 563 |
-
height=args.height,
|
| 564 |
-
width=args.width,
|
| 565 |
-
guidance_scale=args.guidance_scale,
|
| 566 |
-
ip_adapter_image = image_embeds,
|
| 567 |
-
)[0]
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
for i in range(len(images)):
|
| 571 |
-
x_sample = pil_to_tensor(images[i])
|
| 572 |
-
torchvision.utils.save_image(x_sample,os.path.join(args.output_dir,sample['im_name'][i]))
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
if __name__ == "__main__":
|
| 578 |
-
main()
openpose/ckpts/body_pose_model.pth DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:25a948c16078b0f08e236bda51a385d855ef4c153598947c28c0d47ed94bb746
-size 209267595
requirements.txt CHANGED
@@ -2,9 +2,9 @@ transformers==4.36.2
 torch==2.0.1
 torchvision==0.15.2
 torchaudio==2.0.2
-numpy
-scipy
-scikit-image
+numpy==1.24.4
+scipy==1.10.1
+scikit-image==0.21.0
 opencv-python==4.7.0.72
 pillow==9.4.0
 diffusers==0.25.0
@@ -20,4 +20,4 @@ av
 fvcore
 cloudpickle
 omegaconf
-pycocotools
+pycocotools
scheduler/scheduler_config.json DELETED
@@ -1,19 +0,0 @@
-{
-  "_class_name": "DDPMScheduler",
-  "_diffusers_version": "0.21.0.dev0",
-  "beta_end": 0.012,
-  "beta_schedule": "scaled_linear",
-  "beta_start": 0.00085,
-  "clip_sample": false,
-  "interpolation_type": "linear",
-  "num_train_timesteps": 1000,
-  "prediction_type": "epsilon",
-  "sample_max_value": 1.0,
-  "set_alpha_to_one": false,
-  "skip_prk_steps": true,
-  "steps_offset": 1,
-  "timestep_spacing": "leading",
-  "trained_betas": null,
-  "use_karras_sigmas": false,
-  "rescale_betas_zero_snr": true
-}
text_encoder/config.json DELETED
@@ -1,25 +0,0 @@
-{
-  "_name_or_path": "/home/suraj_huggingface_co/.cache/huggingface/hub/models--stabilityai--stable-diffusion-xl-base-1.0/snapshots/bf714989e22c57ddc1c453bf74dab4521acb81d8/text_encoder",
-  "architectures": [
-    "CLIPTextModel"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 0,
-  "dropout": 0.0,
-  "eos_token_id": 2,
-  "hidden_act": "quick_gelu",
-  "hidden_size": 768,
-  "initializer_factor": 1.0,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 77,
-  "model_type": "clip_text_model",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 1,
-  "projection_dim": 768,
-  "torch_dtype": "float16",
-  "transformers_version": "4.29.2",
-  "vocab_size": 49408
-}
text_encoder/model.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:79f531155c765c22c89e23328793a2e91a1178070af961c57e2eae5f0509b65b
-size 492265879
text_encoder_2/config.json DELETED
@@ -1,25 +0,0 @@
-{
-  "_name_or_path": "/home/suraj_huggingface_co/.cache/huggingface/hub/models--stabilityai--stable-diffusion-xl-base-1.0/snapshots/bf714989e22c57ddc1c453bf74dab4521acb81d8/text_encoder_2",
-  "architectures": [
-    "CLIPTextModelWithProjection"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 0,
-  "dropout": 0.0,
-  "eos_token_id": 2,
-  "hidden_act": "gelu",
-  "hidden_size": 1280,
-  "initializer_factor": 1.0,
-  "initializer_range": 0.02,
-  "intermediate_size": 5120,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 77,
-  "model_type": "clip_text_model",
-  "num_attention_heads": 20,
-  "num_hidden_layers": 32,
-  "pad_token_id": 1,
-  "projection_dim": 1280,
-  "torch_dtype": "float16",
-  "transformers_version": "4.29.2",
-  "vocab_size": 49408
-}
text_encoder_2/model.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:283bb90f987a133dec11947571aca17692ed32f3fff708441ac8eedcfa4a040e
-size 2778702976
tokenizer/merges.txt DELETED
The diff for this file is too large to render. See raw diff.
tokenizer/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
-{
-  "bos_token": {
-    "content": "<|startoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": "<|endoftext|>",
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
tokenizer/tokenizer_config.json DELETED
@@ -1,33 +0,0 @@
-{
-  "add_prefix_space": false,
-  "bos_token": {
-    "__type": "AddedToken",
-    "content": "<|startoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "clean_up_tokenization_spaces": true,
-  "do_lower_case": true,
-  "eos_token": {
-    "__type": "AddedToken",
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "errors": "replace",
-  "model_max_length": 77,
-  "pad_token": "<|endoftext|>",
-  "tokenizer_class": "CLIPTokenizer",
-  "unk_token": {
-    "__type": "AddedToken",
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
tokenizer/vocab.json DELETED
The diff for this file is too large to render. See raw diff.
tokenizer_2/merges.txt DELETED
The diff for this file is too large to render. See raw diff.
tokenizer_2/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
-{
-  "bos_token": {
-    "content": "<|startoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": "!",
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
tokenizer_2/tokenizer_config.json DELETED
@@ -1,33 +0,0 @@
-{
-  "add_prefix_space": false,
-  "bos_token": {
-    "__type": "AddedToken",
-    "content": "<|startoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "clean_up_tokenization_spaces": true,
-  "do_lower_case": true,
-  "eos_token": {
-    "__type": "AddedToken",
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "errors": "replace",
-  "model_max_length": 77,
-  "pad_token": "!",
-  "tokenizer_class": "CLIPTokenizer",
-  "unk_token": {
-    "__type": "AddedToken",
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
tokenizer_2/vocab.json DELETED
The diff for this file is too large to render. See raw diff.
unet/config.json DELETED
@@ -1,78 +0,0 @@
-{
-  "_class_name": "UNet2DConditionModel",
-  "_diffusers_version": "0.25.0",
-  "_name_or_path": "valhalla/sdxl-inpaint-ema",
-  "act_fn": "silu",
-  "addition_embed_type": "text_time",
-  "addition_embed_type_num_heads": 64,
-  "addition_time_embed_dim": 256,
-  "attention_head_dim": [
-    5,
-    10,
-    20
-  ],
-  "attention_type": "default",
-  "block_out_channels": [
-    320,
-    640,
-    1280
-  ],
-  "center_input_sample": false,
-  "class_embed_type": null,
-  "class_embeddings_concat": false,
-  "conv_in_kernel": 3,
-  "conv_out_kernel": 3,
-  "cross_attention_dim": 2048,
-  "cross_attention_norm": null,
-  "decay": 0.9999,
-  "down_block_types": [
-    "DownBlock2D",
-    "CrossAttnDownBlock2D",
-    "CrossAttnDownBlock2D"
-  ],
-  "downsample_padding": 1,
-  "dual_cross_attention": false,
-  "encoder_hid_dim": 1280,
-  "encoder_hid_dim_type": "ip_image_proj",
-  "flip_sin_to_cos": true,
-  "freq_shift": 0,
-  "in_channels": 13,
-  "inv_gamma": 1.0,
-  "layers_per_block": 2,
-  "mid_block_only_cross_attention": null,
-  "mid_block_scale_factor": 1,
-  "mid_block_type": "UNetMidBlock2DCrossAttn",
-  "min_decay": 0.0,
-  "norm_eps": 1e-05,
-  "norm_num_groups": 32,
-  "num_attention_heads": null,
-  "num_class_embeds": null,
-  "only_cross_attention": false,
-  "optimization_step": 37000,
-  "out_channels": 4,
-  "power": 0.6666666666666666,
-  "projection_class_embeddings_input_dim": 2816,
-  "resnet_out_scale_factor": 1.0,
-  "resnet_skip_time_act": false,
-  "resnet_time_scale_shift": "default",
-  "sample_size": 128,
-  "time_cond_proj_dim": null,
-  "time_embedding_act_fn": null,
-  "time_embedding_dim": null,
-  "time_embedding_type": "positional",
-  "timestep_post_act": null,
-  "transformer_layers_per_block": [
-    1,
-    2,
-    10
-  ],
-  "up_block_types": [
-    "CrossAttnUpBlock2D",
-    "CrossAttnUpBlock2D",
-    "UpBlock2D"
-  ],
-  "upcast_attention": null,
-  "update_after_step": 0,
-  "use_ema_warmup": false,
-  "use_linear_projection": true
-}
unet/diffusion_pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:046b775cb9bbc67635fc3b148bb03bfe00496ce2f9ce8488a82fdb388669a521
-size 11965769774
unet_encoder/config.json DELETED
@@ -1,68 +0,0 @@
-{
-  "_class_name": "UNet2DConditionModel",
-  "_diffusers_version": "0.19.0.dev0",
-  "act_fn": "silu",
-  "addition_embed_type_num_heads": 64,
-  "addition_time_embed_dim": 256,
-  "attention_head_dim": [
-    5,
-    10,
-    20
-  ],
-  "block_out_channels": [
-    320,
-    640,
-    1280
-  ],
-  "center_input_sample": false,
-  "class_embed_type": null,
-  "class_embeddings_concat": false,
-  "conv_in_kernel": 3,
-  "conv_out_kernel": 3,
-  "cross_attention_dim": 2048,
-  "cross_attention_norm": null,
-  "down_block_types": [
-    "DownBlock2D",
-    "CrossAttnDownBlock2D",
-    "CrossAttnDownBlock2D"
-  ],
-  "downsample_padding": 1,
-  "dual_cross_attention": false,
-  "encoder_hid_dim": null,
-  "encoder_hid_dim_type": null,
-  "flip_sin_to_cos": true,
-  "freq_shift": 0,
-  "in_channels": 4,
-  "layers_per_block": 2,
-  "mid_block_only_cross_attention": null,
-  "mid_block_scale_factor": 1,
-  "mid_block_type": "UNetMidBlock2DCrossAttn",
-  "norm_eps": 1e-05,
-  "norm_num_groups": 32,
-  "num_attention_heads": null,
-  "num_class_embeds": null,
-  "only_cross_attention": false,
-  "out_channels": 4,
-  "projection_class_embeddings_input_dim": 2816,
-  "resnet_out_scale_factor": 1.0,
-  "resnet_skip_time_act": false,
-  "resnet_time_scale_shift": "default",
-  "sample_size": 128,
-  "time_cond_proj_dim": null,
-  "time_embedding_act_fn": null,
-  "time_embedding_dim": null,
-  "time_embedding_type": "positional",
-  "timestep_post_act": null,
-  "transformer_layers_per_block": [
-    1,
-    2,
-    10
-  ],
-  "up_block_types": [
-    "CrossAttnUpBlock2D",
-    "CrossAttnUpBlock2D",
-    "UpBlock2D"
-  ],
-  "upcast_attention": null,
-  "use_linear_projection": true
-}
unet_encoder/diffusion_pytorch_model.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:357650fbfb3c7b4d94c1f5fd7664da819ad1ff5a839430484b4ec422d03f710a
-size 10270077736
util/common.py DELETED
@@ -1,8 +0,0 @@
-import platform, os
-
-def open_folder():
-    open_folder_path = os.path.abspath("outputs")
-    if platform.system() == "Windows":
-        os.startfile(open_folder_path)
-    elif platform.system() == "Linux":
-        os.system(f'xdg-open "{open_folder_path}"')
util/image.py DELETED
@@ -1,37 +0,0 @@
-import os
-
-import numpy as np
-from PIL import Image
-
-def save_output_image(image, base_path="outputs", base_filename="inputimage", seed=0):
-    """Save an image with a unique filename in the specified directory."""
-    if not os.path.exists(base_path):
-        os.makedirs(base_path)
-
-    # Check for existing files and create a new filename
-    index = 0
-    while True:
-        if index == 0:
-            filename = f"{base_filename}_seed_{seed}.png"
-        else:
-            filename = f"{base_filename}_{str(index).zfill(4)}_seed_{seed}.png"
-
-        file_path = os.path.join(base_path, filename)
-        if not os.path.exists(file_path):
-            image.save(file_path)
-            break
-        index += 1
-    return file_path
-
-def pil_to_binary_mask(pil_image, threshold=0):
-    np_image = np.array(pil_image)
-    grayscale_image = Image.fromarray(np_image).convert("L")
-    binary_mask = np.array(grayscale_image) > threshold
-    mask = np.zeros(binary_mask.shape, dtype=np.uint8)
-    for i in range(binary_mask.shape[0]):
-        for j in range(binary_mask.shape[1]):
-            if binary_mask[i,j] == True :
-                mask[i,j] = 1
-    mask = (mask*255).astype(np.uint8)
-    output_mask = Image.fromarray(mask)
-    return output_mask
util/pipeline.py DELETED
@@ -1,88 +0,0 @@
-import torch
-import gc
-from torch import nn
-from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module
-import bitsandbytes as bnb
-
-def torch_gc():
-
-    if torch.cuda.is_available():
-        with torch.cuda.device('cuda'):
-            torch.cuda.empty_cache()
-            torch.cuda.ipc_collect()
-
-    gc.collect()
-
-def restart_cpu_offload(pipe, load_mode):
-    #if load_mode != '4bit' :
-    #    pipe.disable_xformers_memory_efficient_attention()
-    optionally_disable_offloading(pipe)
-    gc.collect()
-    torch.cuda.empty_cache()
-    pipe.enable_model_cpu_offload()
-    #if load_mode != '4bit' :
-    #    pipe.enable_xformers_memory_efficient_attention()
-
-def optionally_disable_offloading(_pipeline):
-
-    """
-    Optionally removes offloading in case the pipeline has been already sequentially offloaded to CPU.
-
-    Args:
-        _pipeline (`DiffusionPipeline`):
-            The pipeline to disable offloading for.
-
-    Returns:
-        tuple:
-            A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True.
-    """
-    is_model_cpu_offload = False
-    is_sequential_cpu_offload = False
-    print(
-        fr"Restarting CPU Offloading for {_pipeline.unet_name}..."
-    )
-    if _pipeline is not None:
-        for _, component in _pipeline.components.items():
-            if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
-                if not is_model_cpu_offload:
-                    is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
-                if not is_sequential_cpu_offload:
-                    is_sequential_cpu_offload = isinstance(component._hf_hook, AlignDevicesHook)
-
-                remove_hook_from_module(component, recurse=True)
-
-    return (is_model_cpu_offload, is_sequential_cpu_offload)
-
-def quantize_4bit(module):
-    for name, child in module.named_children():
-        if isinstance(child, torch.nn.Linear):
-            in_features = child.in_features
-            out_features = child.out_features
-            device = child.weight.data.device
-
-            # Create and configure the Linear layer
-            has_bias = True if child.bias is not None else False
-
-            # TODO: Make that configurable
-            # fp16 for compute dtype leads to faster inference
-            # and one should almost always use nf4 as a rule of thumb
-            bnb_4bit_compute_dtype = torch.float16
-            quant_type = "nf4"
-
-            new_layer = bnb.nn.Linear4bit(
-                in_features,
-                out_features,
-                bias=has_bias,
-                compute_dtype=bnb_4bit_compute_dtype,
-                quant_type=quant_type,
-            )
-
-            new_layer.load_state_dict(child.state_dict())
-            new_layer = new_layer.to(device)
-
-            # Set the attribute
-            setattr(module, name, new_layer)
-        else:
-            # Recursively apply to child modules
-            quantize_4bit(child)
utils_mask.py CHANGED
@@ -164,4 +164,4 @@ def get_mask_location(model_type, category, model_parse: Image.Image, keypoint:
     mask = Image.fromarray(inpaint_mask.astype(np.uint8) * 255)
     mask_gray = Image.fromarray(inpaint_mask.astype(np.uint8) * 127)
 
-    return mask, mask_gray
+    return mask, mask_gray
vae/config.json DELETED
@@ -1,32 +0,0 @@
-{
-  "_class_name": "AutoencoderKL",
-  "_diffusers_version": "0.21.0.dev0",
-  "_name_or_path": "madebyollin/sdxl-vae-fp16-fix",
-  "act_fn": "silu",
-  "block_out_channels": [
-    128,
-    256,
-    512,
-    512
-  ],
-  "down_block_types": [
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D"
-  ],
-  "force_upcast": false,
-  "in_channels": 3,
-  "latent_channels": 4,
-  "layers_per_block": 2,
-  "norm_num_groups": 32,
-  "out_channels": 3,
-  "sample_size": 512,
-  "scaling_factor": 0.13025,
-  "up_block_types": [
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D"
-  ]
-}
vae/diffusion_pytorch_model.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:98a14dc6fe8d71c83576f135a87c61a16561c9c080abba418d2cc976ee034f88
-size 334643268
vitonhd_test_tagged.json DELETED
The diff for this file is too large to render. See raw diff.