Skywork
/

UniPic2-SD3.5M-Kontext-2B

@@ -69,13 +69,56 @@ pip install -r requirements.txt
 ### 3. Text-to-Image Generation
 ```bash
-export PYTHONPATH=./:$PYTHONPATH
-python scripts/text2image.py configs/models/qwen2_5_1_5b_kl16_mar_h.py \
-  --checkpoint checkpoint/pytorch_model.bin \
-  --image_size 1024 \
-  --prompt "A glossy-coated golden retriever stands on the park lawn beside a life-sized penguin statue." \
-  --output output.jpg
 ```
@@ -83,14 +126,37 @@ python scripts/text2image.py configs/models/qwen2_5_1_5b_kl16_mar_h.py \
 The image editing feature within this unified model is an exploratory module at the forefront of research, and it is not yet production-ready.
 ```bash
-export PYTHONPATH=./:$PYTHONPATH
-python scripts/image_edit.py configs/models/qwen2_5_1_5b_kl16_mar_h.py \
-  --checkpoint checkpoint/pytorch_model.bin \
-  --image_size 1024 \
-  --image data/sample.png \
-  --prompt "Replace the stars with the candle." \
-  --output output.jpg
 ```
 ## 📄 License

 ### 3. Text-to-Image Generation
 ```python
+import torch
+from PIL import Image
+from unipicv2.pipeline_stable_diffusion_3_kontext import StableDiffusion3KontextPipeline
+from unipicv2.transformer_sd3_kontext import SD3Transformer2DKontextModel
+from diffusers import FlowMatchEulerDiscreteScheduler, AutoencoderKL
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+# Load model components
+pretrained_model_name_or_path = "/path/to/unipicv2_sd_3_5m_kontext"
+transformer = SD3Transformer2DKontextModel.from_pretrained(
+    pretrained_model_name_or_path, subfolder="transformer", torch_dtype=torch.bfloat16).cuda()
+vae = AutoencoderKL.from_pretrained(
+    pretrained_model_name_or_path, subfolder="vae", torch_dtype=torch.bfloat16).cuda()
+# Load text encoders
+text_encoder = CLIPTextModelWithProjection.from_pretrained(
+    pretrained_model_name_or_path, subfolder="text_encoder", torch_dtype=torch.bfloat16).cuda()
+tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
+text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
+    pretrained_model_name_or_path, subfolder="text_encoder_2", torch_dtype=torch.bfloat16).cuda()
+tokenizer_2 = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer_2")
+text_encoder_3 = T5EncoderModel.from_pretrained(
+    pretrained_model_name_or_path, subfolder="text_encoder_3", torch_dtype=torch.bfloat16).cuda()
+tokenizer_3 = T5TokenizerFast.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer_3")
+scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
+# Create pipeline
+pipeline = StableDiffusion3KontextPipeline(
+    transformer=transformer, vae=vae,
+    text_encoder=text_encoder, tokenizer=tokenizer,
+    text_encoder_2=text_encoder_2, tokenizer_2=tokenizer_2,
+    text_encoder_3=text_encoder_3, tokenizer_3=tokenizer_3,
+    scheduler=scheduler)
+# Generate image
+image = pipeline(
+    prompt='a pig with wings and a top hat flying over a happy futuristic scifi city',
+    negative_prompt='',
+    height=512, width=384,
+    num_inference_steps=50,
+    guidance_scale=3.5,
+    generator=torch.Generator(device=transformer.device).manual_seed(42)
+).images[0]
+image.save("text2image.png")
 ```
 The image editing feature within this unified model is an exploratory module at the forefront of research, and it is not yet production-ready.
 ```python
+# Load and preprocess image
+def fix_longer_edge(x, image_size, factor=32):
+    w, h = x.size
+    if w >= h:
+        target_w = image_size
+        target_h = h * (target_w / w)
+        target_h = round(target_h / factor) * factor
+    else:
+        target_h = image_size
+        target_w = w * (target_h / h)
+        target_w = round(target_w / factor) * factor
+    x = x.resize(size=(target_w, target_h))
+    return x
+image = Image.open("text2image.png")
+image = fix_longer_edge(image, image_size=512)
+negative_prompt = "blurry, low quality, low resolution, distorted, deformed, broken content, missing parts, damaged details, artifacts, glitch, noise, pixelated, grainy, compression artifacts, bad composition, wrong proportion, incomplete editing, unfinished, unedited areas."
+# Edit image
+edited_image = pipeline(
+    image=image,
+    prompt="remove the pig's hat",
+    negative_prompt=negative_prompt,
+    height=image.height, width=image.width,
+    num_inference_steps=50,
+    guidance_scale=3.5,
+    generator=torch.Generator(device=transformer.device).manual_seed(42)
+).images[0]
+edited_image.save("image_editing.png")
 ```
 ## 📄 License