Instructions to use Overworld/Waypoint-1-Small with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use Overworld/Waypoint-1-Small with Diffusers:
pip install -U diffusers transformers accelerate
```python
import torch
from diffusers import DiffusionPipeline

# switch to "mps" for Apple devices
pipe = DiffusionPipeline.from_pretrained("Overworld/Waypoint-1-Small", dtype=torch.bfloat16, device_map="cuda")

prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt).images[0]
```
- Notebooks
- Google Colab
- Kaggle
| # Copyright (C) 2025 Hugging Face Team and Overworld | |
| # | |
| # This program is free software: you can redistribute it and/or modify | |
| # it under the terms of the GNU General Public License as published by | |
| # the Free Software Foundation, either version 3 of the License, or | |
| # (at your option) any later version. | |
| # | |
| # This program is distributed in the hope that it will be useful, | |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| # GNU General Public License for more details. | |
| # | |
| # You should have received a copy of the GNU General Public License | |
| # along with this program. If not, see <https://www.gnu.org/licenses/>. | |
| import torch | |
| from torch import nn | |
| import torch.nn.functional as F | |
| from torch.nn.utils.parametrizations import weight_norm | |
| from torch.nn.utils.parametrize import remove_parametrizations | |
| def bake_weight_norm(model: nn.Module) -> nn.Module: | |
| """Remove weight_norm parametrizations, baking normalized weights into regular tensors. | |
| This is required for torch.compile/CUDA graph compatibility since weight_norm | |
| performs in-place updates during forward passes. | |
| """ | |
| for module in model.modules(): | |
| if hasattr(module, "parametrizations") and "weight" in getattr(module, "parametrizations", {}): | |
| remove_parametrizations(module, "weight", leave_parametrized=True) | |
| return model | |
| # === General Blocks === | |
| def WeightNormConv2d(*args, **kwargs): | |
| return weight_norm(nn.Conv2d(*args, **kwargs)) | |
class ResBlock(nn.Module):
    """Residual block: 1x1 expand -> grouped 3x3 -> 1x1 project, added onto the input.

    The hidden width is ``2 * ch``; the output has ``ch`` channels again so it
    can be summed with the input.
    """

    def __init__(self, ch):
        super().__init__()
        width = ch * 2
        # 16 channels per group (matches checkpoint shapes like [128,16,3,3]
        # when ch=64); never fewer than one group.
        groups = max(1, width // 16)

        self.conv1 = WeightNormConv2d(ch, width, 1, 1, 0)
        self.conv2 = WeightNormConv2d(width, width, 3, 1, 1, groups=groups)
        self.conv3 = WeightNormConv2d(width, ch, 1, 1, 0, bias=False)

        self.act1 = nn.LeakyReLU(inplace=False)
        self.act2 = nn.LeakyReLU(inplace=False)

    def forward(self, x):
        """Return ``x`` plus the output of the conv/activation branch."""
        branch = x
        for layer in (self.conv1, self.act1, self.conv2, self.act2, self.conv3):
            branch = layer(branch)
        return x + branch
| # === Encoder === | |
class LandscapeToSquare(nn.Module):
    """Resize the input to 512x512 with bicubic interpolation, then apply a 3x3 conv.

    Strict assumption of 360p input (per the original author's note).
    """

    def __init__(self, ch_in, ch_out):
        super().__init__()
        self.proj = WeightNormConv2d(ch_in, ch_out, 3, 1, 1)

    def forward(self, x):
        """Return the projected 512x512 feature map."""
        squared = F.interpolate(x, (512, 512), mode='bicubic')
        return self.proj(squared)
class Downsample(nn.Module):
    """Halve the spatial size with bicubic interpolation, then remap channels with a bias-free 1x1 conv."""

    def __init__(self, ch_in, ch_out):
        super().__init__()
        self.proj = WeightNormConv2d(ch_in, ch_out, 1, 1, 0, bias=False)

    def forward(self, x):
        """Return the downsampled, channel-projected tensor."""
        halved = F.interpolate(x, scale_factor=0.5, mode='bicubic')
        return self.proj(halved)
class DownBlock(nn.Module):
    """Encoder stage: ``num_res`` residual blocks at ``ch_in`` channels, then a ``Downsample`` to ``ch_out``."""

    def __init__(self, ch_in, ch_out, num_res=1):
        super().__init__()
        self.down = Downsample(ch_in, ch_out)
        # Residual blocks run before the downsample, so they use the input width.
        self.blocks = nn.ModuleList([ResBlock(ch_in) for _ in range(num_res)])

    def forward(self, x):
        """Apply the residual blocks in order, then downsample."""
        for res_block in self.blocks:
            x = res_block(x)
        return self.down(x)
class SpaceToChannel(nn.Module):
    """3x3 conv to ``ch_out // 4`` channels followed by a factor-2 pixel-unshuffle.

    The unshuffle multiplies the channel count by 4 and halves height and width.
    """

    def __init__(self, ch_in, ch_out):
        super().__init__()
        # Project to a quarter of the target width; pixel_unshuffle(2) restores it.
        self.proj = WeightNormConv2d(ch_in, ch_out // 4, 3, 1, 1)

    def forward(self, x):
        """Return the projected tensor with 2x2 spatial patches folded into channels."""
        projected = self.proj(x)
        return F.pixel_unshuffle(projected, 2).contiguous()
| class ChannelAverage(nn.Module): | |
| def __init__(self, ch_in, ch_out): | |
| super().__init__() | |
| self.proj = WeightNormConv2d(ch_in, ch_out, 3, 1, 1) | |
| self.grps = ch_in // ch_out | |
| self.scale = (self.grps) ** 0.5 | |
| def forward(self, x): | |
| res = x | |
| x = self.proj(x.contiguous()) # [b, ch_out, h, w] | |
| # Residual goes through channel avg | |
| res = res.view(res.shape[0], self.grps, res.shape[1] // self.grps, res.shape[2], res.shape[3]).contiguous() | |
| res = res.mean(dim=1) * self.scale # [b, ch_out, h, w] | |
| return res + x | |
| # === Decoder === | |
class SquareToLandscape(nn.Module):
    """Apply a 3x3 conv, then bicubically resize the result to 360x640 (height x width)."""

    def __init__(self, ch_in, ch_out):
        super().__init__()
        self.proj = WeightNormConv2d(ch_in, ch_out, 3, 1, 1)

    def forward(self, x):
        """Return the projected tensor resized to 360x640."""
        # TODO This ordering is wrong for both
        projected = self.proj(x)
        return F.interpolate(projected, (360, 640), mode='bicubic')
| class Upsample(nn.Module): | |
| def __init__(self, ch_in, ch_out): | |
| super().__init__() | |
| self.proj = nn.Identity() if ch_in == ch_out else WeightNormConv2d( | |
| ch_in, ch_out, 1, 1, 0, bias=False | |
| ) | |
| def forward(self, x): | |
| x = self.proj(x) | |
| x = F.interpolate(x, scale_factor=2.0, mode='bicubic') | |
| return x | |
class UpBlock(nn.Module):
    """Decoder stage: an ``Upsample`` from ``ch_in`` to ``ch_out``, then ``num_res`` residual blocks."""

    def __init__(self, ch_in, ch_out, num_res=1):
        super().__init__()
        self.up = Upsample(ch_in, ch_out)
        # Residual blocks run after the upsample, so they use the output width.
        self.blocks = nn.ModuleList([ResBlock(ch_out) for _ in range(num_res)])

    def forward(self, x):
        """Upsample, then apply the residual blocks in order."""
        out = self.up(x)
        for res_block in self.blocks:
            out = res_block(out)
        return out
class ChannelToSpace(nn.Module):
    """3x3 conv to ``ch_out * 4`` channels followed by a factor-2 pixel-shuffle.

    The shuffle divides the channel count by 4 and doubles height and width.
    """

    def __init__(self, ch_in, ch_out):
        super().__init__()
        # Project to four times the target width; pixel_shuffle(2) reduces it back.
        self.proj = WeightNormConv2d(ch_in, ch_out * 4, 3, 1, 1)

    def forward(self, x):
        """Return the projected tensor with channels unfolded into 2x2 spatial patches."""
        projected = self.proj(x)
        return F.pixel_shuffle(projected, 2).contiguous()
| class ChannelDuplication(nn.Module): | |
| def __init__(self, ch_in, ch_out): | |
| super().__init__() | |
| self.proj = WeightNormConv2d(ch_in, ch_out, 3, 1, 1) | |
| self.reps = ch_out // ch_in | |
| self.scale = (self.reps) ** -0.5 | |
| def forward(self, x): | |
| res = x | |
| x = self.proj(x.contiguous()) | |
| b, c, h, w = res.shape | |
| res = res.unsqueeze(2) # [b, c, 1, h, w] | |
| res = res.expand(b, c, self.reps, h, w) # [b, c, reps, h, w] | |
| res = res.reshape(b, c * self.reps, h, w).contiguous() | |
| res = res * self.scale | |
| return res + x | |
| # === Main AE === | |
class Encoder(nn.Module):
    """Encoder half of the autoencoder.

    Pipeline: ``LandscapeToSquare`` input conv, then one stage per entry of
    ``config.encoder_blocks_per_stage`` (each stage sums a ``DownBlock`` with a
    ``SpaceToChannel`` shortcut on the same input), then a ``ChannelAverage``
    head producing ``config.latent_channels`` channels.

    Config attributes read: ``channels``, ``ch_0``, ``ch_max``,
    ``encoder_blocks_per_stage``, ``latent_channels`` and, optionally,
    ``skip_logvar`` (defaults to ``False``).
    """

    def __init__(self, config):
        super().__init__()

        self.conv_in = LandscapeToSquare(config.channels, config.ch_0)

        stages = []
        shortcuts = []
        width = config.ch_0
        for depth in config.encoder_blocks_per_stage:
            # Channel count doubles each stage until it is capped at ch_max.
            wider = min(width * 2, config.ch_max)
            stages.append(DownBlock(width, wider, depth))
            shortcuts.append(SpaceToChannel(width, wider))
            width = wider

        self.blocks = nn.ModuleList(stages)
        self.residuals = nn.ModuleList(shortcuts)
        self.conv_out = ChannelAverage(width, config.latent_channels)

        self.skip_logvar = bool(getattr(config, "skip_logvar", False))
        if not self.skip_logvar:
            # Checkpoint expects a 1-channel logvar head: [1, ch, 3, 3]
            # NOTE: this head is only registered here; forward() does not call it.
            self.conv_out_logvar = WeightNormConv2d(width, 1, 3, 1, 1)

    def forward(self, x):
        """Return the output of ``conv_out`` after running all stages on ``x``."""
        h = self.conv_in(x)
        for stage, shortcut in zip(self.blocks, self.residuals):
            # Both paths consume the same stage input and are summed.
            h = stage(h) + shortcut(h)
        return self.conv_out(h)
class Decoder(nn.Module):
    """Decoder half of the autoencoder.

    Pipeline: ``ChannelDuplication`` from ``config.latent_channels`` to
    ``config.ch_max``, then one stage per entry of
    ``config.decoder_blocks_per_stage`` (each stage sums an ``UpBlock`` with a
    ``ChannelToSpace`` shortcut on the same input), then SiLU and a
    ``SquareToLandscape`` output conv producing ``config.channels`` channels.

    Config attributes read: ``channels``, ``ch_0``, ``ch_max``,
    ``decoder_blocks_per_stage``, ``latent_channels``.
    """

    def __init__(self, config):
        super().__init__()

        self.conv_in = ChannelDuplication(config.latent_channels, config.ch_max)

        # Stages are built from the narrowest (ch_0) upward and then reversed,
        # so that at run time the widest stage comes first.
        stages = []
        shortcuts = []
        width = config.ch_0
        for depth in reversed(config.decoder_blocks_per_stage):
            wider = min(width * 2, config.ch_max)
            stages.append(UpBlock(wider, width, depth))
            shortcuts.append(ChannelToSpace(wider, width))
            width = wider

        self.blocks = nn.ModuleList(stages[::-1])
        self.residuals = nn.ModuleList(shortcuts[::-1])

        self.act_out = nn.SiLU()
        self.conv_out = SquareToLandscape(config.ch_0, config.channels)

    def forward(self, x):
        """Return the output of ``conv_out`` after running all stages on ``x``."""
        h = self.conv_in(x)
        for stage, shortcut in zip(self.blocks, self.residuals):
            # Both paths consume the same stage input and are summed.
            h = stage(h) + shortcut(h)
        h = self.act_out(h)
        return self.conv_out(h)