Yiming-M committed on
Commit 0ecb9aa · verified · 1 Parent(s): ffd9437

2025-07-31 15:53 🐣

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. README.md +75 -3
  2. configs/bin_config.json +50 -0
  3. configs/nwpu.yaml +34 -0
  4. configs/qnrf.yaml +33 -0
  5. configs/sha.yaml +33 -0
  6. configs/shb.yaml +33 -0
  7. count.py +253 -0
  8. count.sh +5 -0
  9. counts/jhu.json +425 -0
  10. counts/jhu_max.json +74 -0
  11. counts/nwpu.json +761 -0
  12. counts/nwpu_max.json +74 -0
  13. counts/qnrf.json +569 -0
  14. counts/qnrf_max.json +74 -0
  15. counts/sha.json +578 -0
  16. counts/sha_max.json +74 -0
  17. counts/shb.json +313 -0
  18. counts/shb_max.json +74 -0
  19. datasets/__init__.py +12 -0
  20. datasets/crowd.py +309 -0
  21. datasets/transforms.py +262 -0
  22. datasets/utils.py +63 -0
  23. efficiency.py +163 -0
  24. evaluate.py +84 -0
  25. losses/__init__.py +7 -0
  26. losses/bregman_pytorch.py +70 -0
  27. losses/dm_loss.py +142 -0
  28. losses/dual_loss.py +175 -0
  29. losses/loss.py +204 -0
  30. losses/multiscale_mae.py +55 -0
  31. losses/poisson_nll.py +46 -0
  32. losses/utils.py +19 -0
  33. losses/zero_inflated_poisson_nll.py +96 -0
  34. models/__init__.py +155 -0
  35. models/clip_ebc/__init__.py +7 -0
  36. models/clip_ebc/__pycache__/__init__.cpython-312.pyc +0 -0
  37. models/clip_ebc/__pycache__/convnext.cpython-312.pyc +0 -0
  38. models/clip_ebc/__pycache__/mobileclip.cpython-312.pyc +0 -0
  39. models/clip_ebc/__pycache__/model.cpython-312.pyc +0 -0
  40. models/clip_ebc/__pycache__/resnet.cpython-312.pyc +0 -0
  41. models/clip_ebc/__pycache__/utils.cpython-312.pyc +0 -0
  42. models/clip_ebc/__pycache__/vit.cpython-312.pyc +0 -0
  43. models/clip_ebc/__pycache__/vit_siglip.cpython-312.pyc +0 -0
  44. models/clip_ebc/convnext.py +199 -0
  45. models/clip_ebc/mobileclip.py +197 -0
  46. models/clip_ebc/model.py +272 -0
  47. models/clip_ebc/resnet.py +236 -0
  48. models/clip_ebc/utils.py +137 -0
  49. models/clip_ebc/vit.py +372 -0
  50. models/ebc/__init__.py +3 -0
README.md CHANGED
@@ -1,3 +1,75 @@
1
- ---
2
- license: mit
3
- ---
1
+ # EBC-ZIP
2
+
3
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/ebc-zip-improving-blockwise-crowd-counting/crowd-counting-on-shanghaitech-a)](https://paperswithcode.com/sota/crowd-counting-on-shanghaitech-a?p=ebc-zip-improving-blockwise-crowd-counting)
4
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/ebc-zip-improving-blockwise-crowd-counting/crowd-counting-on-shanghaitech-b)](https://paperswithcode.com/sota/crowd-counting-on-shanghaitech-b?p=ebc-zip-improving-blockwise-crowd-counting)
5
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/ebc-zip-improving-blockwise-crowd-counting/crowd-counting-on-ucf-qnrf)](https://paperswithcode.com/sota/crowd-counting-on-ucf-qnrf?p=ebc-zip-improving-blockwise-crowd-counting)
6
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/ebc-zip-improving-blockwise-crowd-counting/crowd-counting-on-nwpu-crowd-val)](https://paperswithcode.com/sota/crowd-counting-on-nwpu-crowd-val?p=ebc-zip-improving-blockwise-crowd-counting)
7
+
8
+ The official implementation of the paper [*ZIP: Scalable Crowd Counting via Zero-Inflated Poisson Modeling*](https://arxiv.org/pdf/2506.19955).
9
+
10
+ ## Results
11
+
12
+ | **Variants** | **Size (M)** | **GFLOPS (on HD)** | **SHA (MAE)** | **SHA (RMSE)** | **SHA (NAE, %)** | **SHB (MAE)** | **SHB (RMSE)** | **SHB (NAE, %)** | **QNRF (MAE)** | **QNRF (RMSE)** | **QNRF (NAE, %)** |
13
+ |--------------|--------------|--------------------|---------------|----------------|------------------|---------------|----------------|------------------|----------------|-----------------|-------------------|
14
+ | -P (Pico) | 0.81 | 6.46 | 71.18 | 109.60 | 16.69 | 8.23 | 12.62 | 6.98 | 96.29 | 161.82 | 14.40 |
15
+ | -N (Nano) | 3.36 | 24.73 | 58.86 | 94.63 | 14.15 | 7.74 | 12.14 | 6.33 | 86.46 | 147.64 | 12.60 |
16
+ | -T (Tiny) | 10.53 | 61.39 | 56.36 | 86.09 | 13.26 | 6.67 | 9.90 | 5.52 | 76.02 | 129.40 | 11.10 |
17
+ | -S (Small) | 33.60 | 242.43 | 55.17 | 88.99 | 11.97 | 5.83 | 9.21 | 4.58 | 73.32 | 125.09 | 10.40 |
18
+ | -B (Base) | 105.60 | 800.99 | 47.81 | 75.04 | 11.06 | 5.51 | 8.63 | 4.48 | 69.46 | 121.88 | 10.18 |
19
+
20
+ ## Step 1: Install Dependencies
21
+
22
+ ```bash
23
+ pip install -r requirements.txt
24
+ ```
25
+
26
+ ## Step 2: Download Processed Datasets
27
+
28
+ - **ShanghaiTech A**: [sha.zip](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/sha.zip)
29
+ - **ShanghaiTech B**: [shb.zip](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/shb.zip)
30
+ - **UCF-QNRF**: [qnrf.zip](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/qnrf.zip), [qnrf.z01](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/qnrf.z01)
31
+ - **NWPU-Crowd**: [nwpu.zip](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/nwpu.zip), [nwpu.z01](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/nwpu.z01), [nwpu.z02](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/nwpu.z02), [nwpu.z03](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/nwpu.z03), [nwpu.z04](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/nwpu.z04), [nwpu.z05](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/nwpu.z05), [nwpu.z06](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/nwpu.z06), [nwpu.z07](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/nwpu.z07), [nwpu.z08](https://github.com/Yiming-M/EBC-ZIP/releases/download/dataset/nwpu.z08)
32
+
33
+ To unzip the split `.zip` archives, 7-Zip is recommended. You can install 7-Zip and extract a dataset with the following commands:
34
+
35
+ ```bash
36
+ sudo apt update
37
+ sudo apt install p7zip-full
38
+
39
+ 7z x dataset.zip
40
+ ```
41
+
42
+ ## Step 3: Run Training
43
+
44
+ Add the training command to `run.sh` and execute it:
45
+
46
+ ```bash
47
+ sh run.sh
48
+ ```
49
+
50
+ To use the zero-inflated loss, set either `--reg_loss` or `--aux_loss` to `zipnll`. For example, `--reg_loss zipnll` uses the zero-inflated loss for regression.
51
+
52
+ You can add an auxiliary loss to improve performance. For example, use the predefined multi-scale MAE loss by setting `--aux_loss msmae` and `--scales 1 2 4`.
53
+
54
+ The DMCount loss can also be used together with the zero-inflated loss. For example, you can set `--reg_loss zipnll --aux_loss dmcount` to use both losses.
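For intuition about what the `zipnll` option optimizes, here is a minimal, self-contained sketch of the textbook zero-inflated Poisson negative log-likelihood. It is written from the standard formula, not taken from `losses/zero_inflated_poisson_nll.py`; the function name and tensor shapes are illustrative only.

```python
import torch

def zip_nll(pi: torch.Tensor, lam: torch.Tensor, y: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Textbook zero-inflated Poisson NLL (illustrative sketch, not the repo's loss).

    pi  -- per-block probability of a structural zero (empty block), in (0, 1)
    lam -- per-block Poisson rate for occupied blocks, > 0
    y   -- observed integer count per block
    """
    # P(y = 0) = pi + (1 - pi) * exp(-lam)
    log_p_zero = torch.log(pi + (1.0 - pi) * torch.exp(-lam) + eps)
    # log P(y = k) = log(1 - pi) + k * log(lam) - lam - log(k!)  for k >= 1
    log_p_pos = torch.log(1.0 - pi + eps) + y * torch.log(lam + eps) - lam - torch.lgamma(y + 1.0)
    log_prob = torch.where(y == 0, log_p_zero, log_p_pos)
    return -log_prob.mean()

# Toy usage: four blocks, most of them empty -- the typical situation for crowd density maps.
pi = torch.full((4,), 0.6)
lam = torch.full((4,), 2.0)
y = torch.tensor([0.0, 0.0, 1.0, 3.0])
print(zip_nll(pi, lam, y))
```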
55
+
56
+
57
+ ## Step 4: Test the Model
58
+
59
+ Use `test.py` or `test.sh` to test the model. You can specify the dataset, weight path, input size, and other parameters.
60
+
61
+ To generate the predicted counts on NWPU-Crowd Test, you need to use `test_nwpu.py` instead.
62
+
63
+ To visualize the results, use the `notebooks/model.ipynb` notebook.
64
+
65
+ Trained weights are also provided:
66
+ - [**ShanghaiTech A**](https://github.com/Yiming-M/EBC-ZIP/releases/tag/weights_sha)
67
+ - [**ShanghaiTech B**](https://github.com/Yiming-M/EBC-ZIP/releases/tag/weights_shb)
68
+ - [**UCF-QNRF**](https://github.com/Yiming-M/EBC-ZIP/releases/tag/weights_qnrf)
69
+ - [**NWPU-Crowd**](https://github.com/Yiming-M/EBC-ZIP/releases/tag/weights_nwpu)
70
+
71
+ Make sure to use the processed datasets and the exact commands predefined in `test.sh` to reproduce the reported results.
72
+
73
+ ## Step 5: Visualize the Results
74
+
75
+ Use the `notebooks/model.ipynb` notebook to visualize the results.
configs/bin_config.json ADDED
@@ -0,0 +1,50 @@
1
+ {
2
+ "shb": {
3
+ "8": [[0, 0], [1, 1], [2, 2], [3, 3], [4, "inf"]],
4
+ "16": [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, "inf"]],
5
+ "32": [
6
+ [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9],
7
+ [10, 10], [11, 11], [12, 12], [13, 13], [14, 14],
8
+ [15, 16], [17, 18], [19, 20],
9
+ [21, 23], [24, "inf"]
10
+ ]
11
+ },
12
+ "sha": {
13
+ "8": [[0, 0], [1, 1], [2, 2], [3, 3], [4, "inf"]],
14
+ "16": [
15
+ [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9],
16
+ [10, 10], [11, 12], [13, 14], [15, "inf"]
17
+ ],
18
+ "32": [
19
+ [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9],
20
+ [10, 10], [11, 11], [12, 12], [13, 13], [14, 14], [15, 15], [16, 16], [17, 17], [18, 18], [19, 19],
21
+ [20, 21], [22, 23], [24, 25], [26, 27], [28, 29],
22
+ [30, 32], [33, 35], [36, 38], [39, 41],
23
+ [42, 45], [46, "inf"]
24
+ ]
25
+ },
26
+ "qnrf": {
27
+ "8": [[0, 0], [1, 1], [2, 2], [3, 3], [4, "inf"]],
28
+ "16": [
29
+ [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9],
30
+ [10, 10], [11, 12], [13, "inf"]
31
+ ],
32
+ "32": [
33
+ [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9],
34
+ [10, 10], [11, 12], [13, 14], [15, 16], [17, 18], [19, 20],
35
+ [21, 23], [24, 26], [27, 29], [30, 33], [34, "inf"]
36
+ ]
37
+ },
38
+ "nwpu": {
39
+ "8": [[0, 0], [1, 1], [2, 2], [3, 3], [4, "inf"]],
40
+ "16": [
41
+ [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9],
42
+ [10, "inf"]
43
+ ],
44
+ "32": [
45
+ [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9],
46
+ [10, 11], [12, 13], [14, 15], [16, 17], [18, 19],
47
+ [20, 22], [23, 25], [26, 28], [29, "inf"]
48
+ ]
49
+ }
50
+ }
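The intervals above appear to define the classification bins for blockwise counts at each block size, with `"inf"` marking an open-ended top bin. Below is a hypothetical helper (not the repo's own loader; the function names are made up) showing how such a file could be read and how a block count would map to a bin index.

```python
import json

def load_bins(path: str, dataset: str, block_size: int):
    """Read the [low, high] count intervals for one dataset/block size (sketch only)."""
    with open(path) as f:
        cfg = json.load(f)
    bins = cfg[dataset][str(block_size)]
    # The JSON stores the open upper bound as the string "inf"; convert it for comparisons.
    return [(lo, float("inf") if hi == "inf" else hi) for lo, hi in bins]

def bin_index(count: int, bins) -> int:
    """Return the index of the interval that contains an integer block count."""
    for i, (lo, hi) in enumerate(bins):
        if lo <= count <= hi:
            return i
    raise ValueError(f"count {count} is not covered by any bin")

bins = load_bins("configs/bin_config.json", "sha", 16)
print(bins[-1])             # the open-ended top bin, (15, inf)
print(bin_index(12, bins))  # counts 11 and 12 share one bin at this block size
```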
configs/nwpu.yaml ADDED
@@ -0,0 +1,34 @@
1
+ metadata:
2
+ name: NWPU
3
+ description: Training configuration on the NWPU dataset.
4
+
5
+ input_size: 672
6
+ block_size: 16
7
+ batch_size: 8
8
+ num_crops: 1
9
+
10
+ aug_min_scale: 0.75
11
+ aug_max_scale: 2.0
12
+ aug_brightness: 0.2
13
+ aug_contrast: 0.2
14
+ aug_saturation: 0.15
15
+ aug_hue: 0.0
16
+ aug_kernel_size: 5
17
+ aug_blur_prob: 0.2
18
+ aug_saltiness: 0.001
19
+ aug_spiciness: 0.001
20
+
21
+ lr: 0.0001
22
+ vpt_lr: 0.0001
23
+ adapter_lr: 0.0001
24
+ lora_lr: 0.0001
25
+ backbone_lr: 0.0001
26
+
27
+ weight_decay: 0.0001
28
+ vpt_weight_decay: 0.0001
29
+ adapter_weight_decay: 0.0001
30
+ lora_weight_decay: 0.0001
31
+ backbone_weight_decay: 0.0001
32
+
33
+ eval_freq: 1.0
34
+ eval_start: 100
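This YAML file (and the similar ones below) only declares hyperparameters. A minimal sketch for inspecting it, assuming PyYAML is available (the repo's own config loading may differ):

```python
import yaml

with open("configs/nwpu.yaml") as f:
    cfg = yaml.safe_load(f)

# Crop size, ground-truth block size, and batch size used for NWPU training.
print(cfg["metadata"]["name"])                                  # NWPU
print(cfg["input_size"], cfg["block_size"], cfg["batch_size"])  # 672 16 8
```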
configs/qnrf.yaml ADDED
@@ -0,0 +1,33 @@
1
+ metadata:
2
+ name: qnrf
3
+ description: Training configuration on the UCF-QNRF dataset.
4
+
5
+ input_size: 672
6
+ block_size: 32
7
+ batch_size: 8
8
+ num_crops: 1
9
+
10
+ aug_min_scale: 0.75
11
+ aug_max_scale: 2.0
12
+ aug_brightness: 0.15
13
+ aug_contrast: 0.15
14
+ aug_saturation: 0.1
15
+ aug_hue: 0.0
16
+ aug_blur_prob: 0.0
17
+ aug_saltiness: 0.001
18
+ aug_spiciness: 0.001
19
+
20
+ lr: 0.0001
21
+ vpt_lr: 0.0001
22
+ adapter_lr: 0.0001
23
+ lora_lr: 0.0001
24
+ backbone_lr: 0.0001
25
+
26
+ weight_decay: 0.0001
27
+ vpt_weight_decay: 0.0001
28
+ adapter_weight_decay: 0.0001
29
+ lora_weight_decay: 0.0001
30
+ backbone_weight_decay: 0.0001
31
+
32
+ eval_freq: 0.5
33
+ eval_start: 150
configs/sha.yaml ADDED
@@ -0,0 +1,33 @@
1
+ metadata:
2
+ name: sha
3
+ description: Training configuration on the ShanghaiTech A dataset.
4
+
5
+ input_size: 448
6
+ block_size: 16
7
+ batch_size: 8
8
+ num_crops: 1
9
+
10
+ aug_min_scale: 0.75
11
+ aug_max_scale: 2.0
12
+ aug_brightness: 0.15
13
+ aug_contrast: 0.15
14
+ aug_saturation: 0.1
15
+ aug_hue: 0.0
16
+ aug_blur_prob: 0.0
17
+ aug_saltiness: 0.001
18
+ aug_spiciness: 0.001
19
+
20
+ lr: 0.0001
21
+ vpt_lr: 0.0001
22
+ adapter_lr: 0.0001
23
+ lora_lr: 0.0001
24
+ backbone_lr: 0.0001
25
+
26
+ weight_decay: 0.0001
27
+ vpt_weight_decay: 0.0001
28
+ adapter_weight_decay: 0.0001
29
+ lora_weight_decay: 0.0001
30
+ backbone_weight_decay: 0.0001
31
+
32
+ eval_freq: 0.25
33
+ eval_start: 100
configs/shb.yaml ADDED
@@ -0,0 +1,33 @@
1
+ metadata:
2
+ name: shb
3
+ description: Training configuration on the ShanghaiTech B dataset.
4
+
5
+ input_size: 448
6
+ block_size: 16
7
+ batch_size: 8
8
+ num_crops: 1
9
+
10
+ aug_min_scale: 0.75
11
+ aug_max_scale: 2.5
12
+ aug_brightness: 0.15
13
+ aug_contrast: 0.15
14
+ aug_saturation: 0.1
15
+ aug_hue: 0.0
16
+ aug_blur_prob: 0.0
17
+ aug_saltiness: 0.001
18
+ aug_spiciness: 0.001
19
+
20
+ lr: 0.0001
21
+ vpt_lr: 0.0001
22
+ adapter_lr: 0.0001
23
+ lora_lr: 0.0001
24
+ backbone_lr: 0.0001
25
+
26
+ weight_decay: 0.0001
27
+ vpt_weight_decay: 0.0001
28
+ adapter_weight_decay: 0.0001
29
+ lora_weight_decay: 0.0001
30
+ backbone_weight_decay: 0.0001
31
+
32
+ eval_freq: 0.25
33
+ eval_start: 150
count.py ADDED
@@ -0,0 +1,253 @@
1
+ import torch
2
+ from torch import nn
3
+ import numpy as np
4
+ import os, json
5
+ from tqdm import tqdm
6
+ from argparse import ArgumentParser
7
+ from typing import Dict
8
+
9
+ import datasets
10
+
11
+
12
+ class SumPool2d(nn.Module):
13
+ def __init__(self, kernel_size: int, stride: int):
14
+ super(SumPool2d, self).__init__()
15
+ self.kernel_size = kernel_size
16
+ self.stride = stride
17
+ self.sum_pool = nn.AvgPool2d(kernel_size, stride, divisor_override=1)
18
+
19
+ def forward(self, x):
20
+ return self.sum_pool(x)
21
+
22
+
23
+ def _update_dict(d: Dict, keys: np.ndarray, values: np.ndarray) -> Dict:
24
+ keys = keys.tolist() if isinstance(keys, np.ndarray) else keys
25
+ values = values.tolist() if isinstance(values, np.ndarray) else values
26
+ for k, v in zip(keys, values):
27
+ d[k] = d.get(k, 0) + v
28
+
29
+ return d
30
+
31
+
32
+ def _get_counts(
33
+ dataset_name: str,
34
+ device: torch.device,
35
+ ) -> None:
36
+ filter_4 = SumPool2d(4, 1).to(device)
37
+ filter_7 = SumPool2d(7, 1).to(device)
38
+ filter_8 = SumPool2d(8, 1).to(device)
39
+ filter_14 = SumPool2d(14, 1).to(device)
40
+ filter_16 = SumPool2d(16, 1).to(device)
41
+ filter_28 = SumPool2d(28, 1).to(device)
42
+ filter_32 = SumPool2d(32, 1).to(device)
43
+ filter_56 = SumPool2d(56, 1).to(device)
44
+ filter_64 = SumPool2d(64, 1).to(device)
45
+ counts_1, counts_4, counts_7, counts_8 = {}, {}, {}, {}
46
+ counts_14, counts_16 = {}, {}
47
+ counts_28, counts_32 = {}, {}
48
+ counts_56, counts_64 = {}, {}
49
+
50
+ max_counts_4 = {"max": 0., "name": None, "x": None, "y": None}
51
+ max_counts_7 = {"max": 0., "name": None, "x": None, "y": None}
52
+ max_counts_8 = {"max": 0., "name": None, "x": None, "y": None}
53
+ max_counts_14 = {"max": 0., "name": None, "x": None, "y": None}
54
+ max_counts_16 = {"max": 0., "name": None, "x": None, "y": None}
55
+ max_counts_28 = {"max": 0., "name": None, "x": None, "y": None}
56
+ max_counts_32 = {"max": 0., "name": None, "x": None, "y": None}
57
+ max_counts_56 = {"max": 0., "name": None, "x": None, "y": None}
58
+ max_counts_64 = {"max": 0., "name": None, "x": None, "y": None}
59
+
60
+ counts_dir = os.path.join(os.getcwd(), "counts")
61
+ os.makedirs(counts_dir, exist_ok=True)
62
+
63
+ dataset = datasets.Crowd(dataset=dataset_name, split="train", transforms=None, return_filename=True)
64
+ print(f"Counting {dataset_name} dataset")
65
+
66
+ for i in tqdm(range(len(dataset))):
67
+ _, _, density, img_name = dataset[i]
68
+ density_np = density.cpu().numpy().astype(int)
69
+ uniques_, counts_ = np.unique(density_np, return_counts=True)
70
+ counts_1 = _update_dict(counts_1, uniques_, counts_)
71
+
72
+ density = density.to(device)  # Move the density map to the compute device
73
+ window_4, window_7, window_8 = filter_4(density), filter_7(density), filter_8(density)
74
+ window_14, window_16 = filter_14(density), filter_16(density)
75
+ window_28, window_32 = filter_28(density), filter_32(density)
76
+ window_56, window_64 = filter_56(density), filter_64(density)
77
+
78
+ window_4, window_7, window_8 = torch.round(window_4).int(), torch.round(window_7).int(), torch.round(window_8).int()
79
+ window_14, window_16 = torch.round(window_14).int(), torch.round(window_16).int()
80
+ window_28, window_32 = torch.round(window_28).int(), torch.round(window_32).int()
81
+ window_56, window_64 = torch.round(window_56).int(), torch.round(window_64).int()
82
+
83
+ window_4, window_7, window_8 = torch.squeeze(window_4), torch.squeeze(window_7), torch.squeeze(window_8)
84
+ window_14, window_16 = torch.squeeze(window_14), torch.squeeze(window_16)
85
+ window_28, window_32 = torch.squeeze(window_28), torch.squeeze(window_32)
86
+ window_56, window_64 = torch.squeeze(window_56), torch.squeeze(window_64)
87
+
88
+ if window_4.max().item() > max_counts_4["max"]:
89
+ max_counts_4["max"] = window_4.max().item()
90
+ max_counts_4["name"] = img_name
91
+ x, y = torch.where(window_4 == window_4.max())
92
+ x, y = x[0].item(), y[0].item()
93
+ max_counts_4["x"] = x
94
+ max_counts_4["y"] = y
95
+
96
+ if window_7.max().item() > max_counts_7["max"]:
97
+ max_counts_7["max"] = window_7.max().item()
98
+ max_counts_7["name"] = img_name
99
+ x, y = torch.where(window_7 == window_7.max())
100
+ x, y = x[0].item(), y[0].item()
101
+ max_counts_7["x"] = x
102
+ max_counts_7["y"] = y
103
+
104
+ if window_8.max().item() > max_counts_8["max"]:
105
+ max_counts_8["max"] = window_8.max().item()
106
+ max_counts_8["name"] = img_name
107
+ x, y = torch.where(window_8 == window_8.max())
108
+ x, y = x[0].item(), y[0].item()
109
+ max_counts_8["x"] = x
110
+ max_counts_8["y"] = y
111
+
112
+ if window_14.max().item() > max_counts_14["max"]:
113
+ max_counts_14["max"] = window_14.max().item()
114
+ max_counts_14["name"] = img_name
115
+ x, y = torch.where(window_14 == window_14.max())
116
+ x, y = x[0].item(), y[0].item()
117
+ max_counts_14["x"] = x
118
+ max_counts_14["y"] = y
119
+
120
+ if window_16.max().item() > max_counts_16["max"]:
121
+ max_counts_16["max"] = window_16.max().item()
122
+ max_counts_16["name"] = img_name
123
+ x, y = torch.where(window_16 == window_16.max())
124
+ x, y = x[0].item(), y[0].item()
125
+ max_counts_16["x"] = x
126
+ max_counts_16["y"] = y
127
+
128
+ if window_28.max().item() > max_counts_28["max"]:
129
+ max_counts_28["max"] = window_28.max().item()
130
+ max_counts_28["name"] = img_name
131
+ x, y = torch.where(window_28 == window_28.max())
132
+ x, y = x[0].item(), y[0].item()
133
+ max_counts_28["x"] = x
134
+ max_counts_28["y"] = y
135
+
136
+ if window_32.max().item() > max_counts_32["max"]:
137
+ max_counts_32["max"] = window_32.max().item()
138
+ max_counts_32["name"] = img_name
139
+ x, y = torch.where(window_32 == window_32.max())
140
+ x, y = x[0].item(), y[0].item()
141
+ max_counts_32["x"] = x
142
+ max_counts_32["y"] = y
143
+
144
+ if window_56.max().item() > max_counts_56["max"]:
145
+ max_counts_56["max"] = window_56.max().item()
146
+ max_counts_56["name"] = img_name
147
+ x, y = torch.where(window_56 == window_56.max())
148
+ x, y = x[0].item(), y[0].item()
149
+ max_counts_56["x"] = x
150
+ max_counts_56["y"] = y
151
+
152
+ if window_64.max().item() > max_counts_64["max"]:
153
+ max_counts_64["max"] = window_64.max().item()
154
+ max_counts_64["name"] = img_name
155
+ x, y = torch.where(window_64 == window_64.max())
156
+ x, y = x[0].item(), y[0].item()
157
+ max_counts_64["x"] = x
158
+ max_counts_64["y"] = y
159
+
160
+ window_4 = window_4.view(-1).cpu().numpy().astype(int)
161
+ window_7 = window_7.view(-1).cpu().numpy().astype(int)
162
+ window_8 = window_8.view(-1).cpu().numpy().astype(int)
163
+ window_14 = window_14.view(-1).cpu().numpy().astype(int)
164
+ window_16 = window_16.view(-1).cpu().numpy().astype(int)
165
+ window_28 = window_28.view(-1).cpu().numpy().astype(int)
166
+ window_32 = window_32.view(-1).cpu().numpy().astype(int)
167
+ window_56 = window_56.view(-1).cpu().numpy().astype(int)
168
+ window_64 = window_64.view(-1).cpu().numpy().astype(int)
169
+ #.view(-1).cpu().numpy().astype(int)
170
+
171
+ uniques_, counts_ = np.unique(window_4, return_counts=True)
172
+ counts_4 = _update_dict(counts_4, uniques_, counts_)
173
+
174
+ uniques_, counts_ = np.unique(window_7, return_counts=True)
175
+ counts_7 = _update_dict(counts_7, uniques_, counts_)
176
+
177
+ uniques_, counts_ = np.unique(window_8, return_counts=True)
178
+ counts_8 = _update_dict(counts_8, uniques_, counts_)
179
+
180
+ uniques_, counts_ = np.unique(window_14, return_counts=True)
181
+ counts_14 = _update_dict(counts_14, uniques_, counts_)
182
+
183
+ uniques_, counts_ = np.unique(window_16, return_counts=True)
184
+ counts_16 = _update_dict(counts_16, uniques_, counts_)
185
+
186
+ uniques_, counts_ = np.unique(window_28, return_counts=True)
187
+ counts_28 = _update_dict(counts_28, uniques_, counts_)
188
+
189
+ uniques_, counts_ = np.unique(window_32, return_counts=True)
190
+ counts_32 = _update_dict(counts_32, uniques_, counts_)
191
+
192
+ uniques_, counts_ = np.unique(window_56, return_counts=True)
193
+ counts_56 = _update_dict(counts_56, uniques_, counts_)
194
+
195
+ uniques_, counts_ = np.unique(window_64, return_counts=True)
196
+ counts_64 = _update_dict(counts_64, uniques_, counts_)
197
+
198
+ counts = {
199
+ 1: counts_1,
200
+ 4: counts_4,
201
+ 7: counts_7,
202
+ 8: counts_8,
203
+ 14: counts_14,
204
+ 16: counts_16,
205
+ 28: counts_28,
206
+ 32: counts_32,
207
+ 56: counts_56,
208
+ 64: counts_64
209
+ }
210
+
211
+ max_counts = {
212
+ 4: max_counts_4,
213
+ 7: max_counts_7,
214
+ 8: max_counts_8,
215
+ 14: max_counts_14,
216
+ 16: max_counts_16,
217
+ 28: max_counts_28,
218
+ 32: max_counts_32,
219
+ 56: max_counts_56,
220
+ 64: max_counts_64
221
+ }
222
+
223
+ with open(os.path.join(counts_dir, f"{dataset_name}.json"), "w") as f:
224
+ json.dump(counts, f)
225
+
226
+ with open(os.path.join(counts_dir, f"{dataset_name}_max.json"), "w") as f:
227
+ json.dump(max_counts, f)
228
+
229
+
230
+ def parse_args():
231
+ parser = ArgumentParser(description="Get local counts of the dataset")
232
+ parser.add_argument(
233
+ "--dataset",
234
+ type=str,
235
+ choices=["nwpu", "ucf_qnrf", "shanghaitech_a", "shanghaitech_b"],
236
+ required=True,
237
+ help="The dataset to use."
238
+ )
239
+ parser.add_argument(
240
+ "--device",
241
+ type=str,
242
+ default="cuda",
243
+ help="The device to use."
244
+ )
245
+ args = parser.parse_args()
246
+ return args
247
+
248
+
249
+ if __name__ == "__main__":
250
+ args = parse_args()
251
+ args.dataset = datasets.standardize_dataset_name(args.dataset)
252
+ args.device = torch.device(args.device)
253
+ _get_counts(args.dataset, args.device)
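`count.py` builds its sum pooling from `nn.AvgPool2d` with `divisor_override=1`, so each output value is the sum of the density map inside one window; it slides the window with stride 1 to enumerate every window position. A minimal standalone check of that trick (not part of the repo; non-overlapping blocks are used here for brevity):

```python
import torch
from torch import nn

# Two point annotations on an 8x8 density map.
density = torch.zeros(1, 1, 8, 8)
density[0, 0, 2, 3] = 1.0
density[0, 0, 5, 6] = 1.0

# divisor_override=1 turns average pooling into sum pooling.
sum_pool = nn.AvgPool2d(kernel_size=4, stride=4, divisor_override=1)
block_counts = sum_pool(density)

print(block_counts.squeeze())                              # per-block head counts
assert block_counts.sum().item() == density.sum().item()   # total mass is preserved
```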
count.sh ADDED
@@ -0,0 +1,5 @@
1
+ #!/bin/sh
2
+ python count.py --dataset shanghaitech_a --device cuda:0
3
+ python count.py --dataset shanghaitech_b --device cuda:0
4
+ python count.py --dataset nwpu --device cuda:0
5
+ python count.py --dataset ucf_qnrf --device cuda:0
counts/jhu.json ADDED
@@ -0,0 +1,425 @@
1
+ {
2
+ "1": {
3
+ "0": 5442129077,
4
+ "1": 844619
5
+ },
6
+ "4": {
7
+ "0": 5411259934,
8
+ "1": 13337323,
9
+ "2": 75154,
10
+ "3": 1725,
11
+ "4": 40
12
+ },
13
+ "7": {
14
+ "0": 5366145063,
15
+ "1": 39388535,
16
+ "2": 807008,
17
+ "3": 68635,
18
+ "4": 5975,
19
+ "5": 318,
20
+ "6": 17,
21
+ "7": 1
22
+ },
23
+ "8": {
24
+ "0": 5348298656,
25
+ "1": 50463806,
26
+ "2": 1400221,
27
+ "3": 154835,
28
+ "4": 19051,
29
+ "5": 1731,
30
+ "6": 121,
31
+ "7": 11
32
+ },
33
+ "14": {
34
+ "0": 5220148724,
35
+ "1": 129080801,
36
+ "2": 11196548,
37
+ "3": 2346703,
38
+ "4": 762426,
39
+ "5": 281109,
40
+ "6": 104707,
41
+ "7": 35659,
42
+ "8": 10533,
43
+ "9": 2989,
44
+ "10": 724,
45
+ "11": 196,
46
+ "12": 16,
47
+ "13": 1
48
+ },
49
+ "16": {
50
+ "0": 5172190839,
51
+ "1": 156244565,
52
+ "2": 17047061,
53
+ "3": 3987628,
54
+ "4": 1373739,
55
+ "5": 580316,
56
+ "6": 265393,
57
+ "7": 117895,
58
+ "8": 48278,
59
+ "9": 18825,
60
+ "10": 6835,
61
+ "11": 2535,
62
+ "12": 909,
63
+ "13": 209,
64
+ "14": 27,
65
+ "15": 2
66
+ },
67
+ "28": {
68
+ "0": 4868806093,
69
+ "1": 296210451,
70
+ "2": 64607415,
71
+ "3": 23796771,
72
+ "4": 11220229,
73
+ "5": 5869184,
74
+ "6": 3249319,
75
+ "7": 1854162,
76
+ "8": 1153843,
77
+ "9": 778472,
78
+ "10": 561910,
79
+ "11": 425259,
80
+ "12": 332715,
81
+ "13": 255032,
82
+ "14": 191332,
83
+ "15": 137704,
84
+ "16": 95475,
85
+ "17": 64842,
86
+ "18": 43528,
87
+ "19": 29738,
88
+ "20": 20028,
89
+ "21": 13687,
90
+ "22": 9609,
91
+ "23": 7228,
92
+ "24": 4847,
93
+ "25": 3457,
94
+ "26": 2563,
95
+ "27": 1831,
96
+ "28": 1349,
97
+ "29": 917,
98
+ "30": 589,
99
+ "31": 360,
100
+ "32": 213,
101
+ "33": 94,
102
+ "34": 22,
103
+ "35": 4
104
+ },
105
+ "32": {
106
+ "0": 4768229484,
107
+ "1": 332242168,
108
+ "2": 81810540,
109
+ "3": 32189657,
110
+ "4": 16022983,
111
+ "5": 8984314,
112
+ "6": 5419164,
113
+ "7": 3339453,
114
+ "8": 2097270,
115
+ "9": 1359271,
116
+ "10": 927341,
117
+ "11": 673849,
118
+ "12": 519302,
119
+ "13": 413081,
120
+ "14": 339682,
121
+ "15": 282493,
122
+ "16": 235154,
123
+ "17": 189365,
124
+ "18": 147778,
125
+ "19": 111779,
126
+ "20": 83938,
127
+ "21": 61440,
128
+ "22": 44843,
129
+ "23": 32312,
130
+ "24": 23514,
131
+ "25": 17003,
132
+ "26": 12718,
133
+ "27": 9671,
134
+ "28": 7115,
135
+ "29": 5853,
136
+ "30": 4515,
137
+ "31": 3342,
138
+ "32": 2525,
139
+ "33": 1880,
140
+ "34": 1522,
141
+ "35": 1199,
142
+ "36": 1034,
143
+ "37": 733,
144
+ "38": 561,
145
+ "39": 400,
146
+ "40": 287,
147
+ "41": 134,
148
+ "42": 62,
149
+ "43": 19,
150
+ "44": 4
151
+ },
152
+ "56": {
153
+ "0": 4222181888,
154
+ "1": 453337627,
155
+ "2": 170668322,
156
+ "3": 85503361,
157
+ "4": 50077828,
158
+ "5": 32125898,
159
+ "6": 22063372,
160
+ "7": 15687182,
161
+ "8": 11585957,
162
+ "9": 8807535,
163
+ "10": 6902417,
164
+ "11": 5494688,
165
+ "12": 4464497,
166
+ "13": 3672794,
167
+ "14": 3059884,
168
+ "15": 2569337,
169
+ "16": 2181015,
170
+ "17": 1848256,
171
+ "18": 1568914,
172
+ "19": 1327646,
173
+ "20": 1110617,
174
+ "21": 923381,
175
+ "22": 763225,
176
+ "23": 634769,
177
+ "24": 533036,
178
+ "25": 446198,
179
+ "26": 375536,
180
+ "27": 319752,
181
+ "28": 277970,
182
+ "29": 246034,
183
+ "30": 221081,
184
+ "31": 200820,
185
+ "32": 185527,
186
+ "33": 172457,
187
+ "34": 163190,
188
+ "35": 155461,
189
+ "36": 149548,
190
+ "37": 144236,
191
+ "38": 139882,
192
+ "39": 134703,
193
+ "40": 129346,
194
+ "41": 123503,
195
+ "42": 117688,
196
+ "43": 109973,
197
+ "44": 101970,
198
+ "45": 94300,
199
+ "46": 87095,
200
+ "47": 80710,
201
+ "48": 73843,
202
+ "49": 66773,
203
+ "50": 61099,
204
+ "51": 55590,
205
+ "52": 48984,
206
+ "53": 43741,
207
+ "54": 38838,
208
+ "55": 34038,
209
+ "56": 30826,
210
+ "57": 28088,
211
+ "58": 25668,
212
+ "59": 23430,
213
+ "60": 21750,
214
+ "61": 18902,
215
+ "62": 16508,
216
+ "63": 14272,
217
+ "64": 12549,
218
+ "65": 10596,
219
+ "66": 9228,
220
+ "67": 8081,
221
+ "68": 7185,
222
+ "69": 6284,
223
+ "70": 5698,
224
+ "71": 5124,
225
+ "72": 4488,
226
+ "73": 3761,
227
+ "74": 3171,
228
+ "75": 2908,
229
+ "76": 2554,
230
+ "77": 2211,
231
+ "78": 1956,
232
+ "79": 1784,
233
+ "80": 1529,
234
+ "81": 1317,
235
+ "82": 1189,
236
+ "83": 1136,
237
+ "84": 1086,
238
+ "85": 1012,
239
+ "86": 890,
240
+ "87": 914,
241
+ "88": 895,
242
+ "89": 832,
243
+ "90": 698,
244
+ "91": 607,
245
+ "92": 546,
246
+ "93": 526,
247
+ "94": 411,
248
+ "95": 386,
249
+ "96": 372,
250
+ "97": 415,
251
+ "98": 428,
252
+ "99": 487,
253
+ "100": 506,
254
+ "101": 549,
255
+ "102": 453,
256
+ "103": 475,
257
+ "104": 432,
258
+ "105": 391,
259
+ "106": 349,
260
+ "107": 307,
261
+ "108": 236,
262
+ "109": 183,
263
+ "110": 162,
264
+ "111": 128,
265
+ "112": 97,
266
+ "113": 48,
267
+ "114": 34,
268
+ "115": 14,
269
+ "116": 10,
270
+ "117": 7,
271
+ "118": 3,
272
+ "119": 1,
273
+ "120": 1
274
+ },
275
+ "64": {
276
+ "0": 4064136120,
277
+ "1": 469518405,
278
+ "2": 190549696,
279
+ "3": 101410734,
280
+ "4": 61441010,
281
+ "5": 40341860,
282
+ "6": 28363124,
283
+ "7": 20699526,
284
+ "8": 15647286,
285
+ "9": 12025617,
286
+ "10": 9421729,
287
+ "11": 7602900,
288
+ "12": 6244037,
289
+ "13": 5183786,
290
+ "14": 4355369,
291
+ "15": 3680829,
292
+ "16": 3145664,
293
+ "17": 2707446,
294
+ "18": 2348723,
295
+ "19": 2053730,
296
+ "20": 1802355,
297
+ "21": 1584446,
298
+ "22": 1402996,
299
+ "23": 1243258,
300
+ "24": 1087095,
301
+ "25": 947714,
302
+ "26": 818905,
303
+ "27": 707951,
304
+ "28": 615285,
305
+ "29": 531101,
306
+ "30": 459448,
307
+ "31": 397639,
308
+ "32": 343028,
309
+ "33": 295704,
310
+ "34": 259036,
311
+ "35": 229935,
312
+ "36": 207856,
313
+ "37": 189177,
314
+ "38": 173617,
315
+ "39": 158969,
316
+ "40": 147768,
317
+ "41": 139725,
318
+ "42": 132730,
319
+ "43": 127226,
320
+ "44": 122630,
321
+ "45": 118232,
322
+ "46": 115769,
323
+ "47": 114576,
324
+ "48": 111942,
325
+ "49": 107720,
326
+ "50": 105347,
327
+ "51": 101643,
328
+ "52": 98838,
329
+ "53": 96240,
330
+ "54": 91117,
331
+ "55": 87247,
332
+ "56": 82358,
333
+ "57": 77480,
334
+ "58": 72990,
335
+ "59": 68837,
336
+ "60": 65050,
337
+ "61": 61515,
338
+ "62": 57758,
339
+ "63": 53659,
340
+ "64": 50371,
341
+ "65": 45903,
342
+ "66": 42190,
343
+ "67": 39241,
344
+ "68": 35555,
345
+ "69": 32655,
346
+ "70": 29239,
347
+ "71": 26825,
348
+ "72": 24122,
349
+ "73": 22333,
350
+ "74": 21327,
351
+ "75": 19766,
352
+ "76": 18539,
353
+ "77": 16797,
354
+ "78": 15217,
355
+ "79": 13961,
356
+ "80": 12377,
357
+ "81": 11299,
358
+ "82": 9960,
359
+ "83": 8982,
360
+ "84": 7921,
361
+ "85": 7244,
362
+ "86": 6267,
363
+ "87": 5707,
364
+ "88": 5185,
365
+ "89": 4541,
366
+ "90": 4292,
367
+ "91": 3572,
368
+ "92": 3041,
369
+ "93": 2757,
370
+ "94": 2416,
371
+ "95": 2182,
372
+ "96": 1973,
373
+ "97": 1646,
374
+ "98": 1472,
375
+ "99": 1468,
376
+ "100": 1411,
377
+ "101": 1402,
378
+ "102": 1289,
379
+ "103": 1163,
380
+ "104": 983,
381
+ "105": 838,
382
+ "106": 777,
383
+ "107": 744,
384
+ "108": 689,
385
+ "109": 651,
386
+ "110": 651,
387
+ "111": 586,
388
+ "112": 523,
389
+ "113": 508,
390
+ "114": 464,
391
+ "115": 446,
392
+ "116": 428,
393
+ "117": 423,
394
+ "118": 390,
395
+ "119": 417,
396
+ "120": 363,
397
+ "121": 317,
398
+ "122": 316,
399
+ "123": 339,
400
+ "124": 340,
401
+ "125": 372,
402
+ "126": 372,
403
+ "127": 339,
404
+ "128": 403,
405
+ "129": 405,
406
+ "130": 428,
407
+ "131": 406,
408
+ "132": 409,
409
+ "133": 419,
410
+ "134": 396,
411
+ "135": 311,
412
+ "136": 288,
413
+ "137": 243,
414
+ "138": 195,
415
+ "139": 150,
416
+ "140": 158,
417
+ "141": 114,
418
+ "142": 105,
419
+ "143": 67,
420
+ "144": 33,
421
+ "145": 11,
422
+ "146": 7,
423
+ "147": 1
424
+ }
425
+ }
counts/jhu_max.json ADDED
@@ -0,0 +1,74 @@
1
+ {
2
+ "4": {
3
+ "max": 4,
4
+ "name": [
5
+ "0050.jpg"
6
+ ],
7
+ "x": 672,
8
+ "y": 1315
9
+ },
10
+ "7": {
11
+ "max": 7,
12
+ "name": [
13
+ "0154.jpg"
14
+ ],
15
+ "x": 338,
16
+ "y": 1337
17
+ },
18
+ "8": {
19
+ "max": 7,
20
+ "name": [
21
+ "0144.jpg"
22
+ ],
23
+ "x": 639,
24
+ "y": 943
25
+ },
26
+ "14": {
27
+ "max": 13,
28
+ "name": [
29
+ "1162.jpg"
30
+ ],
31
+ "x": 604,
32
+ "y": 702
33
+ },
34
+ "16": {
35
+ "max": 15,
36
+ "name": [
37
+ "0193.jpg"
38
+ ],
39
+ "x": 593,
40
+ "y": 286
41
+ },
42
+ "28": {
43
+ "max": 35,
44
+ "name": [
45
+ "1162.jpg"
46
+ ],
47
+ "x": 578,
48
+ "y": 706
49
+ },
50
+ "32": {
51
+ "max": 44,
52
+ "name": [
53
+ "0193.jpg"
54
+ ],
55
+ "x": 596,
56
+ "y": 263
57
+ },
58
+ "56": {
59
+ "max": 120,
60
+ "name": [
61
+ "1162.jpg"
62
+ ],
63
+ "x": 562,
64
+ "y": 671
65
+ },
66
+ "64": {
67
+ "max": 147,
68
+ "name": [
69
+ "1162.jpg"
70
+ ],
71
+ "x": 562,
72
+ "y": 663
73
+ }
74
+ }
counts/nwpu.json ADDED
@@ -0,0 +1,761 @@
1
+ {
2
+ "1": {
3
+ "0": 14667500579,
4
+ "1": 1291229
5
+ },
6
+ "4": {
7
+ "0": 14607991573,
8
+ "1": 20424516,
9
+ "2": 101146,
10
+ "3": 3581,
11
+ "4": 173,
12
+ "5": 15,
13
+ "6": 1
14
+ },
15
+ "7": {
16
+ "0": 14527228244,
17
+ "1": 59342625,
18
+ "2": 1508064,
19
+ "3": 181138,
20
+ "4": 35423,
21
+ "5": 8206,
22
+ "6": 1925,
23
+ "7": 424,
24
+ "8": 92,
25
+ "9": 19,
26
+ "10": 4
27
+ },
28
+ "8": {
29
+ "0": 14496291716,
30
+ "1": 75535689,
31
+ "2": 2593492,
32
+ "3": 373522,
33
+ "4": 85180,
34
+ "5": 23129,
35
+ "6": 7605,
36
+ "7": 2404,
37
+ "8": 694,
38
+ "9": 170,
39
+ "10": 45,
40
+ "11": 7
41
+ },
42
+ "14": {
43
+ "0": 14280365725,
44
+ "1": 189868793,
45
+ "2": 17508005,
46
+ "3": 4140432,
47
+ "4": 1496968,
48
+ "5": 646243,
49
+ "6": 308292,
50
+ "7": 154512,
51
+ "8": 80925,
52
+ "9": 45696,
53
+ "10": 26811,
54
+ "11": 16841,
55
+ "12": 10489,
56
+ "13": 6798,
57
+ "14": 4437,
58
+ "15": 3038,
59
+ "16": 2097,
60
+ "17": 1426,
61
+ "18": 850,
62
+ "19": 434,
63
+ "20": 198,
64
+ "21": 105,
65
+ "22": 36,
66
+ "23": 14
67
+ },
68
+ "16": {
69
+ "0": 14200293041,
70
+ "1": 230337258,
71
+ "2": 25716807,
72
+ "3": 6591144,
73
+ "4": 2496616,
74
+ "5": 1151263,
75
+ "6": 597759,
76
+ "7": 328222,
77
+ "8": 186538,
78
+ "9": 107834,
79
+ "10": 64201,
80
+ "11": 40386,
81
+ "12": 26336,
82
+ "13": 17791,
83
+ "14": 12514,
84
+ "15": 8477,
85
+ "16": 6021,
86
+ "17": 4371,
87
+ "18": 3322,
88
+ "19": 2369,
89
+ "20": 1800,
90
+ "21": 1260,
91
+ "22": 892,
92
+ "23": 581,
93
+ "24": 317,
94
+ "25": 166,
95
+ "26": 88,
96
+ "27": 32,
97
+ "28": 5,
98
+ "29": 2
99
+ },
100
+ "28": {
101
+ "0": 13684329722,
102
+ "1": 456956241,
103
+ "2": 91566961,
104
+ "3": 34512257,
105
+ "4": 16402331,
106
+ "5": 8518065,
107
+ "6": 4898436,
108
+ "7": 3032957,
109
+ "8": 2020921,
110
+ "9": 1422203,
111
+ "10": 1041284,
112
+ "11": 785822,
113
+ "12": 600472,
114
+ "13": 463060,
115
+ "14": 356398,
116
+ "15": 278057,
117
+ "16": 220282,
118
+ "17": 175747,
119
+ "18": 141679,
120
+ "19": 115020,
121
+ "20": 92598,
122
+ "21": 75190,
123
+ "22": 61616,
124
+ "23": 50395,
125
+ "24": 40763,
126
+ "25": 33009,
127
+ "26": 26142,
128
+ "27": 21024,
129
+ "28": 16921,
130
+ "29": 14076,
131
+ "30": 11489,
132
+ "31": 10146,
133
+ "32": 8692,
134
+ "33": 7935,
135
+ "34": 7289,
136
+ "35": 6638,
137
+ "36": 5728,
138
+ "37": 5150,
139
+ "38": 4441,
140
+ "39": 3978,
141
+ "40": 3510,
142
+ "41": 3071,
143
+ "42": 2914,
144
+ "43": 2538,
145
+ "44": 2234,
146
+ "45": 1886,
147
+ "46": 1685,
148
+ "47": 1411,
149
+ "48": 1205,
150
+ "49": 1020,
151
+ "50": 817,
152
+ "51": 754,
153
+ "52": 696,
154
+ "53": 585,
155
+ "54": 540,
156
+ "55": 512,
157
+ "56": 444,
158
+ "57": 426,
159
+ "58": 364,
160
+ "59": 257,
161
+ "60": 212,
162
+ "61": 197,
163
+ "62": 157,
164
+ "63": 133,
165
+ "64": 108,
166
+ "65": 83,
167
+ "66": 95,
168
+ "67": 69,
169
+ "68": 64,
170
+ "69": 35,
171
+ "70": 21,
172
+ "71": 12,
173
+ "72": 9,
174
+ "73": 8,
175
+ "74": 3,
176
+ "75": 3
177
+ },
178
+ "32": {
179
+ "0": 13507181488,
180
+ "1": 523684788,
181
+ "2": 115677502,
182
+ "3": 46067053,
183
+ "4": 23384978,
184
+ "5": 13033305,
185
+ "6": 7798986,
186
+ "7": 4827879,
187
+ "8": 3222733,
188
+ "9": 2262098,
189
+ "10": 1651589,
190
+ "11": 1247118,
191
+ "12": 967386,
192
+ "13": 771426,
193
+ "14": 621546,
194
+ "15": 504368,
195
+ "16": 409418,
196
+ "17": 332421,
197
+ "18": 271277,
198
+ "19": 222138,
199
+ "20": 183772,
200
+ "21": 152433,
201
+ "22": 128423,
202
+ "23": 108428,
203
+ "24": 93487,
204
+ "25": 79093,
205
+ "26": 67728,
206
+ "27": 56196,
207
+ "28": 47634,
208
+ "29": 40579,
209
+ "30": 34355,
210
+ "31": 28984,
211
+ "32": 24565,
212
+ "33": 20972,
213
+ "34": 17931,
214
+ "35": 14995,
215
+ "36": 12377,
216
+ "37": 10307,
217
+ "38": 8797,
218
+ "39": 7610,
219
+ "40": 6846,
220
+ "41": 6271,
221
+ "42": 5855,
222
+ "43": 5378,
223
+ "44": 5294,
224
+ "45": 4945,
225
+ "46": 4528,
226
+ "47": 4172,
227
+ "48": 3883,
228
+ "49": 3522,
229
+ "50": 3246,
230
+ "51": 2948,
231
+ "52": 2646,
232
+ "53": 2401,
233
+ "54": 2102,
234
+ "55": 1889,
235
+ "56": 1689,
236
+ "57": 1444,
237
+ "58": 1354,
238
+ "59": 1166,
239
+ "60": 966,
240
+ "61": 796,
241
+ "62": 695,
242
+ "63": 629,
243
+ "64": 585,
244
+ "65": 531,
245
+ "66": 518,
246
+ "67": 482,
247
+ "68": 442,
248
+ "69": 385,
249
+ "70": 358,
250
+ "71": 335,
251
+ "72": 307,
252
+ "73": 267,
253
+ "74": 271,
254
+ "75": 220,
255
+ "76": 210,
256
+ "77": 180,
257
+ "78": 147,
258
+ "79": 124,
259
+ "80": 116,
260
+ "81": 112,
261
+ "82": 93,
262
+ "83": 69,
263
+ "84": 56,
264
+ "85": 52,
265
+ "86": 23,
266
+ "87": 17,
267
+ "88": 14,
268
+ "89": 11,
269
+ "90": 14,
270
+ "91": 6,
271
+ "92": 6,
272
+ "93": 3,
273
+ "94": 6,
274
+ "95": 1
275
+ },
276
+ "56": {
277
+ "0": 12465097246,
278
+ "1": 835084317,
279
+ "2": 254687121,
280
+ "3": 121720894,
281
+ "4": 71341732,
282
+ "5": 45465642,
283
+ "6": 31016406,
284
+ "7": 22117585,
285
+ "8": 16576017,
286
+ "9": 12843282,
287
+ "10": 10188871,
288
+ "11": 8166753,
289
+ "12": 6639505,
290
+ "13": 5403165,
291
+ "14": 4423601,
292
+ "15": 3641816,
293
+ "16": 2982294,
294
+ "17": 2495500,
295
+ "18": 2107822,
296
+ "19": 1777118,
297
+ "20": 1527177,
298
+ "21": 1320511,
299
+ "22": 1154409,
300
+ "23": 1016008,
301
+ "24": 902921,
302
+ "25": 805297,
303
+ "26": 717731,
304
+ "27": 639994,
305
+ "28": 578216,
306
+ "29": 522654,
307
+ "30": 471731,
308
+ "31": 430710,
309
+ "32": 391310,
310
+ "33": 360727,
311
+ "34": 333244,
312
+ "35": 306947,
313
+ "36": 285386,
314
+ "37": 266777,
315
+ "38": 248721,
316
+ "39": 231377,
317
+ "40": 213535,
318
+ "41": 197555,
319
+ "42": 182232,
320
+ "43": 168988,
321
+ "44": 156079,
322
+ "45": 144746,
323
+ "46": 135302,
324
+ "47": 124226,
325
+ "48": 114096,
326
+ "49": 104673,
327
+ "50": 95005,
328
+ "51": 87224,
329
+ "52": 81168,
330
+ "53": 76076,
331
+ "54": 71286,
332
+ "55": 67529,
333
+ "56": 64050,
334
+ "57": 62041,
335
+ "58": 58650,
336
+ "59": 55931,
337
+ "60": 51249,
338
+ "61": 47542,
339
+ "62": 44191,
340
+ "63": 41598,
341
+ "64": 38416,
342
+ "65": 36328,
343
+ "66": 33839,
344
+ "67": 32088,
345
+ "68": 30559,
346
+ "69": 27881,
347
+ "70": 26103,
348
+ "71": 24152,
349
+ "72": 22520,
350
+ "73": 20886,
351
+ "74": 19169,
352
+ "75": 17738,
353
+ "76": 16636,
354
+ "77": 15532,
355
+ "78": 14619,
356
+ "79": 14389,
357
+ "80": 13560,
358
+ "81": 13208,
359
+ "82": 12245,
360
+ "83": 11275,
361
+ "84": 10523,
362
+ "85": 10108,
363
+ "86": 9176,
364
+ "87": 8790,
365
+ "88": 8448,
366
+ "89": 8110,
367
+ "90": 7575,
368
+ "91": 7354,
369
+ "92": 6483,
370
+ "93": 6061,
371
+ "94": 5352,
372
+ "95": 5181,
373
+ "96": 4845,
374
+ "97": 4594,
375
+ "98": 4342,
376
+ "99": 4193,
377
+ "100": 3899,
378
+ "101": 3674,
379
+ "102": 3565,
380
+ "103": 3285,
381
+ "104": 3059,
382
+ "105": 2778,
383
+ "106": 2658,
384
+ "107": 2485,
385
+ "108": 2345,
386
+ "109": 2303,
387
+ "110": 2210,
388
+ "111": 2095,
389
+ "112": 1975,
390
+ "113": 1975,
391
+ "114": 2058,
392
+ "115": 1969,
393
+ "116": 1914,
394
+ "117": 1934,
395
+ "118": 1928,
396
+ "119": 1914,
397
+ "120": 1954,
398
+ "121": 1943,
399
+ "122": 1997,
400
+ "123": 2085,
401
+ "124": 1841,
402
+ "125": 1728,
403
+ "126": 1603,
404
+ "127": 1530,
405
+ "128": 1426,
406
+ "129": 1355,
407
+ "130": 1309,
408
+ "131": 1340,
409
+ "132": 1256,
410
+ "133": 1260,
411
+ "134": 1219,
412
+ "135": 1086,
413
+ "136": 1079,
414
+ "137": 1004,
415
+ "138": 987,
416
+ "139": 996,
417
+ "140": 886,
418
+ "141": 841,
419
+ "142": 786,
420
+ "143": 799,
421
+ "144": 882,
422
+ "145": 782,
423
+ "146": 718,
424
+ "147": 672,
425
+ "148": 629,
426
+ "149": 578,
427
+ "150": 592,
428
+ "151": 602,
429
+ "152": 564,
430
+ "153": 573,
431
+ "154": 551,
432
+ "155": 484,
433
+ "156": 474,
434
+ "157": 435,
435
+ "158": 410,
436
+ "159": 376,
437
+ "160": 348,
438
+ "161": 366,
439
+ "162": 299,
440
+ "163": 304,
441
+ "164": 280,
442
+ "165": 301,
443
+ "166": 298,
444
+ "167": 266,
445
+ "168": 259,
446
+ "169": 288,
447
+ "170": 259,
448
+ "171": 232,
449
+ "172": 249,
450
+ "173": 229,
451
+ "174": 197,
452
+ "175": 254,
453
+ "176": 204,
454
+ "177": 211,
455
+ "178": 208,
456
+ "179": 199,
457
+ "180": 183,
458
+ "181": 169,
459
+ "182": 169,
460
+ "183": 169,
461
+ "184": 120,
462
+ "185": 119,
463
+ "186": 151,
464
+ "187": 131,
465
+ "188": 126,
466
+ "189": 122,
467
+ "190": 107,
468
+ "191": 105,
469
+ "192": 103,
470
+ "193": 87,
471
+ "194": 71,
472
+ "195": 62,
473
+ "196": 59,
474
+ "197": 51,
475
+ "198": 40,
476
+ "199": 49,
477
+ "200": 44,
478
+ "201": 45,
479
+ "202": 43,
480
+ "203": 42,
481
+ "204": 36,
482
+ "205": 45,
483
+ "206": 36,
484
+ "207": 37,
485
+ "208": 38,
486
+ "209": 32,
487
+ "210": 27,
488
+ "211": 25,
489
+ "212": 21,
490
+ "213": 19,
491
+ "214": 30,
492
+ "215": 16,
493
+ "216": 20,
494
+ "217": 15,
495
+ "218": 14,
496
+ "219": 6,
497
+ "220": 8,
498
+ "221": 5,
499
+ "222": 3,
500
+ "223": 2
501
+ },
502
+ "64": {
503
+ "0": 12134170560,
504
+ "1": 910355445,
505
+ "2": 297133671,
506
+ "3": 145184087,
507
+ "4": 87626341,
508
+ "5": 57746135,
509
+ "6": 40495922,
510
+ "7": 29156512,
511
+ "8": 21919906,
512
+ "9": 16973043,
513
+ "10": 13535308,
514
+ "11": 11038546,
515
+ "12": 9149626,
516
+ "13": 7600687,
517
+ "14": 6410824,
518
+ "15": 5491781,
519
+ "16": 4677502,
520
+ "17": 3997198,
521
+ "18": 3443407,
522
+ "19": 2925959,
523
+ "20": 2507301,
524
+ "21": 2160448,
525
+ "22": 1878716,
526
+ "23": 1648075,
527
+ "24": 1450872,
528
+ "25": 1275043,
529
+ "26": 1133498,
530
+ "27": 1015835,
531
+ "28": 914243,
532
+ "29": 833304,
533
+ "30": 760872,
534
+ "31": 691863,
535
+ "32": 630584,
536
+ "33": 577966,
537
+ "34": 528643,
538
+ "35": 485362,
539
+ "36": 444354,
540
+ "37": 407675,
541
+ "38": 377100,
542
+ "39": 351641,
543
+ "40": 326893,
544
+ "41": 305689,
545
+ "42": 285689,
546
+ "43": 266757,
547
+ "44": 249514,
548
+ "45": 235532,
549
+ "46": 223892,
550
+ "47": 211932,
551
+ "48": 200323,
552
+ "49": 189578,
553
+ "50": 178068,
554
+ "51": 167402,
555
+ "52": 158785,
556
+ "53": 149971,
557
+ "54": 140597,
558
+ "55": 131198,
559
+ "56": 124442,
560
+ "57": 118109,
561
+ "58": 111071,
562
+ "59": 104882,
563
+ "60": 97607,
564
+ "61": 91490,
565
+ "62": 85286,
566
+ "63": 79531,
567
+ "64": 74921,
568
+ "65": 69722,
569
+ "66": 67061,
570
+ "67": 62855,
571
+ "68": 59431,
572
+ "69": 56425,
573
+ "70": 53389,
574
+ "71": 52205,
575
+ "72": 49130,
576
+ "73": 47540,
577
+ "74": 46130,
578
+ "75": 44031,
579
+ "76": 41069,
580
+ "77": 38590,
581
+ "78": 36372,
582
+ "79": 34739,
583
+ "80": 32483,
584
+ "81": 30821,
585
+ "82": 29084,
586
+ "83": 27658,
587
+ "84": 26356,
588
+ "85": 25296,
589
+ "86": 24161,
590
+ "87": 22766,
591
+ "88": 21596,
592
+ "89": 20576,
593
+ "90": 19734,
594
+ "91": 18715,
595
+ "92": 17676,
596
+ "93": 16389,
597
+ "94": 15235,
598
+ "95": 14115,
599
+ "96": 13051,
600
+ "97": 12336,
601
+ "98": 11769,
602
+ "99": 10974,
603
+ "100": 10731,
604
+ "101": 9897,
605
+ "102": 9661,
606
+ "103": 9456,
607
+ "104": 9255,
608
+ "105": 9143,
609
+ "106": 8863,
610
+ "107": 8535,
611
+ "108": 8059,
612
+ "109": 7377,
613
+ "110": 7024,
614
+ "111": 6470,
615
+ "112": 6426,
616
+ "113": 6009,
617
+ "114": 5748,
618
+ "115": 5535,
619
+ "116": 5244,
620
+ "117": 4876,
621
+ "118": 4586,
622
+ "119": 4234,
623
+ "120": 4118,
624
+ "121": 3789,
625
+ "122": 3695,
626
+ "123": 3622,
627
+ "124": 3493,
628
+ "125": 3318,
629
+ "126": 3359,
630
+ "127": 3420,
631
+ "128": 3353,
632
+ "129": 3224,
633
+ "130": 3222,
634
+ "131": 3038,
635
+ "132": 2831,
636
+ "133": 2743,
637
+ "134": 2751,
638
+ "135": 2703,
639
+ "136": 2517,
640
+ "137": 2404,
641
+ "138": 2360,
642
+ "139": 2069,
643
+ "140": 2037,
644
+ "141": 1829,
645
+ "142": 1693,
646
+ "143": 1599,
647
+ "144": 1588,
648
+ "145": 1482,
649
+ "146": 1408,
650
+ "147": 1386,
651
+ "148": 1339,
652
+ "149": 1401,
653
+ "150": 1313,
654
+ "151": 1276,
655
+ "152": 1276,
656
+ "153": 1179,
657
+ "154": 1242,
658
+ "155": 1267,
659
+ "156": 1184,
660
+ "157": 1245,
661
+ "158": 1187,
662
+ "159": 1113,
663
+ "160": 1095,
664
+ "161": 1059,
665
+ "162": 938,
666
+ "163": 958,
667
+ "164": 906,
668
+ "165": 920,
669
+ "166": 941,
670
+ "167": 905,
671
+ "168": 885,
672
+ "169": 873,
673
+ "170": 794,
674
+ "171": 741,
675
+ "172": 773,
676
+ "173": 713,
677
+ "174": 694,
678
+ "175": 689,
679
+ "176": 741,
680
+ "177": 770,
681
+ "178": 735,
682
+ "179": 747,
683
+ "180": 704,
684
+ "181": 670,
685
+ "182": 652,
686
+ "183": 635,
687
+ "184": 633,
688
+ "185": 682,
689
+ "186": 598,
690
+ "187": 590,
691
+ "188": 541,
692
+ "189": 526,
693
+ "190": 495,
694
+ "191": 508,
695
+ "192": 492,
696
+ "193": 501,
697
+ "194": 443,
698
+ "195": 444,
699
+ "196": 399,
700
+ "197": 363,
701
+ "198": 357,
702
+ "199": 338,
703
+ "200": 292,
704
+ "201": 273,
705
+ "202": 288,
706
+ "203": 292,
707
+ "204": 280,
708
+ "205": 260,
709
+ "206": 278,
710
+ "207": 243,
711
+ "208": 212,
712
+ "209": 241,
713
+ "210": 217,
714
+ "211": 189,
715
+ "212": 195,
716
+ "213": 181,
717
+ "214": 179,
718
+ "215": 238,
719
+ "216": 196,
720
+ "217": 195,
721
+ "218": 181,
722
+ "219": 191,
723
+ "220": 158,
724
+ "221": 154,
725
+ "222": 178,
726
+ "223": 150,
727
+ "224": 149,
728
+ "225": 155,
729
+ "226": 184,
730
+ "227": 125,
731
+ "228": 154,
732
+ "229": 135,
733
+ "230": 153,
734
+ "231": 151,
735
+ "232": 153,
736
+ "233": 124,
737
+ "234": 110,
738
+ "235": 87,
739
+ "236": 95,
740
+ "237": 76,
741
+ "238": 75,
742
+ "239": 69,
743
+ "240": 67,
744
+ "241": 60,
745
+ "242": 36,
746
+ "243": 42,
747
+ "244": 55,
748
+ "245": 41,
749
+ "246": 58,
750
+ "247": 46,
751
+ "248": 37,
752
+ "249": 33,
753
+ "250": 29,
754
+ "251": 23,
755
+ "252": 13,
756
+ "253": 3,
757
+ "254": 11,
758
+ "255": 9,
759
+ "256": 2
760
+ }
761
+ }
counts/nwpu_max.json ADDED
@@ -0,0 +1,74 @@
1
+ {
2
+ "4": {
3
+ "max": 6,
4
+ "name": [
5
+ "0701.jpg"
6
+ ],
7
+ "x": 976,
8
+ "y": 1527
9
+ },
10
+ "7": {
11
+ "max": 10,
12
+ "name": [
13
+ "0181.jpg"
14
+ ],
15
+ "x": 639,
16
+ "y": 1531
17
+ },
18
+ "8": {
19
+ "max": 11,
20
+ "name": [
21
+ "1838.jpg"
22
+ ],
23
+ "x": 815,
24
+ "y": 1001
25
+ },
26
+ "14": {
27
+ "max": 23,
28
+ "name": [
29
+ "1838.jpg"
30
+ ],
31
+ "x": 995,
32
+ "y": 1544
33
+ },
34
+ "16": {
35
+ "max": 29,
36
+ "name": [
37
+ "1838.jpg"
38
+ ],
39
+ "x": 991,
40
+ "y": 1544
41
+ },
42
+ "28": {
43
+ "max": 75,
44
+ "name": [
45
+ "1838.jpg"
46
+ ],
47
+ "x": 1003,
48
+ "y": 1706
49
+ },
50
+ "32": {
51
+ "max": 95,
52
+ "name": [
53
+ "1838.jpg"
54
+ ],
55
+ "x": 1003,
56
+ "y": 1704
57
+ },
58
+ "56": {
59
+ "max": 223,
60
+ "name": [
61
+ "1838.jpg"
62
+ ],
63
+ "x": 993,
64
+ "y": 1702
65
+ },
66
+ "64": {
67
+ "max": 256,
68
+ "name": [
69
+ "1838.jpg"
70
+ ],
71
+ "x": 990,
72
+ "y": 1697
73
+ }
74
+ }
counts/qnrf.json ADDED
@@ -0,0 +1,569 @@
1
+ {
2
+ "1": {
3
+ "0": 2703096261,
4
+ "1": 1007163
5
+ },
6
+ "4": {
7
+ "0": 2677404968,
8
+ "1": 15969215,
9
+ "2": 59807,
10
+ "3": 1384,
11
+ "4": 97,
12
+ "5": 10
13
+ },
14
+ "7": {
15
+ "0": 2635421382,
16
+ "1": 45742892,
17
+ "2": 1492537,
18
+ "3": 114192,
19
+ "4": 14549,
20
+ "5": 2676,
21
+ "6": 675,
22
+ "7": 199,
23
+ "8": 47,
24
+ "9": 7
25
+ },
26
+ "8": {
27
+ "0": 2618651922,
28
+ "1": 57473873,
29
+ "2": 2778844,
30
+ "3": 286508,
31
+ "4": 41982,
32
+ "5": 8626,
33
+ "6": 2306,
34
+ "7": 782,
35
+ "8": 241,
36
+ "9": 77,
37
+ "10": 21,
38
+ "11": 3
39
+ },
40
+ "14": {
41
+ "0": 2502680139,
42
+ "1": 128220308,
43
+ "2": 19112473,
44
+ "3": 5245278,
45
+ "4": 1729894,
46
+ "5": 624274,
47
+ "6": 238250,
48
+ "7": 97230,
49
+ "8": 41347,
50
+ "9": 19325,
51
+ "10": 9833,
52
+ "11": 5696,
53
+ "12": 3361,
54
+ "13": 1972,
55
+ "14": 1035,
56
+ "15": 547,
57
+ "16": 340,
58
+ "17": 212,
59
+ "18": 112,
60
+ "19": 71,
61
+ "20": 42,
62
+ "21": 30,
63
+ "22": 14,
64
+ "23": 12,
65
+ "24": 3,
66
+ "25": 3
67
+ },
68
+ "16": {
69
+ "0": 2461366525,
70
+ "1": 149295686,
71
+ "2": 26297062,
72
+ "3": 8379921,
73
+ "4": 3199756,
74
+ "5": 1324146,
75
+ "6": 583234,
76
+ "7": 267593,
77
+ "8": 128122,
78
+ "9": 62843,
79
+ "10": 32265,
80
+ "11": 16540,
81
+ "12": 9297,
82
+ "13": 5835,
83
+ "14": 4037,
84
+ "15": 2616,
85
+ "16": 1660,
86
+ "17": 1066,
87
+ "18": 639,
88
+ "19": 349,
89
+ "20": 203,
90
+ "21": 183,
91
+ "22": 121,
92
+ "23": 80,
93
+ "24": 44,
94
+ "25": 30,
95
+ "26": 11,
96
+ "27": 9,
97
+ "28": 10,
98
+ "29": 6
99
+ },
100
+ "28": {
101
+ "0": 2217981619,
102
+ "1": 242958596,
103
+ "2": 68708089,
104
+ "3": 31034654,
105
+ "4": 17007626,
106
+ "5": 10317353,
107
+ "6": 6556090,
108
+ "7": 4298832,
109
+ "8": 2899688,
110
+ "9": 2014576,
111
+ "10": 1411981,
112
+ "11": 1007963,
113
+ "12": 718139,
114
+ "13": 516552,
115
+ "14": 375188,
116
+ "15": 273595,
117
+ "16": 199599,
118
+ "17": 144002,
119
+ "18": 106107,
120
+ "19": 79309,
121
+ "20": 60015,
122
+ "21": 45839,
123
+ "22": 35538,
124
+ "23": 27006,
125
+ "24": 21141,
126
+ "25": 16063,
127
+ "26": 11666,
128
+ "27": 8786,
129
+ "28": 6812,
130
+ "29": 5341,
131
+ "30": 4314,
132
+ "31": 3339,
133
+ "32": 2718,
134
+ "33": 2165,
135
+ "34": 1611,
136
+ "35": 1444,
137
+ "36": 1299,
138
+ "37": 1057,
139
+ "38": 930,
140
+ "39": 804,
141
+ "40": 590,
142
+ "41": 475,
143
+ "42": 361,
144
+ "43": 297,
145
+ "44": 242,
146
+ "45": 166,
147
+ "46": 125,
148
+ "47": 108,
149
+ "48": 81,
150
+ "49": 90,
151
+ "50": 74,
152
+ "51": 46,
153
+ "52": 38,
154
+ "53": 24,
155
+ "54": 13,
156
+ "55": 7,
157
+ "56": 2
158
+ },
159
+ "32": {
160
+ "0": 2142175706,
161
+ "1": 263796436,
162
+ "2": 81041731,
163
+ "3": 38689542,
164
+ "4": 21861748,
165
+ "5": 13731448,
166
+ "6": 9285756,
167
+ "7": 6423842,
168
+ "8": 4541778,
169
+ "9": 3251892,
170
+ "10": 2387659,
171
+ "11": 1795213,
172
+ "12": 1359736,
173
+ "13": 1036607,
174
+ "14": 790266,
175
+ "15": 606226,
176
+ "16": 469220,
177
+ "17": 361731,
178
+ "18": 281834,
179
+ "19": 218860,
180
+ "20": 168254,
181
+ "21": 130270,
182
+ "22": 100263,
183
+ "23": 78196,
184
+ "24": 61822,
185
+ "25": 49558,
186
+ "26": 39186,
187
+ "27": 32271,
188
+ "28": 26464,
189
+ "29": 21939,
190
+ "30": 17726,
191
+ "31": 14747,
192
+ "32": 11705,
193
+ "33": 9539,
194
+ "34": 7368,
195
+ "35": 5935,
196
+ "36": 4774,
197
+ "37": 3727,
198
+ "38": 3275,
199
+ "39": 2605,
200
+ "40": 2408,
201
+ "41": 1893,
202
+ "42": 1440,
203
+ "43": 1278,
204
+ "44": 1070,
205
+ "45": 915,
206
+ "46": 740,
207
+ "47": 619,
208
+ "48": 507,
209
+ "49": 484,
210
+ "50": 330,
211
+ "51": 374,
212
+ "52": 287,
213
+ "53": 244,
214
+ "54": 223,
215
+ "55": 186,
216
+ "56": 136,
217
+ "57": 120,
218
+ "58": 103,
219
+ "59": 100,
220
+ "60": 88,
221
+ "61": 35,
222
+ "62": 16,
223
+ "63": 23,
224
+ "64": 3,
225
+ "65": 4
226
+ },
227
+ "56": {
228
+ "0": 1753079415,
229
+ "1": 326506415,
230
+ "2": 132989783,
231
+ "3": 73569912,
232
+ "4": 47875219,
233
+ "5": 33905462,
234
+ "6": 25324896,
235
+ "7": 19351405,
236
+ "8": 14991521,
237
+ "9": 11859011,
238
+ "10": 9643110,
239
+ "11": 7988193,
240
+ "12": 6724565,
241
+ "13": 5737528,
242
+ "14": 4905936,
243
+ "15": 4234117,
244
+ "16": 3678710,
245
+ "17": 3233842,
246
+ "18": 2856180,
247
+ "19": 2528729,
248
+ "20": 2238483,
249
+ "21": 1974680,
250
+ "22": 1738379,
251
+ "23": 1522952,
252
+ "24": 1334635,
253
+ "25": 1171843,
254
+ "26": 1038446,
255
+ "27": 924884,
256
+ "28": 828510,
257
+ "29": 749323,
258
+ "30": 680155,
259
+ "31": 619173,
260
+ "32": 558209,
261
+ "33": 507896,
262
+ "34": 463642,
263
+ "35": 419398,
264
+ "36": 380125,
265
+ "37": 347601,
266
+ "38": 318828,
267
+ "39": 293043,
268
+ "40": 272483,
269
+ "41": 250724,
270
+ "42": 228696,
271
+ "43": 206140,
272
+ "44": 184636,
273
+ "45": 165534,
274
+ "46": 149696,
275
+ "47": 135099,
276
+ "48": 121824,
277
+ "49": 110240,
278
+ "50": 98425,
279
+ "51": 88515,
280
+ "52": 79279,
281
+ "53": 70978,
282
+ "54": 64994,
283
+ "55": 59099,
284
+ "56": 53268,
285
+ "57": 48134,
286
+ "58": 43611,
287
+ "59": 38300,
288
+ "60": 34909,
289
+ "61": 31681,
290
+ "62": 28393,
291
+ "63": 24688,
292
+ "64": 21934,
293
+ "65": 19803,
294
+ "66": 17598,
295
+ "67": 15593,
296
+ "68": 14189,
297
+ "69": 13168,
298
+ "70": 12483,
299
+ "71": 11762,
300
+ "72": 11066,
301
+ "73": 10447,
302
+ "74": 9606,
303
+ "75": 8747,
304
+ "76": 7574,
305
+ "77": 6921,
306
+ "78": 6340,
307
+ "79": 6088,
308
+ "80": 5448,
309
+ "81": 5380,
310
+ "82": 5144,
311
+ "83": 5114,
312
+ "84": 4775,
313
+ "85": 4632,
314
+ "86": 4332,
315
+ "87": 4082,
316
+ "88": 3949,
317
+ "89": 3821,
318
+ "90": 3476,
319
+ "91": 3406,
320
+ "92": 2973,
321
+ "93": 2766,
322
+ "94": 2489,
323
+ "95": 2253,
324
+ "96": 2087,
325
+ "97": 1763,
326
+ "98": 1560,
327
+ "99": 1322,
328
+ "100": 1243,
329
+ "101": 1150,
330
+ "102": 994,
331
+ "103": 794,
332
+ "104": 589,
333
+ "105": 538,
334
+ "106": 416,
335
+ "107": 359,
336
+ "108": 335,
337
+ "109": 309,
338
+ "110": 310,
339
+ "111": 280,
340
+ "112": 269,
341
+ "113": 279,
342
+ "114": 233,
343
+ "115": 198,
344
+ "116": 208,
345
+ "117": 211,
346
+ "118": 166,
347
+ "119": 128,
348
+ "120": 127,
349
+ "121": 119,
350
+ "122": 145,
351
+ "123": 159,
352
+ "124": 130,
353
+ "125": 115,
354
+ "126": 124,
355
+ "127": 132,
356
+ "128": 130,
357
+ "129": 114,
358
+ "130": 136,
359
+ "131": 113,
360
+ "132": 119,
361
+ "133": 92,
362
+ "134": 109,
363
+ "135": 94,
364
+ "136": 112,
365
+ "137": 108,
366
+ "138": 107,
367
+ "139": 114,
368
+ "140": 102,
369
+ "141": 63,
370
+ "142": 43,
371
+ "143": 46,
372
+ "144": 34,
373
+ "145": 17,
374
+ "146": 17,
375
+ "147": 4,
376
+ "148": 4
377
+ },
378
+ "64": {
379
+ "0": 1645580394,
380
+ "1": 332121950,
381
+ "2": 143857376,
382
+ "3": 82342244,
383
+ "4": 54254902,
384
+ "5": 38847202,
385
+ "6": 29417465,
386
+ "7": 23205846,
387
+ "8": 18694855,
388
+ "9": 15141642,
389
+ "10": 12371576,
390
+ "11": 10229329,
391
+ "12": 8647553,
392
+ "13": 7325344,
393
+ "14": 6295327,
394
+ "15": 5516930,
395
+ "16": 4865082,
396
+ "17": 4309391,
397
+ "18": 3842162,
398
+ "19": 3406684,
399
+ "20": 3033028,
400
+ "21": 2735522,
401
+ "22": 2473336,
402
+ "23": 2242708,
403
+ "24": 2042061,
404
+ "25": 1862630,
405
+ "26": 1687997,
406
+ "27": 1529651,
407
+ "28": 1377678,
408
+ "29": 1246699,
409
+ "30": 1127615,
410
+ "31": 1021519,
411
+ "32": 919786,
412
+ "33": 835229,
413
+ "34": 758589,
414
+ "35": 694245,
415
+ "36": 637642,
416
+ "37": 589662,
417
+ "38": 547952,
418
+ "39": 507110,
419
+ "40": 467377,
420
+ "41": 431426,
421
+ "42": 399251,
422
+ "43": 369645,
423
+ "44": 345626,
424
+ "45": 320928,
425
+ "46": 300584,
426
+ "47": 279405,
427
+ "48": 261128,
428
+ "49": 245246,
429
+ "50": 230330,
430
+ "51": 216329,
431
+ "52": 202315,
432
+ "53": 188342,
433
+ "54": 175479,
434
+ "55": 164216,
435
+ "56": 151015,
436
+ "57": 138762,
437
+ "58": 128074,
438
+ "59": 118213,
439
+ "60": 109407,
440
+ "61": 100053,
441
+ "62": 91903,
442
+ "63": 83292,
443
+ "64": 75832,
444
+ "65": 68006,
445
+ "66": 61400,
446
+ "67": 55742,
447
+ "68": 51271,
448
+ "69": 47305,
449
+ "70": 43974,
450
+ "71": 39955,
451
+ "72": 36911,
452
+ "73": 34035,
453
+ "74": 30928,
454
+ "75": 28558,
455
+ "76": 26104,
456
+ "77": 24211,
457
+ "78": 22590,
458
+ "79": 20897,
459
+ "80": 19153,
460
+ "81": 17657,
461
+ "82": 16849,
462
+ "83": 15301,
463
+ "84": 14235,
464
+ "85": 13049,
465
+ "86": 11929,
466
+ "87": 10779,
467
+ "88": 9912,
468
+ "89": 9146,
469
+ "90": 8247,
470
+ "91": 7534,
471
+ "92": 7104,
472
+ "93": 6609,
473
+ "94": 6159,
474
+ "95": 5758,
475
+ "96": 5510,
476
+ "97": 5528,
477
+ "98": 5293,
478
+ "99": 4973,
479
+ "100": 4606,
480
+ "101": 4275,
481
+ "102": 4271,
482
+ "103": 4037,
483
+ "104": 3971,
484
+ "105": 3787,
485
+ "106": 3970,
486
+ "107": 3630,
487
+ "108": 3605,
488
+ "109": 3351,
489
+ "110": 3229,
490
+ "111": 2970,
491
+ "112": 2963,
492
+ "113": 3005,
493
+ "114": 2790,
494
+ "115": 2728,
495
+ "116": 2547,
496
+ "117": 2315,
497
+ "118": 2133,
498
+ "119": 1910,
499
+ "120": 1701,
500
+ "121": 1579,
501
+ "122": 1382,
502
+ "123": 1253,
503
+ "124": 1198,
504
+ "125": 1048,
505
+ "126": 901,
506
+ "127": 847,
507
+ "128": 761,
508
+ "129": 656,
509
+ "130": 559,
510
+ "131": 543,
511
+ "132": 509,
512
+ "133": 497,
513
+ "134": 357,
514
+ "135": 353,
515
+ "136": 321,
516
+ "137": 252,
517
+ "138": 262,
518
+ "139": 215,
519
+ "140": 175,
520
+ "141": 188,
521
+ "142": 141,
522
+ "143": 138,
523
+ "144": 124,
524
+ "145": 141,
525
+ "146": 146,
526
+ "147": 147,
527
+ "148": 149,
528
+ "149": 156,
529
+ "150": 150,
530
+ "151": 122,
531
+ "152": 122,
532
+ "153": 118,
533
+ "154": 115,
534
+ "155": 142,
535
+ "156": 127,
536
+ "157": 105,
537
+ "158": 108,
538
+ "159": 96,
539
+ "160": 111,
540
+ "161": 106,
541
+ "162": 106,
542
+ "163": 99,
543
+ "164": 112,
544
+ "165": 108,
545
+ "166": 97,
546
+ "167": 106,
547
+ "168": 102,
548
+ "169": 109,
549
+ "170": 81,
550
+ "171": 118,
551
+ "172": 79,
552
+ "173": 67,
553
+ "174": 86,
554
+ "175": 34,
555
+ "176": 44,
556
+ "177": 22,
557
+ "178": 20,
558
+ "179": 20,
559
+ "180": 22,
560
+ "181": 6,
561
+ "182": 18,
562
+ "183": 12,
563
+ "184": 4,
564
+ "185": 6,
565
+ "186": 2,
566
+ "187": 2,
567
+ "188": 2
568
+ }
569
+ }
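
The `counts/*.json` files above appear to store, for each block size, a histogram mapping a per-block count to the number of blocks with that count. A minimal sketch of how such a file could be consumed, assuming exactly this layout (the file path and the chosen block size are illustrative):

```python
import json

# Assumed layout (from the data above): {block_size: {per-block count: number of blocks}}.
with open("counts/qnrf.json") as f:
    counts = json.load(f)

block_size = "32"                                  # any of the stored block sizes
hist = {int(c): n for c, n in counts[block_size].items()}
total_blocks = sum(hist.values())

# Empirical probability of each count value; note the heavy mass at zero (zero inflation).
probs = {c: n / total_blocks for c, n in sorted(hist.items())}
print(f"{total_blocks} blocks of size {block_size}, P(count=0) = {probs[0]:.4f}")
```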
counts/qnrf_max.json ADDED
@@ -0,0 +1,74 @@
+ {
+     "4": {
+         "max": 5,
+         "name": [
+             "0862.jpg"
+         ],
+         "x": 525,
+         "y": 892
+     },
+     "7": {
+         "max": 9,
+         "name": [
+             "0215.jpg"
+         ],
+         "x": 339,
+         "y": 701
+     },
+     "8": {
+         "max": 11,
+         "name": [
+             "0215.jpg"
+         ],
+         "x": 339,
+         "y": 701
+     },
+     "14": {
+         "max": 25,
+         "name": [
+             "0215.jpg"
+         ],
+         "x": 332,
+         "y": 697
+     },
+     "16": {
+         "max": 29,
+         "name": [
+             "0215.jpg"
+         ],
+         "x": 331,
+         "y": 697
+     },
+     "28": {
+         "max": 56,
+         "name": [
+             "0330.jpg"
+         ],
+         "x": 336,
+         "y": 1063
+     },
+     "32": {
+         "max": 65,
+         "name": [
+             "0931.jpg"
+         ],
+         "x": 730,
+         "y": 1077
+     },
+     "56": {
+         "max": 148,
+         "name": [
+             "0931.jpg"
+         ],
+         "x": 725,
+         "y": 1084
+     },
+     "64": {
+         "max": 188,
+         "name": [
+             "0931.jpg"
+         ],
+         "x": 702,
+         "y": 1078
+     }
+ }
counts/sha.json ADDED
@@ -0,0 +1,578 @@
1
+ {
2
+ "1": {
3
+ "0": 221398495,
4
+ "1": 162337
5
+ },
6
+ "4": {
7
+ "0": 217404440,
8
+ "1": 2560651,
9
+ "2": 15919,
10
+ "3": 525,
11
+ "4": 13
12
+ },
13
+ "7": {
14
+ "0": 210823747,
15
+ "1": 7296555,
16
+ "2": 246197,
17
+ "3": 33928,
18
+ "4": 5933,
19
+ "5": 1100,
20
+ "6": 185,
21
+ "7": 17,
22
+ "8": 2
23
+ },
24
+ "8": {
25
+ "0": 208176274,
26
+ "1": 9201723,
27
+ "2": 414603,
28
+ "3": 70943,
29
+ "4": 15652,
30
+ "5": 3870,
31
+ "6": 919,
32
+ "7": 204,
33
+ "8": 33,
34
+ "9": 13,
35
+ "10": 2
36
+ },
37
+ "14": {
38
+ "0": 189475328,
39
+ "1": 21483041,
40
+ "2": 2585228,
41
+ "3": 687491,
42
+ "4": 268045,
43
+ "5": 123917,
44
+ "6": 62564,
45
+ "7": 32805,
46
+ "8": 17268,
47
+ "9": 9346,
48
+ "10": 5150,
49
+ "11": 2866,
50
+ "12": 1541,
51
+ "13": 822,
52
+ "14": 463,
53
+ "15": 198,
54
+ "16": 99,
55
+ "17": 48,
56
+ "18": 30,
57
+ "19": 16,
58
+ "20": 2
59
+ },
60
+ "16": {
61
+ "0": 182715184,
62
+ "1": 25283086,
63
+ "2": 3723263,
64
+ "3": 1075878,
65
+ "4": 428688,
66
+ "5": 212379,
67
+ "6": 116019,
68
+ "7": 66336,
69
+ "8": 38808,
70
+ "9": 22851,
71
+ "10": 13845,
72
+ "11": 8410,
73
+ "12": 5248,
74
+ "13": 3355,
75
+ "14": 1997,
76
+ "15": 1309,
77
+ "16": 818,
78
+ "17": 451,
79
+ "18": 238,
80
+ "19": 113,
81
+ "20": 65,
82
+ "21": 31,
83
+ "22": 16,
84
+ "23": 8,
85
+ "24": 8,
86
+ "25": 4,
87
+ "26": 2,
88
+ "27": 1,
89
+ "28": 1
90
+ },
91
+ "28": {
92
+ "0": 143735006,
93
+ "1": 40200526,
94
+ "2": 11837381,
95
+ "3": 4979488,
96
+ "4": 2499799,
97
+ "5": 1387540,
98
+ "6": 843561,
99
+ "7": 542333,
100
+ "8": 361946,
101
+ "9": 254697,
102
+ "10": 190403,
103
+ "11": 145704,
104
+ "12": 112559,
105
+ "13": 88972,
106
+ "14": 69603,
107
+ "15": 55162,
108
+ "16": 44351,
109
+ "17": 36430,
110
+ "18": 29187,
111
+ "19": 23408,
112
+ "20": 18831,
113
+ "21": 14678,
114
+ "22": 11890,
115
+ "23": 9916,
116
+ "24": 8375,
117
+ "25": 6759,
118
+ "26": 5676,
119
+ "27": 4713,
120
+ "28": 3932,
121
+ "29": 3328,
122
+ "30": 2705,
123
+ "31": 2351,
124
+ "32": 1976,
125
+ "33": 1691,
126
+ "34": 1450,
127
+ "35": 1133,
128
+ "36": 949,
129
+ "37": 790,
130
+ "38": 657,
131
+ "39": 489,
132
+ "40": 324,
133
+ "41": 244,
134
+ "42": 170,
135
+ "43": 144,
136
+ "44": 122,
137
+ "45": 85,
138
+ "46": 63,
139
+ "47": 48,
140
+ "48": 44,
141
+ "49": 38,
142
+ "50": 16,
143
+ "51": 13,
144
+ "52": 14,
145
+ "53": 4,
146
+ "54": 1,
147
+ "55": 1
148
+ },
149
+ "32": {
150
+ "0": 132333123,
151
+ "1": 42343779,
152
+ "2": 14212292,
153
+ "3": 6506905,
154
+ "4": 3472682,
155
+ "5": 2008349,
156
+ "6": 1259746,
157
+ "7": 835061,
158
+ "8": 580977,
159
+ "9": 411224,
160
+ "10": 298598,
161
+ "11": 226760,
162
+ "12": 177966,
163
+ "13": 142932,
164
+ "14": 116283,
165
+ "15": 95533,
166
+ "16": 78053,
167
+ "17": 64149,
168
+ "18": 52398,
169
+ "19": 43187,
170
+ "20": 36642,
171
+ "21": 31022,
172
+ "22": 26409,
173
+ "23": 22474,
174
+ "24": 19080,
175
+ "25": 15785,
176
+ "26": 12983,
177
+ "27": 10905,
178
+ "28": 9540,
179
+ "29": 8242,
180
+ "30": 7113,
181
+ "31": 5838,
182
+ "32": 4817,
183
+ "33": 4147,
184
+ "34": 3635,
185
+ "35": 3160,
186
+ "36": 2800,
187
+ "37": 2258,
188
+ "38": 2086,
189
+ "39": 1884,
190
+ "40": 1789,
191
+ "41": 1749,
192
+ "42": 1451,
193
+ "43": 1284,
194
+ "44": 1097,
195
+ "45": 849,
196
+ "46": 631,
197
+ "47": 498,
198
+ "48": 324,
199
+ "49": 294,
200
+ "50": 208,
201
+ "51": 157,
202
+ "52": 136,
203
+ "53": 119,
204
+ "54": 80,
205
+ "55": 73,
206
+ "56": 68,
207
+ "57": 58,
208
+ "58": 61,
209
+ "59": 69,
210
+ "60": 49,
211
+ "61": 40,
212
+ "62": 24,
213
+ "63": 15,
214
+ "64": 17,
215
+ "65": 5,
216
+ "66": 2
217
+ },
218
+ "56": {
219
+ "0": 82311154,
220
+ "1": 39190874,
221
+ "2": 22002574,
222
+ "3": 12962751,
223
+ "4": 8404996,
224
+ "5": 5832072,
225
+ "6": 4289203,
226
+ "7": 3257294,
227
+ "8": 2542514,
228
+ "9": 1997552,
229
+ "10": 1600093,
230
+ "11": 1286154,
231
+ "12": 1053852,
232
+ "13": 868200,
233
+ "14": 718305,
234
+ "15": 598864,
235
+ "16": 498449,
236
+ "17": 418687,
237
+ "18": 358697,
238
+ "19": 312381,
239
+ "20": 276011,
240
+ "21": 241729,
241
+ "22": 215353,
242
+ "23": 195921,
243
+ "24": 175559,
244
+ "25": 159251,
245
+ "26": 141084,
246
+ "27": 128022,
247
+ "28": 114886,
248
+ "29": 104495,
249
+ "30": 95802,
250
+ "31": 87751,
251
+ "32": 79668,
252
+ "33": 72856,
253
+ "34": 67187,
254
+ "35": 60598,
255
+ "36": 56041,
256
+ "37": 49833,
257
+ "38": 45739,
258
+ "39": 42100,
259
+ "40": 38922,
260
+ "41": 35683,
261
+ "42": 33222,
262
+ "43": 31037,
263
+ "44": 27306,
264
+ "45": 24412,
265
+ "46": 21939,
266
+ "47": 20087,
267
+ "48": 18312,
268
+ "49": 17285,
269
+ "50": 16026,
270
+ "51": 14905,
271
+ "52": 14599,
272
+ "53": 13990,
273
+ "54": 13420,
274
+ "55": 12785,
275
+ "56": 11938,
276
+ "57": 11445,
277
+ "58": 11094,
278
+ "59": 10387,
279
+ "60": 9826,
280
+ "61": 9605,
281
+ "62": 9270,
282
+ "63": 8533,
283
+ "64": 8157,
284
+ "65": 7849,
285
+ "66": 7121,
286
+ "67": 6586,
287
+ "68": 6083,
288
+ "69": 5424,
289
+ "70": 4978,
290
+ "71": 4867,
291
+ "72": 4364,
292
+ "73": 3995,
293
+ "74": 3771,
294
+ "75": 3567,
295
+ "76": 3107,
296
+ "77": 2871,
297
+ "78": 2630,
298
+ "79": 2162,
299
+ "80": 2096,
300
+ "81": 1907,
301
+ "82": 1872,
302
+ "83": 1792,
303
+ "84": 1838,
304
+ "85": 1703,
305
+ "86": 1629,
306
+ "87": 1545,
307
+ "88": 1388,
308
+ "89": 1298,
309
+ "90": 1310,
310
+ "91": 1258,
311
+ "92": 1175,
312
+ "93": 1174,
313
+ "94": 1013,
314
+ "95": 976,
315
+ "96": 856,
316
+ "97": 784,
317
+ "98": 711,
318
+ "99": 692,
319
+ "100": 697,
320
+ "101": 622,
321
+ "102": 639,
322
+ "103": 544,
323
+ "104": 531,
324
+ "105": 476,
325
+ "106": 481,
326
+ "107": 450,
327
+ "108": 443,
328
+ "109": 439,
329
+ "110": 443,
330
+ "111": 358,
331
+ "112": 337,
332
+ "113": 293,
333
+ "114": 264,
334
+ "115": 223,
335
+ "116": 177,
336
+ "117": 140,
337
+ "118": 143,
338
+ "119": 124,
339
+ "120": 118,
340
+ "121": 104,
341
+ "122": 100,
342
+ "123": 96,
343
+ "124": 94,
344
+ "125": 92,
345
+ "126": 69,
346
+ "127": 89,
347
+ "128": 91,
348
+ "129": 85,
349
+ "130": 70,
350
+ "131": 66,
351
+ "132": 51,
352
+ "133": 54,
353
+ "134": 77,
354
+ "135": 60,
355
+ "136": 69,
356
+ "137": 62,
357
+ "138": 75,
358
+ "139": 83,
359
+ "140": 84,
360
+ "141": 77,
361
+ "142": 63,
362
+ "143": 51,
363
+ "144": 51,
364
+ "145": 68,
365
+ "146": 44,
366
+ "147": 45,
367
+ "148": 35,
368
+ "149": 38,
369
+ "150": 39,
370
+ "151": 39,
371
+ "152": 22,
372
+ "153": 12,
373
+ "154": 19,
374
+ "155": 24,
375
+ "156": 15,
376
+ "157": 4,
377
+ "158": 3,
378
+ "159": 1
379
+ },
380
+ "64": {
381
+ "0": 71204848,
382
+ "1": 35431716,
383
+ "2": 22345768,
384
+ "3": 14110543,
385
+ "4": 9458039,
386
+ "5": 6781297,
387
+ "6": 5068480,
388
+ "7": 3922313,
389
+ "8": 3115679,
390
+ "9": 2546969,
391
+ "10": 2092914,
392
+ "11": 1728554,
393
+ "12": 1445669,
394
+ "13": 1226006,
395
+ "14": 1027888,
396
+ "15": 880413,
397
+ "16": 758676,
398
+ "17": 651263,
399
+ "18": 560175,
400
+ "19": 481484,
401
+ "20": 415366,
402
+ "21": 360995,
403
+ "22": 319926,
404
+ "23": 281587,
405
+ "24": 249589,
406
+ "25": 222763,
407
+ "26": 201505,
408
+ "27": 186993,
409
+ "28": 172894,
410
+ "29": 160066,
411
+ "30": 148490,
412
+ "31": 135929,
413
+ "32": 125730,
414
+ "33": 116554,
415
+ "34": 109632,
416
+ "35": 101625,
417
+ "36": 93920,
418
+ "37": 86856,
419
+ "38": 80031,
420
+ "39": 73701,
421
+ "40": 68720,
422
+ "41": 62813,
423
+ "42": 58001,
424
+ "43": 53537,
425
+ "44": 49124,
426
+ "45": 45340,
427
+ "46": 42598,
428
+ "47": 39746,
429
+ "48": 37319,
430
+ "49": 35173,
431
+ "50": 32861,
432
+ "51": 29710,
433
+ "52": 27037,
434
+ "53": 24220,
435
+ "54": 22338,
436
+ "55": 20642,
437
+ "56": 19097,
438
+ "57": 17737,
439
+ "58": 16334,
440
+ "59": 16276,
441
+ "60": 15705,
442
+ "61": 14837,
443
+ "62": 13992,
444
+ "63": 13180,
445
+ "64": 12950,
446
+ "65": 12540,
447
+ "66": 12527,
448
+ "67": 12219,
449
+ "68": 11564,
450
+ "69": 10978,
451
+ "70": 10465,
452
+ "71": 9857,
453
+ "72": 9330,
454
+ "73": 9088,
455
+ "74": 8851,
456
+ "75": 8715,
457
+ "76": 8399,
458
+ "77": 7778,
459
+ "78": 7275,
460
+ "79": 6728,
461
+ "80": 6557,
462
+ "81": 6062,
463
+ "82": 5907,
464
+ "83": 5520,
465
+ "84": 5272,
466
+ "85": 4972,
467
+ "86": 4439,
468
+ "87": 3988,
469
+ "88": 3607,
470
+ "89": 3342,
471
+ "90": 3260,
472
+ "91": 3148,
473
+ "92": 2978,
474
+ "93": 3015,
475
+ "94": 2783,
476
+ "95": 2642,
477
+ "96": 2436,
478
+ "97": 2283,
479
+ "98": 2134,
480
+ "99": 2055,
481
+ "100": 1914,
482
+ "101": 1877,
483
+ "102": 1641,
484
+ "103": 1643,
485
+ "104": 1537,
486
+ "105": 1521,
487
+ "106": 1459,
488
+ "107": 1329,
489
+ "108": 1227,
490
+ "109": 1124,
491
+ "110": 1085,
492
+ "111": 1003,
493
+ "112": 967,
494
+ "113": 837,
495
+ "114": 748,
496
+ "115": 695,
497
+ "116": 680,
498
+ "117": 662,
499
+ "118": 590,
500
+ "119": 584,
501
+ "120": 596,
502
+ "121": 630,
503
+ "122": 608,
504
+ "123": 567,
505
+ "124": 549,
506
+ "125": 535,
507
+ "126": 485,
508
+ "127": 432,
509
+ "128": 387,
510
+ "129": 379,
511
+ "130": 390,
512
+ "131": 364,
513
+ "132": 288,
514
+ "133": 321,
515
+ "134": 302,
516
+ "135": 280,
517
+ "136": 268,
518
+ "137": 287,
519
+ "138": 270,
520
+ "139": 262,
521
+ "140": 222,
522
+ "141": 196,
523
+ "142": 170,
524
+ "143": 136,
525
+ "144": 155,
526
+ "145": 122,
527
+ "146": 115,
528
+ "147": 114,
529
+ "148": 96,
530
+ "149": 98,
531
+ "150": 83,
532
+ "151": 94,
533
+ "152": 94,
534
+ "153": 84,
535
+ "154": 77,
536
+ "155": 88,
537
+ "156": 70,
538
+ "157": 66,
539
+ "158": 60,
540
+ "159": 78,
541
+ "160": 59,
542
+ "161": 57,
543
+ "162": 63,
544
+ "163": 74,
545
+ "164": 63,
546
+ "165": 52,
547
+ "166": 65,
548
+ "167": 50,
549
+ "168": 76,
550
+ "169": 63,
551
+ "170": 63,
552
+ "171": 67,
553
+ "172": 62,
554
+ "173": 47,
555
+ "174": 51,
556
+ "175": 38,
557
+ "176": 42,
558
+ "177": 44,
559
+ "178": 44,
560
+ "179": 39,
561
+ "180": 45,
562
+ "181": 42,
563
+ "182": 31,
564
+ "183": 27,
565
+ "184": 39,
566
+ "185": 21,
567
+ "186": 28,
568
+ "187": 23,
569
+ "188": 36,
570
+ "189": 24,
571
+ "190": 11,
572
+ "191": 11,
573
+ "192": 11,
574
+ "193": 6,
575
+ "194": 5,
576
+ "195": 1
577
+ }
578
+ }
counts/sha_max.json ADDED
@@ -0,0 +1,74 @@
+ {
+     "4": {
+         "max": 4,
+         "name": [
+             "007.jpg"
+         ],
+         "x": 324,
+         "y": 176
+     },
+     "7": {
+         "max": 8,
+         "name": [
+             "034.jpg"
+         ],
+         "x": 271,
+         "y": 341
+     },
+     "8": {
+         "max": 10,
+         "name": [
+             "034.jpg"
+         ],
+         "x": 271,
+         "y": 340
+     },
+     "14": {
+         "max": 20,
+         "name": [
+             "120.jpg"
+         ],
+         "x": 295,
+         "y": 762
+     },
+     "16": {
+         "max": 28,
+         "name": [
+             "120.jpg"
+         ],
+         "x": 296,
+         "y": 760
+     },
+     "28": {
+         "max": 55,
+         "name": [
+             "120.jpg"
+         ],
+         "x": 303,
+         "y": 652
+     },
+     "32": {
+         "max": 66,
+         "name": [
+             "120.jpg"
+         ],
+         "x": 313,
+         "y": 651
+     },
+     "56": {
+         "max": 159,
+         "name": [
+             "120.jpg"
+         ],
+         "x": 301,
+         "y": 655
+     },
+     "64": {
+         "max": 195,
+         "name": [
+             "120.jpg"
+         ],
+         "x": 301,
+         "y": 657
+     }
+ }
counts/shb.json ADDED
@@ -0,0 +1,313 @@
1
+ {
2
+ "1": {
3
+ "0": 314523695,
4
+ "1": 49105
5
+ },
6
+ "4": {
7
+ "0": 311650011,
8
+ "1": 772635,
9
+ "2": 3256,
10
+ "3": 95,
11
+ "4": 3
12
+ },
13
+ "7": {
14
+ "0": 308004073,
15
+ "1": 2221020,
16
+ "2": 55546,
17
+ "3": 4833,
18
+ "4": 681,
19
+ "5": 181,
20
+ "6": 53,
21
+ "7": 10,
22
+ "8": 3
23
+ },
24
+ "8": {
25
+ "0": 306646552,
26
+ "1": 2818806,
27
+ "2": 96449,
28
+ "3": 10596,
29
+ "4": 1733,
30
+ "5": 447,
31
+ "6": 138,
32
+ "7": 57,
33
+ "8": 22
34
+ },
35
+ "14": {
36
+ "0": 297434203,
37
+ "1": 7041868,
38
+ "2": 636791,
39
+ "3": 139674,
40
+ "4": 42083,
41
+ "5": 15292,
42
+ "6": 6324,
43
+ "7": 3122,
44
+ "8": 1298,
45
+ "9": 595,
46
+ "10": 304,
47
+ "11": 225,
48
+ "12": 169,
49
+ "13": 45,
50
+ "14": 7
51
+ },
52
+ "16": {
53
+ "0": 294072360,
54
+ "1": 8559657,
55
+ "2": 922301,
56
+ "3": 225017,
57
+ "4": 75708,
58
+ "5": 29428,
59
+ "6": 12533,
60
+ "7": 6347,
61
+ "8": 3429,
62
+ "9": 1869,
63
+ "10": 913,
64
+ "11": 494,
65
+ "12": 338,
66
+ "13": 202,
67
+ "14": 192,
68
+ "15": 11,
69
+ "16": 1
70
+ },
71
+ "28": {
72
+ "0": 272510235,
73
+ "1": 17410504,
74
+ "2": 3291284,
75
+ "3": 1142143,
76
+ "4": 507297,
77
+ "5": 259215,
78
+ "6": 143543,
79
+ "7": 86057,
80
+ "8": 52776,
81
+ "9": 33818,
82
+ "10": 22305,
83
+ "11": 14778,
84
+ "12": 9902,
85
+ "13": 6909,
86
+ "14": 4829,
87
+ "15": 3511,
88
+ "16": 2765,
89
+ "17": 2161,
90
+ "18": 1627,
91
+ "19": 1396,
92
+ "20": 1075,
93
+ "21": 796,
94
+ "22": 639,
95
+ "23": 520,
96
+ "24": 375,
97
+ "25": 205,
98
+ "26": 92,
99
+ "27": 27,
100
+ "28": 10,
101
+ "29": 4,
102
+ "30": 2
103
+ },
104
+ "32": {
105
+ "0": 265135522,
106
+ "1": 20054326,
107
+ "2": 4219708,
108
+ "3": 1561515,
109
+ "4": 730071,
110
+ "5": 382477,
111
+ "6": 224559,
112
+ "7": 137037,
113
+ "8": 88156,
114
+ "9": 58687,
115
+ "10": 40153,
116
+ "11": 27989,
117
+ "12": 19367,
118
+ "13": 13555,
119
+ "14": 10126,
120
+ "15": 7417,
121
+ "16": 5593,
122
+ "17": 4242,
123
+ "18": 3235,
124
+ "19": 2714,
125
+ "20": 2136,
126
+ "21": 1687,
127
+ "22": 1343,
128
+ "23": 1093,
129
+ "24": 990,
130
+ "25": 881,
131
+ "26": 651,
132
+ "27": 428,
133
+ "28": 278,
134
+ "29": 173,
135
+ "30": 116,
136
+ "31": 83,
137
+ "32": 43,
138
+ "33": 36,
139
+ "34": 8,
140
+ "35": 3,
141
+ "36": 2
142
+ },
143
+ "56": {
144
+ "0": 222314024,
145
+ "1": 32191189,
146
+ "2": 9727123,
147
+ "3": 4342794,
148
+ "4": 2404979,
149
+ "5": 1505427,
150
+ "6": 1000917,
151
+ "7": 701563,
152
+ "8": 499165,
153
+ "9": 362489,
154
+ "10": 267104,
155
+ "11": 199980,
156
+ "12": 153876,
157
+ "13": 123592,
158
+ "14": 98575,
159
+ "15": 80346,
160
+ "16": 63904,
161
+ "17": 48447,
162
+ "18": 40380,
163
+ "19": 33358,
164
+ "20": 28391,
165
+ "21": 24691,
166
+ "22": 21645,
167
+ "23": 17519,
168
+ "24": 14226,
169
+ "25": 11839,
170
+ "26": 10556,
171
+ "27": 8884,
172
+ "28": 7573,
173
+ "29": 6473,
174
+ "30": 5818,
175
+ "31": 4784,
176
+ "32": 4100,
177
+ "33": 4039,
178
+ "34": 3497,
179
+ "35": 2721,
180
+ "36": 2238,
181
+ "37": 2208,
182
+ "38": 2072,
183
+ "39": 2096,
184
+ "40": 1750,
185
+ "41": 1466,
186
+ "42": 1404,
187
+ "43": 1196,
188
+ "44": 1138,
189
+ "45": 918,
190
+ "46": 786,
191
+ "47": 672,
192
+ "48": 698,
193
+ "49": 688,
194
+ "50": 610,
195
+ "51": 537,
196
+ "52": 469,
197
+ "53": 448,
198
+ "54": 346,
199
+ "55": 264,
200
+ "56": 198,
201
+ "57": 168,
202
+ "58": 131,
203
+ "59": 54,
204
+ "60": 28,
205
+ "61": 34,
206
+ "62": 22,
207
+ "63": 10,
208
+ "64": 18,
209
+ "65": 17,
210
+ "66": 16,
211
+ "67": 25,
212
+ "68": 21,
213
+ "69": 25,
214
+ "70": 11,
215
+ "71": 13,
216
+ "72": 7,
217
+ "73": 2,
218
+ "74": 4,
219
+ "76": 4
220
+ },
221
+ "64": {
222
+ "0": 209048823,
223
+ "1": 34905056,
224
+ "2": 11413735,
225
+ "3": 5278103,
226
+ "4": 2980067,
227
+ "5": 1886714,
228
+ "6": 1308620,
229
+ "7": 945805,
230
+ "8": 684080,
231
+ "9": 516549,
232
+ "10": 387772,
233
+ "11": 301510,
234
+ "12": 234031,
235
+ "13": 186750,
236
+ "14": 149049,
237
+ "15": 124290,
238
+ "16": 101853,
239
+ "17": 81550,
240
+ "18": 68680,
241
+ "19": 55441,
242
+ "20": 45411,
243
+ "21": 39050,
244
+ "22": 33804,
245
+ "23": 30803,
246
+ "24": 24284,
247
+ "25": 20547,
248
+ "26": 17358,
249
+ "27": 14546,
250
+ "28": 12847,
251
+ "29": 11443,
252
+ "30": 9852,
253
+ "31": 8715,
254
+ "32": 7569,
255
+ "33": 6927,
256
+ "34": 6284,
257
+ "35": 5688,
258
+ "36": 4647,
259
+ "37": 4476,
260
+ "38": 3947,
261
+ "39": 3756,
262
+ "40": 3232,
263
+ "41": 2883,
264
+ "42": 2580,
265
+ "43": 2338,
266
+ "44": 2092,
267
+ "45": 1930,
268
+ "46": 1670,
269
+ "47": 1514,
270
+ "48": 1470,
271
+ "49": 1361,
272
+ "50": 1267,
273
+ "51": 1218,
274
+ "52": 939,
275
+ "53": 852,
276
+ "54": 738,
277
+ "55": 662,
278
+ "56": 628,
279
+ "57": 690,
280
+ "58": 495,
281
+ "59": 508,
282
+ "60": 441,
283
+ "61": 401,
284
+ "62": 333,
285
+ "63": 314,
286
+ "64": 194,
287
+ "65": 130,
288
+ "66": 108,
289
+ "67": 108,
290
+ "68": 91,
291
+ "69": 72,
292
+ "70": 32,
293
+ "71": 29,
294
+ "72": 32,
295
+ "73": 20,
296
+ "74": 17,
297
+ "75": 11,
298
+ "76": 21,
299
+ "77": 15,
300
+ "78": 17,
301
+ "79": 21,
302
+ "80": 20,
303
+ "81": 13,
304
+ "82": 17,
305
+ "83": 9,
306
+ "84": 8,
307
+ "85": 5,
308
+ "86": 10,
309
+ "87": 3,
310
+ "88": 5,
311
+ "89": 4
312
+ }
313
+ }
counts/shb_max.json ADDED
@@ -0,0 +1,74 @@
+ {
+     "4": {
+         "max": 4,
+         "name": [
+             "148.jpg"
+         ],
+         "x": 40,
+         "y": 549
+     },
+     "7": {
+         "max": 8,
+         "name": [
+             "200.jpg"
+         ],
+         "x": 275,
+         "y": 37
+     },
+     "8": {
+         "max": 8,
+         "name": [
+             "148.jpg"
+         ],
+         "x": 39,
+         "y": 550
+     },
+     "14": {
+         "max": 14,
+         "name": [
+             "200.jpg"
+         ],
+         "x": 269,
+         "y": 37
+     },
+     "16": {
+         "max": 16,
+         "name": [
+             "191.jpg"
+         ],
+         "x": 1,
+         "y": 257
+     },
+     "28": {
+         "max": 30,
+         "name": [
+             "191.jpg"
+         ],
+         "x": 0,
+         "y": 257
+     },
+     "32": {
+         "max": 36,
+         "name": [
+             "191.jpg"
+         ],
+         "x": 0,
+         "y": 256
+     },
+     "56": {
+         "max": 76,
+         "name": [
+             "191.jpg"
+         ],
+         "x": 0,
+         "y": 256
+     },
+     "64": {
+         "max": 89,
+         "name": [
+             "191.jpg"
+         ],
+         "x": 1,
+         "y": 254
+     }
+ }
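
The companion `counts/*_max.json` files record, per block size, the largest per-block count observed, together with the image(s) attaining it and a coordinate pair. A minimal sketch for inspecting one of them; reading `x`/`y` as the location of that densest block is an assumption based on the fields above:

```python
import json

with open("counts/shb_max.json") as f:
    max_info = json.load(f)

for block_size, info in max_info.items():
    # info["name"] is a list of image file names attaining the maximum count.
    print(f"{block_size}x{block_size}: max count {info['max']} "
          f"in {', '.join(info['name'])} at (x={info['x']}, y={info['y']})")
```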
datasets/__init__.py ADDED
@@ -0,0 +1,12 @@
+ from .crowd import Crowd, InMemoryCrowd, available_datasets, standardize_dataset_name, NWPUTest, ShanghaiTech
+ from .transforms import RandomCrop, Resize, RandomResizedCrop, RandomHorizontalFlip, Resize2Multiple, ZeroPad2Multiple
+ from .transforms import ColorJitter, RandomGrayscale, GaussianBlur, RandomApply, PepperSaltNoise
+ from .utils import collate_fn
+
+
+ __all__ = [
+     "Crowd", "InMemoryCrowd", "available_datasets", "standardize_dataset_name", "NWPUTest", "ShanghaiTech",
+     "RandomCrop", "Resize", "RandomResizedCrop", "RandomHorizontalFlip", "Resize2Multiple", "ZeroPad2Multiple",
+     "ColorJitter", "RandomGrayscale", "GaussianBlur", "RandomApply", "PepperSaltNoise",
+     "collate_fn",
+ ]
datasets/crowd.py ADDED
@@ -0,0 +1,309 @@
1
+ import torch
2
+ from torch import Tensor
3
+ from torch.utils.data import Dataset
4
+ from torchvision.transforms import ToTensor, Normalize, Compose
5
+ import os
6
+ from glob import glob
7
+ from tqdm import tqdm
8
+ # from PIL import Image
9
+ from turbojpeg import TurboJPEG, TJPF_RGB
10
+ jpeg_decoder = TurboJPEG()
11
+
12
+ import numpy as np
13
+ from typing import Optional, Callable, Union, Tuple
14
+
15
+ from .utils import get_id, generate_density_map
16
+
17
+ curr_dir = os.path.dirname(os.path.abspath(__file__))
18
+
19
+ available_datasets = [
20
+ "shanghaitech_a", "sha",
21
+ "shanghaitech_b", "shb",
22
+ "shanghaitech", "sh",
23
+ "ucf_qnrf", "qnrf", "ucf-qnrf",
24
+ "nwpu", "nwpu_crowd", "nwpu-crowd",
25
+ ]
26
+
27
+ mean = (0.48145466, 0.4578275, 0.40821073)
28
+ std = (0.26862954, 0.26130258, 0.27577711)
29
+
30
+
31
+ def standardize_dataset_name(dataset: str) -> str:
32
+ assert dataset.lower() in available_datasets, f"Dataset {dataset} is not available."
33
+ if dataset.lower() in ["shanghaitech_a", "sha"]:
34
+ return "sha"
35
+ elif dataset.lower() in ["shanghaitech_b", "shb"]:
36
+ return "shb"
37
+ elif dataset.lower() in ["shanghaitech", "sh"]:
38
+ return "sh"
39
+ elif dataset.lower() in ["ucf_qnrf", "qnrf", "ucf-qnrf"]:
40
+ return "qnrf"
41
+ else:
42
+ assert dataset.lower() in ["nwpu", "nwpu_crowd", "nwpu-crowd"], f"Dataset {dataset} is not available."
43
+ return "nwpu"
44
+
45
+
46
+ class Crowd(Dataset):
47
+ def __init__(
48
+ self,
49
+ dataset: str,
50
+ split: str,
51
+ transforms: Optional[Callable] = None,
52
+ sigma: Optional[float] = None,
53
+ return_filename: bool = False,
54
+ num_crops: int = 1,
55
+ ) -> None:
56
+ """
57
+ Dataset for crowd counting.
58
+ """
59
+ assert dataset.lower() in available_datasets, f"Dataset {dataset} is not available."
60
+ assert dataset.lower() not in ["shanghaitech", "sh"], "For the combined ShanghaiTech dataset, use ShanghaiTech class."
61
+ assert split in ["train", "val", "test"], f"Split {split} is not available."
62
+ assert num_crops > 0, f"num_crops should be positive, got {num_crops}."
63
+
64
+ self.dataset = standardize_dataset_name(dataset)
65
+ self.split = split
66
+
67
+ self.__find_root__()
68
+ self.__make_dataset__()
69
+ self.__check_sanity__()
70
+
71
+ self.to_tensor = ToTensor()
72
+ self.normalize = Normalize(mean=mean, std=std)
73
+ self.transforms = transforms
74
+
75
+ self.sigma = sigma
76
+ self.return_filename = return_filename
77
+ self.num_crops = num_crops
78
+
79
+ def __find_root__(self) -> None:
80
+ self.root = os.path.join(curr_dir, "..", "data", self.dataset)
81
+
82
+ def __make_dataset__(self) -> None:
83
+ image_names = glob(os.path.join(self.root, self.split, "images", "*.jpg"))
84
+
85
+ label_names = glob(os.path.join(self.root, self.split, "labels", "*.npy"))
86
+ image_names = [os.path.basename(image_name) for image_name in image_names]
87
+ label_names = [os.path.basename(label_name) for label_name in label_names]
88
+ image_names.sort(key=get_id)
89
+ label_names.sort(key=get_id)
90
+ image_ids = tuple([get_id(image_name) for image_name in image_names])
91
+ label_ids = tuple([get_id(label_name) for label_name in label_names])
92
+ assert image_ids == label_ids, "image_ids and label_ids do not match."
93
+ self.image_names = tuple(image_names)
94
+ self.label_names = tuple(label_names)
95
+
96
+ def __check_sanity__(self) -> None:
97
+ if self.dataset == "sha":
98
+ if self.split == "train":
99
+ assert len(self.image_names) == len(self.label_names) == 300, f"ShanghaiTech_A train split should have 300 images, but found {len(self.image_names)}."
100
+ else:
101
+ assert self.split == "val", f"Split {self.split} is not available for dataset {self.dataset}."
102
+ assert len(self.image_names) == len(self.label_names) == 182, f"ShanghaiTech_A val split should have 182 images, but found {len(self.image_names)}."
103
+ elif self.dataset == "shb":
104
+ if self.split == "train":
105
+ assert len(self.image_names) == len(self.label_names) == 399, f"ShanghaiTech_B train split should have 399 images, but found {len(self.image_names)}."
106
+ else:
107
+ assert self.split == "val", f"Split {self.split} is not available for dataset {self.dataset}."
108
+ assert len(self.image_names) == len(self.label_names) == 316, f"ShanghaiTech_B val split should have 316 images, but found {len(self.image_names)}."
109
+ elif self.dataset == "nwpu":
110
+ if self.split == "train":
111
+ assert len(self.image_names) == len(self.label_names) == 3109, f"NWPU train split should have 3109 images, but found {len(self.image_names)}."
112
+ else:
113
+ assert self.split == "val", f"Split {self.split} is not available for dataset {self.dataset}."
114
+ assert len(self.image_names) == len(self.label_names) == 500, f"NWPU val split should have 500 images, but found {len(self.image_names)}."
115
+ elif self.dataset == "qnrf":
116
+ if self.split == "train":
117
+ assert len(self.image_names) == len(self.label_names) == 1201, f"UCF_QNRF train split should have 1201 images, but found {len(self.image_names)}."
118
+ else:
119
+ assert self.split == "val", f"Split {self.split} is not available for dataset {self.dataset}."
120
+ assert len(self.image_names) == len(self.label_names) == 334, f"UCF_QNRF val split should have 334 images, but found {len(self.image_names)}."
121
+
122
+ def __len__(self) -> int:
123
+ return len(self.image_names)
124
+
125
+ def __getitem__(self, idx: int) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, str]]:
126
+ image_name = self.image_names[idx]
127
+ label_name = self.label_names[idx]
128
+
129
+ image_path = os.path.join(self.root, self.split, "images", image_name)
130
+ label_path = os.path.join(self.root, self.split, "labels", label_name)
131
+
132
+ with open(image_path, "rb") as f:
133
+ # image = Image.open(f).convert("RGB")
134
+ image = jpeg_decoder.decode(f.read(), pixel_format=TJPF_RGB)
135
+ image = self.to_tensor(image)
136
+
137
+ with open(label_path, "rb") as f:
138
+ label = np.load(f)
139
+ label = torch.from_numpy(label).float()
140
+
141
+ if self.transforms is not None:
142
+ images_labels = [self.transforms(image.clone(), label.clone()) for _ in range(self.num_crops)]
143
+ images, labels = zip(*images_labels)
144
+ else:
145
+ images = [image.clone() for _ in range(self.num_crops)]
146
+ labels = [label.clone() for _ in range(self.num_crops)]
147
+
148
+ images = [self.normalize(img) for img in images]
149
+ density_maps = torch.stack([generate_density_map(label, image.shape[-2], image.shape[-1], sigma=self.sigma) for image, label in zip(images, labels)], 0)
150
+ image_names = [image_name] * len(images)
151
+ images = torch.stack(images, 0)
152
+
153
+ if self.return_filename:
154
+ return images, labels, density_maps, image_names
155
+ else:
156
+ return images, labels, density_maps
157
+
158
+
159
+ class InMemoryCrowd(Dataset):
160
+ def __init__(
161
+ self,
162
+ dataset: str,
163
+ split: str,
164
+ transforms: Optional[Callable] = None,
165
+ sigma: Optional[float] = None,
166
+ return_filename: bool = False,
167
+ num_crops: int = 1,
168
+ ) -> None:
169
+ """
170
+ Dataset for crowd counting, with images and labels loaded into memory.
171
+ """
172
+ crowd = Crowd(
173
+ dataset=dataset,
174
+ split=split,
175
+ transforms=None,
176
+ sigma=sigma,
177
+ return_filename=True,
178
+ num_crops=1,
179
+ )
180
+ print(f"Loading {len(crowd)} samples from {dataset} {split} split into memory...")
181
+ self.images, self.labels, self.image_names = [], [], []
182
+ self.unnormalize = Compose([
183
+ Normalize(mean=(0., 0., 0.), std=(1./std[0], 1./std[1], 1./std[2]), inplace=True),
184
+ Normalize(mean=(-mean[0], -mean[1], -mean[2]), std=(1., 1., 1.), inplace=True)
185
+ ])
186
+
187
+ for i in tqdm(range(len(crowd)), desc="Loading images and labels into memory"):
188
+ image, label, _, image_name = crowd[i]
189
+ self.images.append(self.unnormalize(image[0])) # recover original image
190
+ self.labels.append(label[0])
191
+ self.image_names.append(image_name[0])
192
+
193
+ assert len(self.images) == len(self.labels) == len(self.image_names), "Mismatch in number of images, labels, and image names."
194
+
195
+ self.transforms = transforms
196
+ self.sigma = sigma
197
+ self.num_crops = num_crops
198
+ self.return_filename = return_filename
199
+ self.normalize = Normalize(mean=mean, std=std, inplace=False)
200
+
201
+ def __len__(self) -> int:
202
+ return len(self.images)
203
+
204
+ def __getitem__(self, idx: int) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, str]]:
205
+ image, label, image_name = self.images[idx].clone(), self.labels[idx].clone(), self.image_names[idx]
206
+
207
+ if self.transforms is not None:
208
+ images_labels = [self.transforms(image.clone(), label.clone()) for _ in range(self.num_crops)]
209
+ images, labels = zip(*images_labels)
210
+ else:
211
+ images = [image.clone() for _ in range(self.num_crops)]
212
+ labels = [label.clone() for _ in range(self.num_crops)]
213
+
214
+ images = [self.normalize(img) for img in images]
215
+ density_maps = torch.stack([generate_density_map(label, image.shape[-2], image.shape[-1], sigma=self.sigma) for image, label in zip(images, labels)], 0)
216
+ image_names = [image_name] * len(images)
217
+ images = torch.stack(images, 0)
218
+
219
+ if self.return_filename:
220
+ return images, labels, density_maps, image_names
221
+ else:
222
+ return images, labels, density_maps
223
+
224
+
225
+ class NWPUTest(Dataset):
226
+ def __init__(
227
+ self,
228
+ transforms: Optional[Callable] = None,
229
+ return_filename: bool = False,
230
+ ) -> None:
231
+ """
232
+ The test set of NWPU-Crowd dataset. The test set is not labeled, so only images are returned.
233
+ """
234
+ self.root = os.path.join(curr_dir, "..", "data", "nwpu")
235
+ image_names = glob(os.path.join(self.root, "test", "images", "*.jpg"))
236
+
237
+ image_names = [os.path.basename(image_name) for image_name in image_names]
238
+ assert len(image_names) == 1500, f"NWPU test split should have 1500 images, but found {len(image_names)}."
239
+ image_names.sort(key=get_id)
240
+ self.image_names = tuple(image_names)
241
+
242
+ self.to_tensor = ToTensor()
243
+ self.normalize = Normalize(mean=mean, std=std)
244
+ self.transforms = transforms
245
+ self.return_filename = return_filename
246
+
247
+ def __len__(self) -> int:
248
+ return len(self.image_names)
249
+
250
+ def __getitem__(self, idx: int) -> Union[Tensor, Tuple[Tensor, str]]:
251
+ image_name = self.image_names[idx]
252
+ image_path = os.path.join(self.root, "test", "images", image_name)
253
+
254
+ with open(image_path, "rb") as f:
255
+ # image = Image.open(f).convert("RGB")
256
+ image = jpeg_decoder.decode(f.read(), pixel_format=TJPF_RGB)
257
+ image = self.to_tensor(image)
258
+
259
+ label = torch.tensor([], dtype=torch.float) # dummy label
260
+ image, _ = self.transforms(image, label) if self.transforms is not None else (image, label)
261
+ image = self.normalize(image)
262
+
263
+ if self.return_filename:
264
+ return image, image_name
265
+ else:
266
+ return image
267
+
268
+
269
+ class ShanghaiTech(Dataset):
270
+ def __init__(
271
+ self,
272
+ split: str,
273
+ transforms: Optional[Callable] = None,
274
+ sigma: Optional[float] = None,
275
+ return_filename: bool = False,
276
+ num_crops: int = 1,
277
+ ) -> None:
278
+ super().__init__()
279
+ self.sha = Crowd(
280
+ dataset="sha",
281
+ split=split,
282
+ transforms=transforms,
283
+ sigma=sigma,
284
+ return_filename=return_filename,
285
+ num_crops=num_crops,
286
+ )
287
+ self.shb = Crowd(
288
+ dataset="shb",
289
+ split=split,
290
+ transforms=transforms,
291
+ sigma=sigma,
292
+ return_filename=return_filename,
293
+ num_crops=num_crops,
294
+ )
295
+ self.dataset = "sh"
296
+ self.split = split
297
+ self.transforms = transforms
298
+ self.sigma = sigma
299
+ self.return_filename = return_filename
300
+ self.num_crops = num_crops
301
+
302
+ def __len__(self) -> int:
303
+ return len(self.sha) + len(self.shb)
304
+
305
+ def __getitem__(self, idx: int) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor, Tensor, str]]:
306
+ if idx < len(self.sha):
307
+ return self.sha[idx]
308
+ else:
309
+ return self.shb[idx - len(self.sha)]
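
A minimal usage sketch for the `Crowd` dataset defined above, assuming the `data/<dataset>/<split>/{images,labels}` layout the class expects; the dataset name, crop size, and loader settings are illustrative only:

```python
from torch.utils.data import DataLoader
from datasets import Crowd, RandomApply, RandomCrop, RandomHorizontalFlip, collate_fn

# Paired transforms: each callable receives and returns (image, label).
transforms = RandomApply(
    (RandomCrop((448, 448)), RandomHorizontalFlip(p=0.5)),
    p=(1.0, 0.5),  # always crop, flip half of the time
)

train_set = Crowd(dataset="shb", split="train", transforms=transforms, sigma=None, num_crops=1)
loader = DataLoader(train_set, batch_size=8, shuffle=True, num_workers=4, collate_fn=collate_fn)

images, points, densities = next(iter(loader))
# images: (8, 3, 448, 448); points: list of 8 Nx2 tensors; densities: (8, 1, 448, 448)
```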
datasets/transforms.py ADDED
@@ -0,0 +1,262 @@
1
+ import torch
2
+ from torch import Tensor
3
+ from torchvision.transforms import ColorJitter as _ColorJitter
4
+ import torchvision.transforms.functional as TF
5
+ import numpy as np
6
+ from typing import Tuple, Union, Optional, Callable
7
+
8
+
9
+ def _crop(
10
+ image: Tensor,
11
+ label: Tensor,
12
+ top: int,
13
+ left: int,
14
+ height: int,
15
+ width: int,
16
+ ) -> Tuple[Tensor, Tensor]:
17
+ image = TF.crop(image, top, left, height, width)
18
+ if len(label) > 0:
19
+ label[:, 0] -= left
20
+ label[:, 1] -= top
21
+ label_mask = (label[:, 0] >= 0) & (label[:, 0] < width) & (label[:, 1] >= 0) & (label[:, 1] < height)
22
+ label = label[label_mask]
23
+
24
+ return image, label
25
+
26
+
27
+ def _resize(
28
+ image: Tensor,
29
+ label: Tensor,
30
+ height: int,
31
+ width: int,
32
+ ) -> Tuple[Tensor, Tensor]:
33
+ image_height, image_width = image.shape[-2:]
34
+ image = TF.resize(image, (height, width), interpolation=TF.InterpolationMode.BICUBIC, antialias=True) if (image_height != height or image_width != width) else image
35
+ if len(label) > 0 and (image_height != height or image_width != width):
36
+ label[:, 0] = label[:, 0] * width / image_width
37
+ label[:, 1] = label[:, 1] * height / image_height
38
+ label[:, 0] = label[:, 0].clamp(min=0, max=width - 1)
39
+ label[:, 1] = label[:, 1].clamp(min=0, max=height - 1)
40
+
41
+ return image, label
42
+
43
+
44
+ class RandomCrop(object):
45
+ def __init__(self, size: Tuple[int, int]) -> None:
46
+ self.size = size
47
+ assert len(self.size) == 2, f"size should be a tuple (h, w), got {self.size}."
48
+
49
+ def __call__(self, image: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
50
+ crop_height, crop_width = self.size
51
+ image_height, image_width = image.shape[-2:]
52
+ assert crop_height <= image_height and crop_width <= image_width, \
53
+ f"crop size should be no larger than image size, got crop size {self.size} and image size {image.shape}."
54
+
55
+ top = torch.randint(0, image_height - crop_height + 1, (1,)).item()
56
+ left = torch.randint(0, image_width - crop_width + 1, (1,)).item()
57
+ return _crop(image, label, top, left, crop_height, crop_width)
58
+
59
+
60
+ class Resize(object):
61
+ def __init__(self, size: Tuple[int, int]) -> None:
62
+ self.size = size
63
+ assert len(self.size) == 2, f"size should be a tuple (h, w), got {self.size}."
64
+
65
+ def __call__(self, image: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
66
+ return _resize(image, label, self.size[0], self.size[1])
67
+
68
+
69
+ class Resize2Multiple(object):
70
+ """
71
+ Resize the image so that it satisfies:
72
+ img_h = window_h + stride_h * n_h
73
+ img_w = window_w + stride_w * n_w
74
+ """
75
+ def __init__(
76
+ self,
77
+ window_size: Tuple[int, int],
78
+ stride: Tuple[int, int],
79
+ ) -> None:
80
+ window_size = (int(window_size), int(window_size)) if isinstance(window_size, (int, float)) else window_size
81
+ window_size = tuple(window_size)
82
+ stride = (int(stride), int(stride)) if isinstance(stride, (int, float)) else stride
83
+ stride = tuple(stride)
84
+ assert len(window_size) == 2, f"window_size should be a tuple (h, w), got {window_size}."
85
+ assert len(stride) == 2, f"stride should be a tuple (h, w), got {stride}."
86
+ assert all(s > 0 for s in window_size), f"window_size should be positive, got {window_size}."
87
+ assert all(s > 0 for s in stride), f"stride should be positive, got {stride}."
88
+ assert stride[0] <= window_size[0] and stride[1] <= window_size[1], f"stride should be no larger than window_size, got {stride} and {window_size}."
89
+ self.window_size = window_size
90
+ self.stride = stride
91
+
92
+ def __call__(self, image: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
93
+ image_height, image_width = image.shape[-2:]
94
+ window_height, window_width = self.window_size
95
+ stride_height, stride_width = self.stride
96
+ new_height = int(max(round((image_height - window_height) / stride_height), 0) * stride_height + window_height)
97
+ new_width = int(max(round((image_width - window_width) / stride_width), 0) * stride_width + window_width)
98
+
99
+ if new_height == image_height and new_width == image_width:
100
+ return image, label
101
+ else:
102
+ return _resize(image, label, new_height, new_width)
103
+
104
+
105
+ class ZeroPad2Multiple(object):
106
+ def __init__(
107
+ self,
108
+ window_size: Tuple[int, int],
109
+ stride: Tuple[int, int],
110
+ ) -> None:
111
+ window_size = (int(window_size), int(window_size)) if isinstance(window_size, (int, float)) else window_size
112
+ window_size = tuple(window_size)
113
+ stride = (int(stride), int(stride)) if isinstance(stride, (int, float)) else stride
114
+ stride = tuple(stride)
115
+ assert len(window_size) == 2, f"window_size should be a tuple (h, w), got {window_size}."
116
+ assert len(stride) == 2, f"stride should be a tuple (h, w), got {stride}."
117
+ assert all(s > 0 for s in window_size), f"window_size should be positive, got {window_size}."
118
+ assert all(s > 0 for s in stride), f"stride should be positive, got {stride}."
119
+ assert stride[0] <= window_size[0] and stride[1] <= window_size[1], f"stride should be no larger than window_size, got {stride} and {window_size}."
120
+ self.window_size = window_size
121
+ self.stride = stride
122
+
123
+ def __call__(self, image: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
124
+ image_height, image_width = image.shape[-2:]
125
+ window_height, window_width = self.window_size
126
+ stride_height, stride_width = self.stride
127
+ new_height = int(max(np.ceil((image_height - window_height) / stride_height), 0) * stride_height + window_height)
128
+ new_width = int(max(np.ceil((image_width - window_width) / stride_width), 0) * stride_width + window_width)
129
+
130
+ if new_height == image_height and new_width == image_width:
131
+ return image, label
132
+ else:
133
+ assert new_height >= image_height and new_width >= image_width, f"new size should be no less than the original size, got {new_height} and {new_width}."
134
+ pad_height, pad_width = new_height - image_height, new_width - image_width
135
+ return TF.pad(image, (0, 0, pad_width, pad_height), fill=0), label # only pad the right and bottom sides so that the label coordinates are not affected
136
+
137
+
138
+ class RandomResizedCrop(object):
139
+ def __init__(
140
+ self,
141
+ size: Tuple[int, int],
142
+ scale: Tuple[float, float] = (0.75, 1.25),
143
+ ) -> None:
144
+ """
145
+ Randomly crop an image and resize it to a given size. The aspect ratio is preserved during this process.
146
+ """
147
+ self.size = size
148
+ self.scale = scale
149
+ assert len(self.size) == 2, f"size should be a tuple (h, w), got {self.size}."
150
+ assert 0 < self.scale[0] <= self.scale[1], f"scale should satisfy 0 < scale[0] <= scale[1], got {self.scale}."
151
+
152
+ def __call__(self, image: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
153
+ out_height, out_width = self.size
154
+ # out_ratio = out_width / out_height
155
+
156
+ scale = torch.empty(1).uniform_(self.scale[0], self.scale[1]).item() # if scale < 1, then the image will be zoomed in, otherwise zoomed out
157
+ in_height, in_width = image.shape[-2:]
158
+
159
+ # if in_width / in_height < out_ratio: # Image is too tall
160
+ # crop_width = int(in_width * scale)
161
+ # crop_height = int(crop_width / out_ratio)
162
+ # else: # Image is too wide
163
+ # crop_height = int(in_height * scale)
164
+ # crop_width = int(crop_height * out_ratio)
165
+
166
+ crop_height, crop_width = int(out_height * scale), int(out_width * scale)
167
+
168
+ if crop_height <= in_height and crop_width <= in_width: # directly crop and resize the image
169
+ top = torch.randint(0, in_height - crop_height + 1, (1,)).item()
170
+ left = torch.randint(0, in_width - crop_width + 1, (1,)).item()
171
+
172
+ else: # resize the image and then crop
173
+ ratio = max(crop_height / in_height, crop_width / in_width) # keep the aspect ratio
174
+ resize_height, resize_width = int(in_height * ratio) + 1, int(in_width * ratio) + 1 # add 1 to make sure the resized image is no less than the crop size
175
+ image, label = _resize(image, label, resize_height, resize_width)
176
+
177
+ top = torch.randint(0, resize_height - crop_height + 1, (1,)).item()
178
+ left = torch.randint(0, resize_width - crop_width + 1, (1,)).item()
179
+
180
+ image, label = _crop(image, label, top, left, crop_height, crop_width)
181
+ return _resize(image, label, out_height, out_width)
182
+
183
+
184
+ class RandomHorizontalFlip(object):
185
+ def __init__(self, p: float = 0.5) -> None:
186
+ self.p = p
187
+ assert 0 <= self.p <= 1, f"p should be in range [0, 1], got {self.p}."
188
+
189
+ def __call__(self, image: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
190
+ if torch.rand(1) < self.p:
191
+ image = TF.hflip(image)
192
+
193
+ if len(label) > 0:
194
+ label[:, 0] = image.shape[-1] - 1 - label[:, 0] # if width is 256, then 0 -> 255, 1 -> 254, 2 -> 253, etc.
195
+ label[:, 0] = label[:, 0].clamp(min=0, max=image.shape[-1] - 1)
196
+
197
+ return image, label
198
+
199
+
200
+ class ColorJitter(object):
201
+ def __init__(
202
+ self,
203
+ brightness: Union[float, Tuple[float, float]] = 0.4,
204
+ contrast: Union[float, Tuple[float, float]] = 0.4,
205
+ saturation: Union[float, Tuple[float, float]] = 0.4,
206
+ hue: Union[float, Tuple[float, float]] = 0.2,
207
+ ) -> None:
208
+ self.color_jitter = _ColorJitter(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue)
209
+
210
+ def __call__(self, image: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
211
+ return self.color_jitter(image), label
212
+
213
+
214
+ class RandomGrayscale(object):
215
+ def __init__(self, p: float = 0.1) -> None:
216
+ self.p = p
217
+ assert 0 <= self.p <= 1, f"p should be in range [0, 1], got {self.p}."
218
+
219
+ def __call__(self, image: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
220
+ if torch.rand(1) < self.p:
221
+ image = TF.rgb_to_grayscale(image, num_output_channels=3)
222
+
223
+ return image, label
224
+
225
+
226
+ class GaussianBlur(object):
227
+ def __init__(self, kernel_size: int, sigma: Tuple[float, float] = (0.1, 2.0)) -> None:
228
+ self.kernel_size = kernel_size
229
+ self.sigma = sigma
230
+
231
+ def __call__(self, image: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
232
+ return TF.gaussian_blur(image, self.kernel_size, self.sigma), label
233
+
234
+
235
+ class RandomApply(object):
236
+ def __init__(self, transforms: Tuple[Callable, ...], p: Union[float, Tuple[float, ...]] = 0.5) -> None:
237
+ self.transforms = transforms
238
+ p = [p] * len(transforms) if isinstance(p, float) else p
239
+ assert all(0 <= p_ <= 1 for p_ in p), f"p should be in range [0, 1], got {p}."
240
+ assert len(p) == len(transforms), f"p should be a float or a tuple of floats with the same length as transforms, got {p}."
241
+ self.p = p
242
+
243
+ def __call__(self, image: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
244
+ for transform, p in zip(self.transforms, self.p):
245
+ if torch.rand(1) < p:
246
+ image, label = transform(image, label)
247
+
248
+ return image, label
249
+
250
+
251
+ class PepperSaltNoise(object):
252
+ def __init__(self, saltiness: float = 0.001, spiciness: float = 0.001) -> None:
253
+ self.saltiness = saltiness
254
+ self.spiciness = spiciness
255
+ assert 0 <= self.saltiness <= 1, f"saltiness should be in range [0, 1], got {self.saltiness}."
256
+ assert 0 <= self.spiciness <= 1, f"spiciness should be in range [0, 1], got {self.spiciness}."
257
+
258
+ def __call__(self, image: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
259
+ noise = torch.rand_like(image)
260
+ image = torch.where(noise < self.saltiness, 1., image) # Salt
261
+ image = torch.where(noise > 1 - self.spiciness, 0., image) # Pepper
262
+ return image, label
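
A minimal sketch of the paired `(image, label)` convention used by the transforms above: the label is an N x 2 tensor of (x, y) head coordinates, geometric transforms move the points together with the pixels, and photometric ones leave them untouched. Shapes and probabilities below are illustrative:

```python
import torch
from datasets.transforms import ColorJitter, GaussianBlur, PepperSaltNoise, RandomApply, RandomResizedCrop

image = torch.rand(3, 768, 1024)
label = torch.tensor([[100.0, 200.0], [512.0, 384.0]])  # (x, y) point annotations

photometric = RandomApply(
    (ColorJitter(), GaussianBlur(kernel_size=5), PepperSaltNoise()),
    p=(0.8, 0.2, 0.2),
)
geometric = RandomResizedCrop(size=(448, 448), scale=(0.75, 1.25))

image, label = photometric(image, label)  # pixels change, points do not
image, label = geometric(image, label)    # points are shifted/rescaled and filtered by the crop
print(image.shape, label.shape)           # torch.Size([3, 448, 448]) and the surviving points
```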
datasets/utils.py ADDED
@@ -0,0 +1,63 @@
+ import torch
+ from torch import Tensor
+ from scipy.ndimage import gaussian_filter
+ from typing import Optional, List, Tuple
+
+
+ def get_id(x: str) -> int:
+     return int(x.split(".")[0])
+
+
+ def generate_density_map(label: Tensor, height: int, width: int, sigma: Optional[float] = None) -> Tensor:
+     """
+     Generate the density map based on the dot annotations provided by the label.
+     """
+     density_map = torch.zeros((1, height, width), dtype=torch.float32)
+
+     if len(label) > 0:
+         assert len(label.shape) == 2 and label.shape[1] == 2, f"label should be a Nx2 tensor, got {label.shape}."
+         label_ = label.long()
+         label_[:, 0] = label_[:, 0].clamp(min=0, max=width - 1)
+         label_[:, 1] = label_[:, 1].clamp(min=0, max=height - 1)
+         density_map[0, label_[:, 1], label_[:, 0]] = 1.0
+
+     if sigma is not None:
+         assert sigma > 0, f"sigma should be positive if not None, got {sigma}."
+         density_map = torch.from_numpy(gaussian_filter(density_map, sigma=sigma))
+
+     return density_map
+
+
+ def collate_fn(batch: List[Tensor]) -> Tuple[Tensor, List[Tensor], Tensor]:
+     batch = list(zip(*batch))
+     images = batch[0]
+     assert len(images[0].shape) == 4, f"images should be a 4D tensor, got {images[0].shape}."
+     if len(batch) == 4: # image, label, density_map, image_name
+         images = torch.cat(images, 0)
+         points = batch[1] # list of lists of tensors, flatten it
+         points = [p for points_ in points for p in points_]
+         densities = torch.cat(batch[2], 0)
+         image_names = batch[3] # list of lists of strings, flatten it
+         image_names = [name for names_ in image_names for name in names_]
+
+         return images, points, densities, image_names
+
+     elif len(batch) == 3: # image, label, density_map
+         images = torch.cat(images, 0)
+         points = batch[1]
+         points = [p for points_ in points for p in points_]
+         densities = torch.cat(batch[2], 0)
+
+         return images, points, densities
+
+     elif len(batch) == 2: # image, image_name. NWPU test dataset
+         images = torch.cat(images, 0)
+         image_names = batch[1]
+         image_names = [name for names_ in image_names for name in names_]
+
+         return images, image_names
+
+     else:
+         images = torch.cat(images, 0)
+
+         return images
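
A minimal sketch of `generate_density_map` above: each annotated point marks one pixel with 1.0 (points that round to the same pixel collapse into a single 1), and the optional Gaussian filter spreads that mass while approximately preserving the total. The coordinates below are illustrative:

```python
import torch
from datasets.utils import generate_density_map

points = torch.tensor([[10.0, 20.0], [10.4, 20.2], [300.0, 150.0]])  # (x, y) coordinates
dm_hard = generate_density_map(points, height=256, width=512, sigma=None)
dm_soft = generate_density_map(points, height=256, width=512, sigma=4.0)

print(dm_hard.shape)          # torch.Size([1, 256, 512])
print(dm_hard.sum().item())   # 2.0: the first two points fall into the same pixel
print(dm_soft.sum().item())   # ~2.0: smoothing redistributes mass, it does not add any
```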
efficiency.py ADDED
@@ -0,0 +1,163 @@
1
+ from argparse import ArgumentParser
2
+ import time
3
+ import os
4
+ import torch
5
+ import torchvision.transforms as transforms
6
+ from contextlib import nullcontext
7
+ import json
8
+ from models import get_model
9
+
10
+
11
+ parser = ArgumentParser(description="Train an EBC model.")
12
+ parser.add_argument("--model_info_path", type=str, required=True, help="Path to the model information file.")
13
+
14
+ parser.add_argument("--batch_size", type=int, default=1, help="Batch size for the model.")
15
+ parser.add_argument("--height", type=int, default=768, help="Height of the input image.")
16
+ parser.add_argument("--width", type=int, default=1024, help="Width of the input image.")
17
+
18
+ parser.add_argument("--num_iterations", type=int, default=200, help="Number of iterations to run the model.")
19
+ parser.add_argument("--num_warmup", type=int, default=20, help="Dispose of the first N iterations.")
20
+
21
+ parser.add_argument("--device", type=str, choices=["cpu", "cuda", "mps"], help="Device to run the model on. Options are 'cpu', 'cuda', or 'mps'.")
22
+ parser.add_argument("--amp", action="store_true", help="Enable autocast mixed precision (fp16/bf16).")
23
+ parser.add_argument("--half", action="store_true", help="Use half precision for the model.")
24
+ parser.add_argument("--channels_last", action="store_true", help="Use NHWC memory format (recommended for CUDA).")
25
+ parser.add_argument("--compile", action="store_true", help="Enable torch.compile if available.")
26
+ parser.add_argument("--threads", type=int, default=None, help="torch.set_num_threads(threads) for CPU")
27
+ parser.add_argument("--sleep_time", type=float, default=0.0, help="Seconds to sleep after *each* iteration (cool-down).")
28
+
29
+ _normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
30
+
31
+
32
+ def _dummy_input(bs, h, w, device, half, channels_last):
33
+ x = torch.rand(bs, 3, h, w, device=device)
34
+ x = _normalize(x)
35
+ if half:
36
+ x = x.half()
37
+ if channels_last:
38
+ x = x.to(memory_format=torch.channels_last)
39
+ return x
40
+
41
+
42
+ def _maybe_sync(dev):
43
+ if dev.type == "cuda":
44
+ torch.cuda.synchronize()
45
+
46
+
47
+ @torch.inference_mode()
48
+ def benchmark(
49
+ model: torch.nn.Module,
50
+ inp: torch.Tensor,
51
+ warmup: int,
52
+ steps: int,
53
+ amp: bool,
54
+ sleep_time: float = 0.0
55
+ ):
56
+ cm = torch.autocast(device_type=inp.device.type) if amp else nullcontext()
57
+
58
+ # --- warm-up ---
59
+ for _ in range(warmup):
60
+ with cm:
61
+ _ = model(inp)
62
+ _maybe_sync(inp.device)
63
+
64
+ # --- timed loop ---
65
+ total_time = 0.0
66
+ for _ in range(steps):
67
+ tic = time.perf_counter()
68
+ with cm:
69
+ _ = model(inp)
70
+
71
+ toc = time.perf_counter()
72
+ total_time += toc - tic
73
+
74
+ if sleep_time > 0:
75
+ time.sleep(sleep_time)
76
+
77
+ _maybe_sync(inp.device)
78
+
79
+ fps = steps / total_time
80
+ return fps, total_time / steps
81
+
82
+
83
+ def main(args):
84
+ assert os.path.isfile(args.model_info_path), \
85
+ f"{args.model_info_path} not found"
86
+
87
+ model = get_model(model_info_path=args.model_info_path)
88
+ model.eval()
89
+
90
+ if args.channels_last:
91
+ model = model.to(memory_format=torch.channels_last)
92
+ if args.half:
93
+ model = model.half()
94
+
95
+ device = torch.device(args.device)
96
+ model = model.to(device)
97
+
98
+ if args.compile and hasattr(torch, "compile"):
99
+ model = torch.compile(model, mode="reduce-overhead")
100
+
101
+ if args.threads:
102
+ torch.set_num_threads(args.threads)
103
+ torch.set_num_interop_threads(1)
104
+
105
+ inp = _dummy_input(
106
+ args.batch_size,
107
+ args.height,
108
+ args.width,
109
+ device,
110
+ args.half,
111
+ args.channels_last
112
+ )
113
+
114
+ fps, t_avg = benchmark(
115
+ model,
116
+ inp,
117
+ warmup=args.num_warmup,
118
+ steps=args.num_iterations,
119
+ amp=args.amp,
120
+ sleep_time=args.sleep_time
121
+ )
122
+
123
+ cfg = vars(args)
124
+ cfg.pop("model_info_path")
125
+ print(json.dumps(cfg, indent=2))
126
+ print(f"\nAverage latency: {t_avg*1000:6.2f} ms | FPS: {fps:,.2f}")
127
+
128
+
129
+ if __name__ == "__main__":
130
+ main(parser.parse_args())
131
+
132
+
133
+ # CUDA @FP16 + channels_last + torch.compile
134
+ # python efficiency.py \
135
+ # --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
136
+ # --device cuda --half --amp --channels_last --compile
137
+
138
+ # CUDA @AMP + channels_last + torch.compile
139
+ # python efficiency.py \
140
+ # --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
141
+ # --device cuda --amp --channels_last --compile
142
+
143
+ # CUDA @FP32 + channels_last + torch.compile
144
+ # python efficiency.py \
145
+ # --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
146
+ # --device cuda --channels_last --compile
147
+
148
+ # AMD 5900X (12 Core) + channels_last + torch.compile
149
+ # export OMP_NUM_THREADS=12; export MKL_NUM_THREADS=12
150
+ # python efficiency.py \
151
+ # --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
152
+ # --device cpu --threads 12 --channels_last --compile
153
+
154
+ # Apple M1 Pro (6 Performance Cores). Compiling makes it slower.
155
+ # export OMP_NUM_THREADS=6; export VECLIB_MAXIMUM_THREADS=6
156
+ # python efficiency.py \
157
+ # --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
158
+ # --device cpu --threads 6
159
+
160
+ # Apple M1 Pro MPS @FP32 + torch.compile
161
+ # python efficiency.py \
162
+ # --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
163
+ # --device mps --channels_last --compile
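
For quick programmatic checks, `benchmark()` can also be driven outside the CLI. A minimal sketch, assuming the repository root is on `PYTHONPATH` and using a toy convolution in place of a real EBC checkpoint (the toy model and sizes below are placeholders, not part of the repo):

```python
import torch
from torch import nn

from efficiency import benchmark, _dummy_input  # repository root on PYTHONPATH

# Toy stand-in model; swap in get_model(model_info_path=...) for a real checkpoint.
toy_model = nn.Conv2d(3, 1, kernel_size=3, padding=1).eval()

device = torch.device("cpu")
inp = _dummy_input(bs=1, h=256, w=256, device=device, half=False, channels_last=False)

fps, latency = benchmark(toy_model, inp, warmup=2, steps=10, amp=False)
print(f"~{latency * 1000:.2f} ms/iter, ~{fps:.1f} FPS (toy model, CPU)")
```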
evaluate.py ADDED
@@ -0,0 +1,84 @@
1
+ import torch
2
+ from torch.amp import autocast
3
+ import torch.nn.functional as F
4
+ import torch.distributed as dist
5
+ from torch import nn, Tensor
6
+ from torch.utils.data import DataLoader
7
+ from typing import Tuple, Optional
8
+ from tqdm import tqdm
9
+ import numpy as np
10
+
11
+ from utils import sliding_window_predict, barrier, calculate_errors
12
+
13
+
14
+ def evaluate(
15
+ model: nn.Module,
16
+ data_loader: DataLoader,
17
+ sliding_window: bool,
18
+ max_input_size: int = 4096,
19
+ window_size: int = 224,
20
+ stride: int = 224,
21
+ max_num_windows: int = 64,
22
+ device: torch.device = torch.device("cuda"),
23
+ amp: bool = False,
24
+ local_rank: int = 0,
25
+ nprocs: int = 1,
26
+ progress_bar: bool = True,
27
+ ) -> Tuple[Tensor, Tensor]:
28
+ ddp = nprocs > 1
29
+ model = model.to(device)
30
+ model.eval()
31
+ pred_counts, gt_counts = [], []
32
+ data_iter = tqdm(data_loader) if (local_rank == 0 and progress_bar) else data_loader
33
+
34
+ for image, gt_points, _ in data_iter:
35
+ image = image.to(device)
36
+ image_height, image_width = image.shape[-2:]
37
+ gt_counts.extend([len(p) for p in gt_points])
38
+
39
+ # Resize image if it's smaller than the window size
40
+ aspect_ratio = image_width / image_height
41
+ if image_height < window_size:
42
+ new_height = window_size
43
+ new_width = int(new_height * aspect_ratio)
44
+ image = F.interpolate(image, size=(new_height, new_width), mode="bicubic", align_corners=False)
45
+ image_height, image_width = new_height, new_width
46
+ if image_width < window_size:
47
+ new_width = window_size
48
+ new_height = int(new_width / aspect_ratio)
49
+ image = F.interpolate(image, size=(new_height, new_width), mode="bicubic", align_corners=False)
50
+ image_height, image_width = new_height, new_width
51
+
52
+ with torch.set_grad_enabled(False), autocast(device_type="cuda", enabled=amp):
53
+ if sliding_window or (image_height * image_width) > max_input_size ** 2:
54
+ pred_den_maps = sliding_window_predict(model, image, window_size, stride, max_num_windows)
55
+ else:
56
+ pred_den_maps = model(image)
57
+
58
+ pred_counts.extend(pred_den_maps.sum(dim=(-1, -2, -3)).cpu().numpy().tolist())
59
+
60
+ barrier(ddp)
61
+ assert len(pred_counts) == len(gt_counts), f"Length of predictions and ground truths should be equal, but got {len(pred_counts)} and {len(gt_counts)}"
62
+
63
+ if ddp:
64
+ pred_counts, gt_counts = torch.tensor(pred_counts, device=device), torch.tensor(gt_counts, device=device)
65
+ # Pad `pred_counts` and `gt_counts` to the same length across all processes.
66
+ local_length = torch.tensor([len(pred_counts)], device=device)
67
+ lengths = [torch.zeros_like(local_length) for _ in range(nprocs)]
68
+ dist.all_gather(lengths, local_length)
69
+ max_length = max([l.item() for l in lengths])
70
+ padded_pred_counts, padded_gt_counts = torch.full((max_length,), float("nan"), device=device), torch.full((max_length,), float("nan"), device=device)
71
+ padded_pred_counts[:len(pred_counts)], padded_gt_counts[:len(gt_counts)] = pred_counts, gt_counts
72
+ gathered_pred_counts, gathered_gt_counts = [torch.zeros_like(padded_pred_counts) for _ in range(nprocs)], [torch.zeros_like(padded_gt_counts) for _ in range(nprocs)]
73
+ dist.all_gather(gathered_pred_counts, padded_pred_counts)
74
+ dist.all_gather(gathered_gt_counts, padded_gt_counts)
75
+ # Concatenate predictions and ground truths from all processes and remove padding (nan values).
76
+ pred_counts, gt_counts = torch.cat(gathered_pred_counts).cpu(), torch.cat(gathered_gt_counts).cpu()
77
+ pred_counts, gt_counts = pred_counts[~torch.isnan(pred_counts)], gt_counts[~torch.isnan(gt_counts)]
78
+ pred_counts, gt_counts = pred_counts.numpy(), gt_counts.numpy()
79
+
80
+ else:
81
+ pred_counts, gt_counts = np.array(pred_counts), np.array(gt_counts)
82
+
83
+ torch.cuda.empty_cache()
84
+ return calculate_errors(pred_counts, gt_counts)
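
`sliding_window_predict`, `barrier` and `calculate_errors` are imported from the top-level `utils` package added in this commit. For orientation, a minimal sketch of the kind of count metrics a `calculate_errors`-style function computes; the actual keys and metrics in `utils` may differ:

```python
import numpy as np

def count_errors_sketch(pred_counts: np.ndarray, gt_counts: np.ndarray) -> dict:
    """Typical crowd-counting count metrics: MAE, RMSE, and normalized absolute error."""
    err = pred_counts - gt_counts
    return {
        "mae": float(np.abs(err).mean()),
        "rmse": float(np.sqrt((err ** 2).mean())),
        "nae": float((np.abs(err) / np.maximum(gt_counts, 1)).mean()),
    }

# Three images: predicted counts vs. ground-truth counts.
print(count_errors_sketch(np.array([102.0, 48.5, 7.2]), np.array([100.0, 50.0, 8.0])))
```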
losses/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ from .loss import QuadLoss
2
+ from .bregman_pytorch import sinkhorn
3
+
4
+ __all__ = [
5
+ "QuadLoss",
6
+ "sinkhorn",
7
+ ]
losses/bregman_pytorch.py ADDED
@@ -0,0 +1,70 @@
1
+ # Code modified from https://github.com/cvlab-stonybrook/DM-Count/blob/master/losses/bregman_pytorch.py
2
+ import torch
3
+ from torch.amp import autocast
4
+ from torch import Tensor
5
+ from typing import Union, Tuple, Dict
6
+
7
+ M_EPS = 1e-16
8
+
9
+
10
+ @torch.no_grad()
11
+ @autocast(device_type="cuda", enabled=True, dtype=torch.float32)
12
+ def sinkhorn(
13
+ a: Tensor,
14
+ b: Tensor,
15
+ C: Tensor,
16
+ reg: float = 1e-1,
17
+ maxIter: int = 1000,
18
+ stopThr: float = 1e-9,
19
+ verbose: bool = False,
20
+ log: bool = True,
21
+ eval_freq: int = 10,
22
+ print_freq: int = 200,
23
+ ) -> Union[Tensor, Tuple[Tensor, Dict[str, Tensor]]]:
24
+ device = a.device
25
+ na, nb = C.shape
26
+ assert na == a.shape[0] and nb == b.shape[0], f"Shapes of a ({a.shape}) or b ({b.shape}) do not match that of C ({C.shape})"
27
+ assert reg > 0, f"reg should be greater than 0. Found reg = {reg}"
28
+ assert a.min() >= 0. and b.min() >= 0., f"Elements in a and b should be nonnegative. Found a.min() = {a.min()}, b.min() = {b.min()}"
29
+
30
+ if log:
31
+ log = {"err": []}
32
+
33
+ u = torch.ones(na, dtype=a.dtype, device=device) / na
34
+ v = torch.ones(nb, dtype=b.dtype, device=device) / nb
35
+ K = torch.exp(-C / reg)
36
+
37
+ it, err = 1, 1
38
+ while (err > stopThr and it <= maxIter):
39
+ u_pre, v_pre = u.clone(), v.clone()
40
+ KTu = torch.matmul(K.T, u)
41
+ v = b / (KTu + M_EPS)
42
+ Kv = torch.matmul(K, v)
43
+ u = a / (Kv + M_EPS)
44
+
45
+ if torch.any(torch.isnan(u)) or torch.any(torch.isnan(v)) or torch.any(torch.isinf(u)) or torch.any(torch.isinf(v)):
46
+ print("Warning: numerical errors at iteration", it)
47
+ u, v = u_pre, v_pre
48
+ break
49
+
50
+ if log and it % eval_freq == 0:
51
+ b_hat = torch.matmul(u, K) * v
52
+ err = (b - b_hat).pow(2).sum().item()
53
+ log["err"].append(err)
54
+
55
+ if verbose and it % print_freq == 0:
56
+ print(f"Iteration {it}, constraint error {err}")
57
+
58
+ it += 1
59
+
60
+ if log:
61
+ log["u"] = u
62
+ log["v"] = v
63
+ log["alpha"] = reg * torch.log(u + M_EPS)
64
+ log["beta"] = reg * torch.log(v + M_EPS)
65
+
66
+ P = u.view(-1, 1) * K * v.view(1, -1)
67
+ if log:
68
+ return P, log
69
+ else:
70
+ return P
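
A minimal sketch of exercising `sinkhorn` on a toy transport problem, assuming the repository root is on `PYTHONPATH` (on a CPU-only machine the `cuda` autocast decorator typically just warns and disables itself):

```python
import torch
from losses.bregman_pytorch import sinkhorn  # repository root on PYTHONPATH

# Toy problem: move mass from 3 sources to 4 targets under a random nonnegative cost.
a = torch.tensor([0.5, 0.3, 0.2])           # source distribution (sums to 1)
b = torch.tensor([0.25, 0.25, 0.25, 0.25])  # target distribution (sums to 1)
C = torch.rand(3, 4)                        # cost matrix of shape [len(a), len(b)]

P, log = sinkhorn(a, b, C, reg=0.1, maxIter=500, log=True)
print(P.shape)            # transport plan, torch.Size([3, 4])
print(P.sum().item())     # total transported mass, ~1.0
print(log["beta"].shape)  # dual variable consumed by the OT loss, torch.Size([4])
```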
losses/dm_loss.py ADDED
@@ -0,0 +1,142 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ from torch.amp import autocast
4
+ from typing import List, Tuple, Dict
5
+
6
+ from .bregman_pytorch import sinkhorn
7
+ from .utils import _reshape_density
8
+
9
+ EPS = 1e-8
10
+
11
+
12
+ class OTLoss(nn.Module):
13
+ def __init__(
14
+ self,
15
+ input_size: int,
16
+ block_size: int,
17
+ numItermax: int = 100,
18
+ regularization: float = 10.0
19
+ ) -> None:
20
+ super().__init__()
21
+ assert input_size % block_size == 0
22
+
23
+ self.input_size = input_size
24
+ self.block_size = block_size
25
+ self.num_blocks_h = input_size // block_size
26
+ self.num_blocks_w = input_size // block_size
27
+ self.numItermax = numItermax
28
+ self.regularization = regularization
29
+
30
+ # coordinate is same to image space, set to constant since crop size is same
31
+ self.coords_h = torch.arange(0, input_size, step=block_size, dtype=torch.float32) + block_size / 2
32
+ self.coords_w = torch.arange(0, input_size, step=block_size, dtype=torch.float32) + block_size / 2
33
+ self.coords_h, self.coords_w = self.coords_h.unsqueeze(0), self.coords_w.unsqueeze(0) # [1, #coordinates]
34
+
35
+ def set_numItermax(self, numItermax: int) -> None:
36
+ self.numItermax = numItermax
37
+
38
+ @autocast(device_type="cuda", enabled=True, dtype=torch.float32) # avoid numerical instability
39
+ def forward(self, pred_den_map: Tensor, pred_den_map_normed: Tensor, gt_points: List[Tensor]) -> Tuple[Tensor, Tensor, Tensor]:
40
+ assert pred_den_map.shape[1:] == pred_den_map_normed.shape[1:] == (1, self.num_blocks_h, self.num_blocks_w), f"Expected pred_den_map to have shape (B, 1, {self.num_blocks_h}, {self.num_blocks_w}), but got {pred_den_map.shape} and {pred_den_map_normed.shape}"
41
+ assert len(gt_points) == pred_den_map.shape[0] == pred_den_map_normed.shape[0], f"Expected gt_points to have length {pred_den_map_normed.shape[0]}, but got {len(gt_points)}"
42
+ device = pred_den_map.device
43
+
44
+ loss = torch.zeros(1, device=device)
45
+ ot_obj_values = torch.zeros(1, device=device)
46
+ w_dist = torch.zeros(1, device=device) # Wasserstein distance
47
+ coords_h, coords_w = self.coords_h.to(device), self.coords_w.to(device) # [1, #coordinates]
48
+ for idx, points in enumerate(gt_points):
49
+ if len(points) > 0:
50
+ # compute l2 square distance, it should be source target distance. [#gt, #coordinates * #coordinates]
51
+ x, y = points[:, 0].unsqueeze(1), points[:, 1].unsqueeze(1) # [#gt, 1]
52
+ x_dist = -2 * torch.matmul(x, coords_w) + x * x + coords_w * coords_w # [#gt, #coordinates]
53
+ y_dist = -2 * torch.matmul(y, coords_h) + y * y + coords_h * coords_h # [#gt, #coordinates]
54
+ dist = x_dist.unsqueeze(1) + y_dist.unsqueeze(2)
55
+ dist = dist.view((dist.shape[0], -1)) # size of [#gt, #coordinates * #coordinates]
56
+
57
+ source_prob = pred_den_map_normed[idx].view(-1).detach()
58
+ target_prob = (torch.ones(len(points)) / len(points)).to(device)
59
+ # use sinkhorn to solve OT, compute optimal beta.
60
+ P, log = sinkhorn(
61
+ a=target_prob,
62
+ b=source_prob,
63
+ C=dist,
64
+ reg=self.regularization,
65
+ maxIter=self.numItermax,
66
+ log=True
67
+ )
68
+ beta = log["beta"] # size is the same as source_prob: [#coordinates * #coordinates]
69
+ w_dist += (dist * P).sum()
70
+ ot_obj_values += (pred_den_map_normed[idx] * beta.view(1, self.num_blocks_h, self.num_blocks_w)).sum()
71
+ # compute the gradient of OT loss to predicted density (pred_den_map).
72
+ # im_grad = beta / source_count - < beta, source_density> / (source_count)^2
73
+ source_density = pred_den_map[idx].view(-1).detach()
74
+ source_count = source_density.sum()
75
+ gradient_1 = source_count / (source_count * source_count + EPS) * beta # size of [#coordinates * #coordinates]
76
+ gradient_2 = (source_density * beta).sum() / (source_count * source_count + EPS) # size of 1
77
+ gradient = gradient_1 - gradient_2
78
+ gradient = gradient.detach().view(1, self.num_blocks_h, self.num_blocks_w)
79
+ # Define loss = <im_grad, predicted density>. The gradient of loss w.r.t predicted density is im_grad.
80
+ loss += torch.sum(pred_den_map[idx] * gradient)
81
+
82
+ return loss, w_dist, ot_obj_values
83
+
84
+
85
+ class DMLoss(nn.Module):
86
+ def __init__(
87
+ self,
88
+ input_size: int,
89
+ block_size: int,
90
+ numItermax: int = 100,
91
+ regularization: float = 10.0,
92
+ weight_ot: float = 0.1,
93
+ weight_tv: float = 0.01,
94
+ weight_cnt: float = 1.0,
95
+ ) -> None:
96
+ super().__init__()
97
+ self.input_size = input_size
98
+ self.block_size = block_size
99
+ self.weight_ot = weight_ot
100
+ self.weight_tv = weight_tv
101
+ self.weight_cnt = weight_cnt
102
+
103
+ self.ot_loss = OTLoss(
104
+ input_size=self.input_size,
105
+ block_size=self.block_size,
106
+ numItermax=numItermax,
107
+ regularization=regularization,
108
+ )
109
+ self.tv_loss = nn.L1Loss(reduction="none")
110
+ self.cnt_loss = nn.L1Loss(reduction="mean")
111
+ self.weight_ot = weight_ot
112
+ self.weight_tv = weight_tv
113
+
114
+ @autocast(device_type="cuda", enabled=True, dtype=torch.float32) # avoid numerical instability
115
+ def forward(self, pred_den_map: Tensor, gt_den_map: Tensor, gt_points: List[Tensor]) -> Tuple[Tensor, Dict[str, Tensor]]:
116
+ gt_den_map = _reshape_density(gt_den_map, block_size=self.ot_loss.block_size) if gt_den_map.shape[-2:] != pred_den_map.shape[-2:] else gt_den_map
117
+ assert pred_den_map.shape == gt_den_map.shape, f"Expected pred_den_map and gt_den_map to have the same shape, got {pred_den_map.shape} and {gt_den_map.shape}"
118
+
119
+ pred_cnt = pred_den_map.view(pred_den_map.shape[0], -1).sum(dim=1)
120
+ pred_den_map_normed = pred_den_map / (pred_cnt.view(-1, 1, 1, 1) + EPS)
121
+ gt_cnt = torch.tensor([len(p) for p in gt_points], dtype=torch.float32).to(pred_den_map.device)
122
+ gt_den_map_normed = gt_den_map / (gt_cnt.view(-1, 1, 1, 1) + EPS)
123
+
124
+ ot_loss, w_dist, _ = self.ot_loss(pred_den_map, pred_den_map_normed, gt_points)
125
+
126
+ tv_loss = (self.tv_loss(pred_den_map_normed, gt_den_map_normed).sum(dim=(1, 2, 3)) * gt_cnt).mean() if self.weight_tv > 0 else 0
127
+
128
+ cnt_loss = self.cnt_loss(pred_cnt, gt_cnt) if self.weight_cnt > 0 else 0
129
+
130
+ loss = ot_loss * self.weight_ot + tv_loss * self.weight_tv + cnt_loss * self.weight_cnt
131
+
132
+ loss_info = {
133
+ "ot_loss": ot_loss.detach(),
134
+ "dm_loss": loss.detach(),
135
+ "w_dist": w_dist.detach(),
136
+ }
137
+ if self.weight_tv > 0:
138
+ loss_info["tv_loss"] = tv_loss.detach()
139
+ if self.weight_cnt > 0:
140
+ loss_info["cnt_loss"] = cnt_loss.detach()
141
+
142
+ return loss, loss_info
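
A minimal sketch of driving `DMLoss` with random inputs; `input_size=256` and `block_size=8` are illustrative values, not the training configuration:

```python
import torch
from losses.dm_loss import DMLoss  # repository root on PYTHONPATH

input_size, block_size = 256, 8     # 256 / 8 = 32 blocks per side
criterion = DMLoss(input_size=input_size, block_size=block_size)

B = 2
pred_den_map = torch.rand(B, 1, 32, 32)                        # blockwise predicted density
gt_den_map = torch.rand(B, 1, input_size, input_size) * 1e-3   # pixel-level GT, folded to blocks internally
gt_points = [torch.rand(10, 2) * input_size,                   # (x, y) head locations per image
             torch.rand(5, 2) * input_size]

loss, info = criterion(pred_den_map, gt_den_map, gt_points)
print(loss.item(), sorted(info.keys()))
```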
losses/dual_loss.py ADDED
@@ -0,0 +1,175 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ import torch.nn.functional as F
4
+ from typing import List, Tuple, Dict
5
+
6
+ from .dm_loss import DMLoss
7
+ from .multiscale_mae import MultiscaleMAE
8
+ from .utils import _reshape_density
9
+
10
+
11
+
12
+ class DualLoss(nn.Module):
13
+ def __init__(
14
+ self,
15
+ input_size: int,
16
+ block_size: int,
17
+ bins: List[Tuple[float, float]],
18
+ bin_centers: List[float],
19
+ cls_loss: str = "ce",
20
+ reg_loss: str = "dm",
21
+ weight_tv: float = 0.01,
22
+ weight_cls: float = 0.1,
23
+ weight_reg: float = 0.1,
24
+ numItermax: int = 100,
25
+ regularization: float = 10.0,
26
+ scales: List[int] = [1, 2, 4],
27
+ min_scale_weight: float = 0.25,
28
+ max_scale_weight: float = 0.75,
29
+ alpha: float = 0.5,
30
+ ) -> None:
31
+ super().__init__()
32
+ assert len(bins) == len(bin_centers) >= 2, f"Expected bins and bin_centers to have at least 2 elements, got {len(bins)} and {len(bin_centers)}"
33
+ assert all([len(b) == 2 for b in bins]), f"Expected all bins to be of length 2, got {bins}"
34
+ assert all(b[0] <= p <= b[1] for b, p in zip(bins, bin_centers)), f"Expected bin_centers to be within the range of the corresponding bin, got {bins} and {bin_centers}"
35
+ assert cls_loss in ["ce", "mae", "mse", "none"], f"Expected cls_loss to be one of ['ce', 'mae', 'mse', 'none'], got {cls_loss}"
36
+ assert reg_loss in ["dm", "msmae", "mae", "mse", "none"], f"Expected reg_loss to be one of ['dm', 'msmae', 'mae', 'mse', 'none'], got {reg_loss}"
37
+ assert not (cls_loss == "none" and reg_loss == "none"), "Expected at least one of cls_loss and reg_loss to be provided"
38
+ assert weight_cls is None or weight_cls >= 0, f"Expected weight_cls to be non-negative, got {weight_cls}"
39
+ assert weight_reg is None or weight_reg >= 0, f"Expected weight_reg to be non-negative, got {weight_reg}"
40
+ assert weight_tv is None or weight_tv >= 0, f"Expected weight_tv to be non-negative, got {weight_tv}"
41
+ assert min_scale_weight is None or max_scale_weight is None or max_scale_weight >= min_scale_weight > 0, f"Expected max_scale_weight to be greater than or equal to min_scale_weight, got {min_scale_weight} and {max_scale_weight}"
42
+ assert alpha is None or 1 > alpha > 0, f"Expected alpha to be between 0 and 1, got {alpha}"
43
+
44
+ if reg_loss == "dm":
45
+ assert numItermax is not None and numItermax > 0, f"Expected numItermax to be a positive integer, got {numItermax}"
46
+ assert regularization is not None and regularization > 0, f"Expected regularization to be a positive float, got {regularization}"
47
+ assert weight_tv is not None and weight_tv >= 0, f"Expected weight_tv to be non-negative, got {weight_tv}"
48
+ else:
49
+ weight_tv, numItermax, regularization = None, None, None
50
+
51
+ if reg_loss == "msmae":
52
+ assert isinstance(scales, (list, tuple)) and len(scales) > 0 and all(isinstance(s, int) and s > 0 for s in scales), f"Expected scales to be a list of positive integers, got {scales}"
53
+ assert max_scale_weight >= min_scale_weight > 0, f"Expected max_scale_weight to be greater than or equal to min_scale_weight, got {min_scale_weight} and {max_scale_weight}"
54
+ assert 1 > alpha > 0, f"Expected alpha to be between 0 and 1, got {alpha}"
55
+ else:
56
+ scales = None
57
+ min_scale_weight, max_scale_weight = None, None
58
+ alpha = None
59
+
60
+ weight_cls = weight_cls if weight_cls is not None else 0
61
+ weight_reg = weight_reg if weight_reg is not None else 0
62
+
63
+ self.input_size, self.block_size = input_size, block_size
64
+ self.num_blocks_h, self.num_blocks_w = input_size // block_size, input_size // block_size
65
+ self.bins, self.bin_centers, self.num_bins = bins, bin_centers, len(bins)
66
+ self.cls_loss, self.reg_loss = cls_loss, reg_loss
67
+ self.weight_cls, self.weight_reg = weight_cls, weight_reg
68
+ self.numItermax, self.regularization = numItermax, regularization
69
+ self.weight_tv = weight_tv
70
+ self.scales = scales
71
+ self.min_scale_weight, self.max_scale_weight = min_scale_weight, max_scale_weight
72
+
73
+ if cls_loss == "ce":
74
+ self.cls_loss_fn = nn.CrossEntropyLoss(reduction="none")
75
+ self.weight_cls = 1.0
76
+ elif cls_loss == "mae":
77
+ self.cls_loss_fn = nn.L1Loss(reduction="none")
78
+ self.weight_cls = weight_cls
79
+ elif cls_loss == "mse":
80
+ self.cls_loss_fn = nn.MSELoss(reduction="none")
81
+ self.weight_cls = weight_cls
82
+ else: # cls_loss == "none"
83
+ self.cls_loss_fn = None
84
+ self.weight_cls = 0
85
+
86
+ if reg_loss == "dm":
87
+ self.reg_loss_fn = DMLoss(
88
+ input_size=input_size,
89
+ block_size=block_size,
90
+ numItermax=numItermax,
91
+ regularization=regularization,
92
+ weight_ot=weight_reg,
93
+ weight_tv=weight_tv,
94
+ weight_cnt=0, # Calculate the count loss separately
95
+ )
96
+ self.weight_reg = 1.0
97
+ elif reg_loss == "msmae":
98
+ self.reg_loss_fn = MultiscaleMAE(scales=scales, weights=None, min_scale_weight=min_scale_weight, max_scale_weight=max_scale_weight, alpha=alpha)
99
+ self.weight_reg = 1.0
100
+ elif reg_loss == "mae":
101
+ self.reg_loss_fn = nn.L1Loss(reduction="none")
102
+ self.weight_reg = weight_reg
103
+ elif reg_loss == "mse":
104
+ self.reg_loss_fn = nn.MSELoss(reduction="none")
105
+ self.weight_reg = weight_reg
106
+ else:
107
+ self.reg_loss_fn = None
108
+ self.weight_reg = 0
109
+
110
+ self.cnt_loss_fn = nn.L1Loss(reduction="none")
111
+
112
+ def _bin_count(self, density_map: Tensor) -> Tensor:
113
+ class_map = torch.zeros_like(density_map, dtype=torch.long)
114
+ for idx, (low, high) in enumerate(self.bins):
115
+ mask = (density_map >= low) & (density_map <= high)
116
+ class_map[mask] = idx
117
+ return class_map.squeeze(1) # remove channel dimension
118
+
119
+ def forward(
120
+ self,
121
+ pred_logit_map: Tensor,
122
+ pred_den_map: Tensor,
123
+ gt_den_map: Tensor,
124
+ gt_points: List[Tensor]
125
+ ) -> Tuple[Tensor, Dict[str, Tensor]]:
126
+ B = pred_logit_map.shape[0]
127
+ assert pred_logit_map.shape == (B, self.num_bins, self.num_blocks_h, self.num_blocks_w), f"Expected pred_logit_map to have shape {B, self.num_bins, self.num_blocks_h, self.num_blocks_w}, got {pred_logit_map.shape}"
128
+ if gt_den_map.shape[-2:] != (self.num_blocks_h, self.num_blocks_w):
129
+ assert gt_den_map.shape[-2:] == (self.input_size, self.input_size), f"Expected gt_den_map to have shape {B, 1, self.input_size, self.input_size}, got {gt_den_map.shape}"
130
+ gt_den_map = _reshape_density(gt_den_map, block_size=self.block_size)
131
+ assert pred_den_map.shape == gt_den_map.shape == (B, 1, self.num_blocks_h, self.num_blocks_w), f"Expected pred_den_map and gt_den_map to have shape (B, 1, H, W), got {pred_den_map.shape} and {gt_den_map.shape}"
132
+ assert len(gt_points) == B, f"Expected gt_points to have length B, got {len(gt_points)}"
133
+
134
+ loss_info = {}
135
+
136
+ if self.weight_cls > 0:
137
+ gt_class_map = self._bin_count(gt_den_map)
138
+ if self.cls_loss == "ce":
139
+ cls_loss = self.cls_loss_fn(pred_logit_map, gt_class_map).sum(dim=(-1, -2)).mean()
140
+ loss_info["cls_ce_loss"] = cls_loss.detach()
141
+ else: # self.cls_loss in ["mae", "mse"]
142
+ gt_prob_map = F.one_hot(gt_class_map, num_classes=self.num_bins).float() # B, H, W -> B, H, W, N
143
+ gt_prob_map = gt_prob_map.permute(0, 3, 1, 2) # B, H, W, N -> B, N, H, W
144
+ pred_prob_map = pred_logit_map.softmax(dim=1)
145
+ cls_loss = self.cls_loss_fn(pred_prob_map, gt_prob_map).sum(dim=(-1, -2)).mean()
146
+ loss_info[f"cls_{self.cls_loss}_loss"] = cls_loss.detach()
147
+ else:
148
+ cls_loss = 0
149
+
150
+ if self.weight_reg > 0:
151
+ if self.reg_loss == "dm":
152
+ reg_loss, reg_loss_info = self.reg_loss_fn(
153
+ pred_den_map=pred_den_map,
154
+ gt_den_map=gt_den_map,
155
+ gt_points=gt_points,
156
+ )
157
+ loss_info.update({f"reg_{k}": v for k, v in reg_loss_info.items()})
158
+ elif self.reg_loss == "msmae":
159
+ reg_loss, reg_loss_info = self.reg_loss_fn(pred_den_map, gt_den_map)
160
+ loss_info.update({f"reg_{k}": v for k, v in reg_loss_info.items()})
161
+ else: # self.reg_loss in ["mae", "mse"]
162
+ reg_loss = self.reg_loss_fn(pred_den_map, gt_den_map).sum(dim=(-1, -2)).mean()
163
+ loss_info[f"reg_{self.reg_loss}_loss"] = reg_loss.detach()
164
+ else:
165
+ reg_loss = 0
166
+
167
+ gt_cnt = torch.tensor([len(p) for p in gt_points], dtype=torch.float32, device=pred_den_map.device)
168
+ cnt_loss = self.cnt_loss_fn(pred_den_map.sum(dim=(1, 2, 3)), gt_cnt).mean()
169
+ loss_info["cnt_loss"] = cnt_loss.detach()
170
+
171
+ total_loss = self.weight_cls * cls_loss + self.weight_reg * reg_loss + cnt_loss
172
+ loss_info["total_loss"] = total_loss.detach()
173
+ loss_info = dict(sorted(loss_info.items())) # sort by key for nicer printing
174
+
175
+ return total_loss, loss_info
losses/loss.py ADDED
@@ -0,0 +1,204 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ import torch.nn.functional as F
4
+ from typing import List, Dict, Optional, Tuple, Union
5
+
6
+ from .dm_loss import DMLoss
7
+ from .multiscale_mae import MultiscaleMAE
8
+ from .poisson_nll import PoissonNLL
9
+ from .zero_inflated_poisson_nll import ZIPoissonNLL, ZICrossEntropy
10
+ from .utils import _reshape_density, _bin_count
11
+
12
+
13
+ EPS = 1e-8
14
+
15
+
16
+ class QuadLoss(nn.Module):
17
+ def __init__(
18
+ self,
19
+ input_size: int,
20
+ block_size: int,
21
+ bins: List[Tuple[float, float]],
22
+ reg_loss: str = "zipnll",
23
+ aux_loss: str = "none",
24
+ weight_cls: float = 1.0,
25
+ weight_reg: float = 1.0,
26
+ weight_aux: Optional[float] = None,
27
+ numItermax: Optional[int] = 100,
28
+ regularization: Optional[float] = 10.0,
29
+ scales: Optional[List[int]] = [1, 2, 4],
30
+ min_scale_weight: Optional[float] = 0.0,
31
+ max_scale_weight: Optional[float] = 1.0,
32
+ alpha: Optional[float] = 0.5,
33
+ ) -> None:
34
+ super().__init__()
35
+ assert input_size % block_size == 0, f"Expected input_size to be divisible by block_size, got {input_size} and {block_size}"
36
+ assert len(bins) >= 2, f"Expected bins to have at least 2 elements, got {len(bins)}"
37
+ assert all([len(b) == 2 for b in bins]), f"Expected all bins to be of length 2, got {bins}"
38
+ bins = [(float(low), float(high)) for low, high in bins]
39
+ assert all([b[0] <= b[1] for b in bins]), f"Expected each bin to have bin[0] <= bin[1], got {bins}"
40
+ assert reg_loss in ["zipnll", "pnll", "dm", "msmae", "mae", "mse"], f"Expected reg_loss to be one of ['zipnll', 'pnll', 'dm', 'msmae', 'mae', 'mse'], got {reg_loss}"
41
+ assert aux_loss in ["zipnll", "pnll", "dm", "msmae", "mae", "mse", "none"], f"Expected aux_loss to be one of ['zipnll', 'pnll', 'dm', 'msmae', 'mae', 'mse', 'none'], got {aux_loss}"
42
+
43
+ assert weight_cls >= 0, f"Expected weight_cls to be non-negative, got {weight_cls}"
44
+ assert weight_reg >= 0, f"Expected weight_reg to be non-negative, got {weight_reg}"
45
+ assert not (weight_cls == 0 and weight_reg == 0), "Expected at least one of weight_cls and weight_reg to be non-zero"
46
+ weight_aux = 0 if aux_loss == "none" or weight_aux is None else weight_aux
47
+ assert weight_aux >= 0, f"Expected weight_aux to be non-negative, got {weight_aux}"
48
+
49
+ self.input_size = input_size
50
+ self.block_size = block_size
51
+ self.bins = bins
52
+ self.reg_loss = reg_loss
53
+ self.aux_loss = aux_loss
54
+ self.weight_cls = weight_cls
55
+ self.weight_reg = weight_reg
56
+ self.weight_aux = weight_aux
57
+
58
+ self.num_bins = len(bins)
59
+ self.num_blocks_h = input_size // block_size
60
+ self.num_blocks_w = input_size // block_size
61
+
62
+ if reg_loss == "zipnll":
63
+ self.cls_loss = "zice"
64
+ self.cls_loss_fn = ZICrossEntropy(bins=bins, reduction="mean")
65
+ self.reg_loss_fn = ZIPoissonNLL(reduction="mean")
66
+ else:
67
+ self.cls_loss = "ce"
68
+ self.cls_loss_fn = nn.CrossEntropyLoss(reduction="none")
69
+ if reg_loss == "pnll":
70
+ self.reg_loss_fn = PoissonNLL(reduction="mean")
71
+ elif reg_loss == "dm":
72
+ assert numItermax is not None and numItermax > 0, f"Expected numItermax to be a positive integer, got {numItermax}"
73
+ assert regularization is not None and regularization > 0, f"Expected regularization to be a positive float, got {regularization}"
74
+ self.reg_loss_fn = DMLoss(
75
+ input_size=input_size,
76
+ block_size=block_size,
77
+ numItermax=numItermax,
78
+ regularization=regularization,
79
+ weight_ot=0.1,
80
+ weight_tv=0.01,
81
+ weight_cnt=0, # count loss will be calculated separately in this module.
82
+ )
83
+ elif reg_loss == "msmae":
84
+ assert isinstance(scales, (list, tuple)) and len(scales) > 0 and all(isinstance(s, int) and s > 0 for s in scales), f"Expected scales to be a list of positive integers, got {scales}"
85
+ assert max_scale_weight >= min_scale_weight >= 0, f"Expected max_scale_weight to be greater than or equal to min_scale_weight, got {min_scale_weight} and {max_scale_weight}"
86
+ assert 1 > alpha > 0, f"Expected alpha to be between 0 and 1, got {alpha}"
87
+ self.reg_loss_fn = MultiscaleMAE(
88
+ scales=sorted(scales),
89
+ min_scale_weight=min_scale_weight,
90
+ max_scale_weight=max_scale_weight,
91
+ alpha=alpha,
92
+ )
93
+ elif reg_loss == "mae":
94
+ self.reg_loss_fn = nn.L1Loss(reduction="none")
95
+ elif reg_loss == "mse":
96
+ self.reg_loss_fn = nn.MSELoss(reduction="none")
97
+ else: # reg_loss == "none"
98
+ self.reg_loss_fn = None
99
+
100
+ if aux_loss == "zipnll":
101
+ self.aux_loss_fn = ZIPoissonNLL(reduction="mean")
102
+ elif aux_loss == "pnll":
103
+ self.aux_loss_fn = PoissonNLL(reduction="mean")
104
+ elif aux_loss == "dm":
105
+ assert numItermax is not None and numItermax > 0, f"Expected numItermax to be a positive integer, got {numItermax}"
106
+ assert regularization is not None and regularization > 0, f"Expected regularization to be a positive float, got {regularization}"
107
+ self.aux_loss_fn = DMLoss(
108
+ input_size=input_size,
109
+ block_size=block_size,
110
+ numItermax=numItermax,
111
+ regularization=regularization,
112
+ weight_ot=0.1,
113
+ weight_tv=0.01,
114
+ weight_cnt=0, # count loss will be calculated separately in this module.
115
+ )
116
+ elif aux_loss == "msmae":
117
+ assert isinstance(scales, (list, tuple)) and len(scales) > 0 and all(isinstance(s, int) and s > 0 for s in scales), f"Expected scales to be a list of positive integers, got {scales}"
118
+ assert max_scale_weight >= min_scale_weight >= 0, f"Expected max_scale_weight to be greater than or equal to min_scale_weight, got {min_scale_weight} and {max_scale_weight}"
119
+ assert 1 > alpha > 0, f"Expected alpha to be between 0 and 1, got {alpha}"
120
+ self.aux_loss_fn = MultiscaleMAE(
121
+ scales=sorted(scales),
122
+ min_scale_weight=min_scale_weight,
123
+ max_scale_weight=max_scale_weight,
124
+ alpha=alpha,
125
+ )
126
+ elif aux_loss == "mae":
127
+ self.aux_loss_fn = nn.L1Loss(reduction="none")
128
+ elif aux_loss == "mse":
129
+ self.aux_loss_fn = nn.MSELoss(reduction="none")
130
+ else: # aux_loss == "none"
131
+ self.aux_loss_fn = None
132
+
133
+ self.cnt_loss_fn = nn.L1Loss(reduction="mean")
134
+
135
+ def forward(
136
+ self,
137
+ pred_logit_map: Tensor,
138
+ pred_den_map: Tensor,
139
+ gt_den_map: Tensor,
140
+ gt_points: List[Tensor],
141
+ pred_logit_pi_map: Optional[Tensor] = None,
142
+ pred_lambda_map: Optional[Tensor] = None,
143
+ ) -> Tuple[Tensor, Dict[str, Tensor]]:
144
+ B = pred_den_map.shape[0]
145
+ assert pred_logit_map.shape[-2:] == (self.num_blocks_h, self.num_blocks_w), f"Expected pred_logit_map to have the spatial dimension of {self.num_blocks_h}x{self.num_blocks_w}, got {pred_logit_map.shape}"
146
+ if gt_den_map.shape[-2:] != (self.num_blocks_h, self.num_blocks_w):
147
+ assert gt_den_map.shape[-2:] == (self.input_size, self.input_size), f"Expected gt_den_map to have shape {B, 1, self.input_size, self.input_size}, got {gt_den_map.shape}"
148
+ gt_den_map = _reshape_density(gt_den_map, block_size=self.block_size)
149
+ assert pred_den_map.shape == gt_den_map.shape == (B, 1, self.num_blocks_h, self.num_blocks_w), f"Expected pred_den_map and gt_den_map to have shape (B, 1, H, W), got {pred_den_map.shape} and {gt_den_map.shape}"
150
+ assert len(gt_points) == B, f"Expected gt_points to have length B, got {len(gt_points)}"
151
+
152
+ if self.reg_loss == "zipnll" or self.aux_loss == "zipnll":
153
+ assert pred_logit_pi_map is not None and pred_logit_pi_map.shape == (B, 2, self.num_blocks_h, self.num_blocks_w), f"Expected pred_logit_pi_map to have shape {B, 2, self.num_blocks_h, self.num_blocks_w}, got {pred_logit_pi_map.shape}"
154
+ assert pred_lambda_map is not None and pred_lambda_map.shape == (B, 1, self.num_blocks_h, self.num_blocks_w), f"Expected pred_lambda_map to have shape {B, 1, self.num_blocks_h, self.num_blocks_w}, got {pred_lambda_map.shape}"
155
+
156
+ loss_info = {}
157
+ if self.weight_cls > 0:
158
+ gt_class_map = _bin_count(gt_den_map, bins=self.bins)
159
+ if self.cls_loss == "ce":
160
+ cls_loss = self.cls_loss_fn(pred_logit_map, gt_class_map).sum(dim=(-1, -2)).mean()
161
+ loss_info["cls_ce_loss"] = cls_loss.detach()
162
+ else: # cls_loss == "zice"
163
+ cls_loss, cls_loss_info = self.cls_loss_fn(pred_logit_map, gt_den_map)
164
+ loss_info.update(cls_loss_info)
165
+ else:
166
+ cls_loss = 0
167
+
168
+ if self.weight_reg > 0:
169
+ if self.reg_loss == "zipnll":
170
+ reg_loss, reg_loss_info = self.reg_loss_fn(pred_logit_pi_map, pred_lambda_map, gt_den_map)
171
+ elif self.reg_loss == "dm":
172
+ reg_loss, reg_loss_info = self.reg_loss_fn(pred_den_map, gt_den_map, gt_points)
173
+ elif self.reg_loss in ["pnll", "msmae"]:
174
+ reg_loss, reg_loss_info = self.reg_loss_fn(pred_den_map, gt_den_map)
175
+ else: # reg_loss in ["mae", "mse"]
176
+ reg_loss = self.reg_loss_fn(pred_den_map, gt_den_map).sum(dim=(-1, -2)).mean()
177
+ reg_loss_info = {f"{self.reg_loss}": reg_loss.detach()}
178
+ reg_loss_info = {f"reg_{k}": v for k, v in reg_loss_info.items()}
179
+ loss_info.update(reg_loss_info)
180
+ else:
181
+ reg_loss = 0
182
+
183
+ if self.weight_aux > 0:
184
+ if self.aux_loss == "zipnll":
185
+ aux_loss, aux_loss_info = self.aux_loss_fn(pred_logit_pi_map, pred_lambda_map, gt_den_map)
186
+ elif self.aux_loss in ["pnll", "msmae"]:
187
+ aux_loss, aux_loss_info = self.aux_loss_fn(pred_den_map, gt_den_map)
188
+ elif self.aux_loss == "dm":
189
+ aux_loss, aux_loss_info = self.aux_loss_fn(pred_den_map, gt_den_map, gt_points)
190
+ else:
191
+ aux_loss = self.aux_loss_fn(pred_den_map, gt_den_map).sum(dim=(-1, -2)).mean()
192
+ aux_loss_info = {f"{self.aux_loss}": aux_loss.detach()}
193
+ aux_loss_info = {f"aux_{k}": v for k, v in aux_loss_info.items()}
194
+ loss_info.update(aux_loss_info)
195
+ else:
196
+ aux_loss = 0
197
+
198
+ gt_cnt = torch.tensor([len(p) for p in gt_points], dtype=torch.float32, device=pred_den_map.device)
199
+ cnt_loss = self.cnt_loss_fn(pred_den_map.sum(dim=(1, 2, 3)), gt_cnt)
200
+ loss_info["cnt_loss"] = cnt_loss.detach()
201
+
202
+ total_loss = self.weight_cls * cls_loss + self.weight_reg * reg_loss + self.weight_aux * aux_loss + cnt_loss
203
+ return total_loss, loss_info
204
+
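
A minimal sketch of `QuadLoss` in its default zero-inflated configuration (`reg_loss="zipnll"`); the bins below are illustrative placeholders, the actual bins live in `configs/bin_config.json`:

```python
import torch
from losses import QuadLoss  # repository root on PYTHONPATH

input_size, block_size = 256, 16
bins = [(0, 0), (1, 1), (2, 2), (3, 5), (6, float("inf"))]  # illustrative count bins
criterion = QuadLoss(input_size=input_size, block_size=block_size, bins=bins)

B, H = 2, input_size // block_size                       # 16x16 blocks
pred_logit_map = torch.randn(B, len(bins) - 1, H, H)     # logits over the non-zero bins
pred_den_map = torch.rand(B, 1, H, H)                    # expected count per block
pred_logit_pi_map = torch.randn(B, 2, H, H)              # structural-zero vs. Poisson logits
pred_lambda_map = torch.rand(B, 1, H, H) * 3             # Poisson rate per block
gt_den_map = torch.randint(0, 7, (B, 1, H, H)).float()   # blockwise GT counts
gt_points = [torch.rand(int(gt_den_map[i].sum()), 2) * input_size for i in range(B)]

loss, info = criterion(
    pred_logit_map, pred_den_map, gt_den_map, gt_points,
    pred_logit_pi_map=pred_logit_pi_map, pred_lambda_map=pred_lambda_map,
)
print(loss.item(), sorted(info.keys()))
```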
losses/multiscale_mae.py ADDED
@@ -0,0 +1,55 @@
1
+ from torch import nn, Tensor
2
+ import math
3
+ from typing import List, Optional, Dict, Tuple
4
+
5
+
6
+ class MultiscaleMAE(nn.Module):
7
+ def __init__(
8
+ self,
9
+ scales: List[int] = [1, 2, 4],
10
+ min_scale_weight: float = 0.0,
11
+ max_scale_weight: float = 1.0,
12
+ alpha: float = 0.5,
13
+ weights: Optional[List[float]] = None,
14
+ ) -> None:
15
+ super().__init__()
16
+ assert isinstance(scales, (list, tuple)) and len(scales) > 0 and all(isinstance(s, int) and s > 0 for s in scales), f"Expected scales to be a list of positive integers, got {scales}"
17
+ assert max_scale_weight >= min_scale_weight >= 0, f"Expected max_scale_weight to be greater than or equal to min_scale_weight, got {min_scale_weight} and {max_scale_weight}"
18
+ assert 1 > alpha > 0, f"Expected alpha to be between 0 and 1, got {alpha}"
19
+ self.min_scale_weight, self.max_scale_weight = min_scale_weight, max_scale_weight
20
+
21
+ scales = sorted(scales) # sort scales in ascending order so that the last one is the largest
22
+ weights = [min_scale_weight + (max_scale_weight - min_scale_weight) * alpha ** (math.log2(scales[-1] / s)) for s in scales] if weights is None else weights # e.g., [1, 2, 4, 8] -> [0.125, 0.25, 0.5, 1]
23
+
24
+ assert len(scales) == len(weights), f"Expected scales and weights to have the same length, got {len(scales)} and {len(weights)}"
25
+ self.scales, self.weights = scales, weights
26
+
27
+ for idx in range(len(scales)):
28
+ setattr(self, f"pool_{scales[idx]}", nn.AvgPool2d(kernel_size=scales[idx], stride=scales[idx]) if scales[idx] > 1 else nn.Identity())
29
+ setattr(self, f"weight_{scales[idx]}", weights[idx])
30
+ setattr(self, f"mae_loss_fn_{scales[idx]}", nn.L1Loss(reduction="none"))
31
+
32
+ def forward(
33
+ self,
34
+ pred_den_map: Tensor,
35
+ gt_den_map: Tensor,
36
+ ) -> Tuple[Tensor, Dict]:
37
+ assert len(pred_den_map.shape) == 4, f"Expected pred_den_map to have 4 dimensions, got {len(pred_den_map.shape)}"
38
+ assert len(gt_den_map.shape) == 4, f"Expected gt_den_map to have 4 dimensions, got {len(gt_den_map.shape)}"
39
+ assert pred_den_map.shape[1] == gt_den_map.shape[1] == 1, f"Expected pred_den_map and gt_den_map to have 1 channel, got {pred_den_map.shape[1]} and {gt_den_map.shape[1]}"
40
+ assert pred_den_map.shape == gt_den_map.shape, f"Expected pred_den_map and gt_den_map to have the same shape, got {pred_den_map.shape} and {gt_den_map.shape}"
41
+
42
+ loss, loss_info = 0, {}
43
+ for idx in range(len(self.scales)):
44
+ pool = getattr(self, f"pool_{self.scales[idx]}")
45
+ weight = getattr(self, f"weight_{self.scales[idx]}")
46
+ loss_fn = getattr(self, f"mae_loss_fn_{self.scales[idx]}")
47
+
48
+ pred_den_map_pool = pool(pred_den_map)
49
+ gt_den_map_pool = pool(gt_den_map)
50
+
51
+ mae_loss_scale = loss_fn(pred_den_map_pool, gt_den_map_pool).sum(dim=(-1, -2)).mean()
52
+ loss += weight * mae_loss_scale
53
+ loss_info[f"mae_loss_{self.scales[idx]}"] = mae_loss_scale.detach()
54
+
55
+ return loss, loss_info
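
With the default settings, scale `s` receives weight `min_scale_weight + (max_scale_weight - min_scale_weight) * alpha^(log2(s_max / s))`, so coarser scales are weighted more heavily. A minimal sketch:

```python
import torch
from losses.multiscale_mae import MultiscaleMAE  # repository root on PYTHONPATH

criterion = MultiscaleMAE(scales=[1, 2, 4], alpha=0.5)
print(criterion.weights)  # [0.25, 0.5, 1.0] -- the 4x4-pooled scale gets the largest weight

pred = torch.rand(2, 1, 32, 32)
gt = torch.rand(2, 1, 32, 32)
loss, info = criterion(pred, gt)
print(loss.item(), sorted(info.keys()))  # per-scale terms: mae_loss_1, mae_loss_2, mae_loss_4
```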
losses/poisson_nll.py ADDED
@@ -0,0 +1,46 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ from .utils import _reshape_density
4
+
5
+
6
+ EPS = 1e-8
7
+
8
+
9
+ class PoissonNLL(nn.Module):
10
+ def __init__(
11
+ self,
12
+ reduction: str = "mean",
13
+ ) -> None:
14
+ super().__init__()
15
+ assert reduction in ["none", "mean", "sum"], f"Expected reduction to be one of ['none', 'mean', 'sum'], got {reduction}."
16
+ self.reduction = reduction
17
+
18
+ def forward(self, pred_den_map: Tensor, gt_den_map: Tensor) -> Tensor:
19
+ """
20
+ Args:
21
+ pred_den_map: predicted λ map, shape (B, 1, H, W)
22
+ gt_den_map: ground truth density map, shape (B, 1, H, W)
23
+ Returns:
24
+ Poisson loss
25
+ """
26
+ assert len(pred_den_map.shape) == 4, f"Expected pred_den_map to have 4 dimensions, got {len(pred_den_map.shape)}"
27
+ assert len(gt_den_map.shape) == 4, f"Expected gt_den_map to have 4 dimensions, got {len(gt_den_map.shape)}"
28
+ assert pred_den_map.shape[1] == gt_den_map.shape[1] == 1, f"Expected pred_den_map and gt_den_map to have 1 channel, got {pred_den_map.shape[1]} and {gt_den_map.shape[1]}"
29
+ if gt_den_map.shape != pred_den_map.shape:
30
+ gt_h, gt_w = gt_den_map.shape[-2], gt_den_map.shape[-1]
31
+ pred_h, pred_w = pred_den_map.shape[-2], pred_den_map.shape[-1]
32
+ assert gt_h % pred_h == 0 and gt_w % pred_w == 0 and gt_h // pred_h == gt_w // pred_w, f"Expected the spatial dimension of gt_den_map to be a multiple of that of pred_den_map, got {gt_den_map.shape} and {pred_den_map.shape}"
33
+ gt_den_map = _reshape_density(gt_den_map, block_size=gt_h // pred_h)
34
+
35
+ assert gt_den_map.shape == pred_den_map.shape, f"Expected gt_den_map and pred_den_map to have the same shape, got {gt_den_map.shape} and {pred_den_map.shape}"
36
+
37
+ gt_den_map = gt_den_map.to(pred_den_map.device)
38
+
39
+ loss = (pred_den_map - gt_den_map * torch.log(pred_den_map + EPS)).sum(dim=(-1, -2)) # sum over H and W
40
+
41
+ if self.reduction == "mean":
42
+ loss = loss.mean()
43
+ elif self.reduction == "sum":
44
+ loss = loss.sum()
45
+
46
+ return loss, {"pnll": loss.detach()}
losses/utils.py ADDED
@@ -0,0 +1,19 @@
1
+ import torch
2
+ from torch import Tensor
3
+ from typing import List, Tuple
4
+
5
+
6
+ def _reshape_density(density: Tensor, block_size: int) -> Tensor:
7
+ assert len(density.shape) == 4, f"Expected 4D (B, 1, H, W) tensor, got {density.shape}"
8
+ assert density.shape[1] == 1, f"Expected 1 channel, got {density.shape[1]}"
9
+ assert density.shape[2] % block_size == 0, f"Expected height to be divisible by {block_size}, got {density.shape[2]}"
10
+ assert density.shape[3] % block_size == 0, f"Expected width to be divisible by {block_size}, got {density.shape[3]}"
11
+ return density.reshape(density.shape[0], 1, density.shape[2] // block_size, block_size, density.shape[3] // block_size, block_size).sum(dim=(-1, -3))
12
+
13
+
14
+ def _bin_count(density_map: Tensor, bins: List[Tuple[int, int]]) -> Tensor:
15
+ class_map = torch.zeros_like(density_map, dtype=torch.long)
16
+ for idx, (low, high) in enumerate(bins):
17
+ mask = (density_map >= low) & (density_map <= high)
18
+ class_map[mask] = idx
19
+ return class_map.squeeze(1) # remove channel dimension
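
`_reshape_density` folds a pixel-level density map into blockwise counts by summing each `block_size × block_size` cell, so the total count is preserved. A minimal sketch:

```python
import torch
from losses.utils import _reshape_density, _bin_count  # repository root on PYTHONPATH

den = torch.ones(1, 1, 8, 8)                # 64 "people" spread uniformly over an 8x8 map
blocks = _reshape_density(den, block_size=4)
print(blocks.shape)                         # torch.Size([1, 1, 2, 2])
print(blocks.sum().item())                  # 64.0 -- total count is preserved

cls = _bin_count(blocks, bins=[(0, 0), (1, 15), (16, float("inf"))])
print(cls)                                  # each 4x4 block holds 16, so every block is class 2
```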
losses/zero_inflated_poisson_nll.py ADDED
@@ -0,0 +1,96 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ from einops import rearrange
4
+ from typing import List, Tuple
5
+ from .utils import _reshape_density, _bin_count
6
+
7
+ EPS = 1e-8
8
+
9
+
10
+ class ZIPoissonNLL(nn.Module):
11
+ def __init__(
12
+ self,
13
+ reduction: str = "mean",
14
+ ) -> None:
15
+ super().__init__()
16
+ assert reduction in ["none", "mean", "sum"], f"Expected reduction to be one of ['none', 'mean', 'sum'], got {reduction}."
17
+ self.reduction = reduction
18
+
19
+ def forward(
20
+ self,
21
+ logit_pi_maps: Tensor,
22
+ lambda_maps: Tensor,
23
+ gt_den_maps: Tensor,
24
+ ) -> Tensor:
25
+ assert len(logit_pi_maps.shape) == len(lambda_maps.shape) == len(gt_den_maps.shape) == 4, f"Expected 4D (B, C, H, W) tensor, got {logit_pi_maps.shape}, {lambda_maps.shape}, and {gt_den_maps.shape}"
26
+ B, _, H, W = lambda_maps.shape
27
+ assert logit_pi_maps.shape == (B, 2, H, W), f"Expected logit_pi_maps to have shape (B, 2, H, W), got {logit_pi_maps.shape}"
28
+ assert lambda_maps.shape == (B, 1, H, W), f"Expected lambda_maps to have shape (B, 1, H, W), got {lambda_maps.shape}"
29
+ if gt_den_maps.shape[2:] != (H, W):
30
+ gt_h, gt_w = gt_den_maps.shape[-2], gt_den_maps.shape[-1]
31
+ assert gt_h % H == 0 and gt_w % W == 0 and gt_h // H == gt_w // W, f"Expected the spatial dimension of gt_den_maps to be a multiple of that of lambda_maps, got {gt_den_maps.shape} and {lambda_maps.shape}"
32
+ gt_den_maps = _reshape_density(gt_den_maps, block_size=gt_h // H)
33
+ assert gt_den_maps.shape == (B, 1, H, W), f"Expected gt_den_maps to have shape (B, 1, H, W), got {gt_den_maps.shape}"
34
+
35
+ pi_maps = logit_pi_maps.softmax(dim=1)
36
+ zero_indices = (gt_den_maps == 0).float()
37
+ zero_loss = -torch.log(pi_maps[:, 0:1] + pi_maps[:, 1:] * torch.exp(-lambda_maps) + EPS) * zero_indices
38
+
39
+ poisson_log_p = gt_den_maps * torch.log(lambda_maps + EPS) - lambda_maps
40
+ nonzero_loss = (-torch.log(pi_maps[:, 1:] + EPS) - poisson_log_p) * (1.0 - zero_indices)
41
+
42
+ loss = (zero_loss + nonzero_loss).sum(dim=(-1, -2))
43
+ if self.reduction == "mean":
44
+ loss = loss.mean()
45
+ elif self.reduction == "sum":
46
+ loss = loss.sum()
47
+
48
+ return loss, {"zipnll": loss.detach()}
49
+
50
+
51
+ class ZICrossEntropy(nn.Module):
52
+ def __init__(
53
+ self,
54
+ bins: List[Tuple[int, int]],
55
+ reduction: str = "mean",
56
+ ) -> None:
57
+ super().__init__()
58
+ assert all([low <= high for low, high in bins]), f"Expected bins to be a list of tuples (low, high) where low <= high, got {bins}"
59
+ assert reduction in ["mean", "sum"], f"Expected reduction to be one of ['none', 'mean', 'sum'], got {reduction}."
60
+
61
+ self.bins = bins
62
+ self.reduction = reduction
63
+ self.ce_loss_fn = nn.CrossEntropyLoss(reduction="none")
64
+
65
+ def forward(
66
+ self,
67
+ logit_maps: Tensor,
68
+ gt_den_maps: Tensor,
69
+ ) -> Tensor:
70
+ assert len(logit_maps.shape) == len(gt_den_maps.shape) == 4, f"Expected 4D (B, C, H, W) tensor, got {logit_maps.shape} and {gt_den_maps.shape}"
71
+ B, _, H, W = logit_maps.shape
72
+ assert logit_maps.shape[0] == B and logit_maps.shape[2:] == (H, W), f"Expected logit_maps to have shape (B, C, H, W), got {logit_maps.shape}"
73
+ if gt_den_maps.shape[2:] != (H, W):
74
+ gt_h, gt_w = gt_den_maps.shape[-2], gt_den_maps.shape[-1]
75
+ assert gt_h % H == 0 and gt_w % W == 0 and gt_h // H == gt_w // W, f"Expected the spatial dimension of gt_den_maps to be a multiple of that of logit_maps, got {gt_den_maps.shape} and {logit_maps.shape}"
76
+ gt_den_maps = _reshape_density(gt_den_maps, block_size=gt_h // H)
77
+ assert gt_den_maps.shape == (B, 1, H, W), f"Expected gt_den_maps to have shape (B, 1, H, W), got {gt_den_maps.shape}"
78
+
79
+ gt_class_maps = _bin_count(gt_den_maps, bins=self.bins)
80
+ gt_class_maps = rearrange(gt_class_maps, "B H W -> B (H W)") # flatten spatial dimensions
81
+ logit_maps = rearrange(logit_maps, "B C H W -> B (H W) C") # flatten spatial dimensions
82
+
83
+ loss = 0.0
84
+ for idx in range(gt_class_maps.shape[0]):
85
+ gt_class_map, logit_map = gt_class_maps[idx], logit_maps[idx]
86
+ mask = gt_class_map > 0
87
+ # Find gt_class_map values and logit_maps values where gt_class_map > 0
88
+ gt_class_map = gt_class_map[mask] - 1
89
+ logit_map = logit_map[mask]
90
+ loss += self.ce_loss_fn(logit_map, gt_class_map).sum()
91
+
92
+ if self.reduction == "mean":
93
+ loss /= gt_class_maps.shape[0]
94
+
95
+ return loss, {"cls_zice": loss.detach()}
96
+
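
Per block, the zero-inflated Poisson NLL has two branches: `-log(π₀ + π₁·e^(-λ))` for empty blocks and `-log π₁ - (y·log λ - λ)` for blocks with `y > 0`. A minimal sketch checking `ZIPoissonNLL` against these expressions on a 1×1 map (numbers are arbitrary; values agree up to the `EPS` stabilizer):

```python
import math
import torch
from losses.zero_inflated_poisson_nll import ZIPoissonNLL  # repository root on PYTHONPATH

criterion = ZIPoissonNLL(reduction="mean")

logit_pi = torch.tensor([[[[0.2]], [[1.3]]]])   # shape (1, 2, 1, 1): [zero-logit, Poisson-logit]
lam = torch.tensor([[[[2.0]]]])                 # shape (1, 1, 1, 1)
pi0, pi1 = torch.softmax(logit_pi, dim=1).flatten().tolist()

# Empty block: y = 0
loss0, _ = criterion(logit_pi, lam, torch.zeros(1, 1, 1, 1))
print(loss0.item(), -math.log(pi0 + pi1 * math.exp(-2.0)))

# Occupied block: y = 3
loss3, _ = criterion(logit_pi, lam, torch.full((1, 1, 1, 1), 3.0))
print(loss3.item(), -math.log(pi1) - (3.0 * math.log(2.0) - 2.0))
```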
models/__init__.py ADDED
@@ -0,0 +1,155 @@
1
+ import os, torch
2
+ from typing import List, Tuple, Optional, Union, Dict
3
+
4
+ from .ebc import _ebc, EBC
5
+ from .clip_ebc import _clip_ebc, CLIP_EBC
6
+
7
+
8
+ def get_model(
9
+ model_info_path: str,
10
+ model_name: Optional[str] = None,
11
+ block_size: Optional[int] = None,
12
+ bins: Optional[List[Tuple[float, float]]] = None,
13
+ bin_centers: Optional[List[float]] = None,
14
+ zero_inflated: Optional[bool] = True,
15
+ # parameters for CLIP_EBC
16
+ clip_weight_name: Optional[str] = None,
17
+ num_vpt: Optional[int] = None,
18
+ vpt_drop: Optional[float] = None,
19
+ input_size: Optional[int] = None,
20
+ adapter: bool = False,
21
+ adapter_reduction: Optional[int] = None,
22
+ lora: bool = False,
23
+ lora_rank: Optional[int] = None,
24
+ lora_alpha: Optional[int] = None,
25
+ lora_dropout: Optional[float] = None,
26
+ norm: str = "none",
27
+ act: str = "none",
28
+ text_prompts: Optional[List[str]] = None
29
+ ) -> Union[EBC, CLIP_EBC]:
30
+ if os.path.exists(model_info_path):
31
+ model_info = torch.load(model_info_path, map_location="cpu", weights_only=False)
32
+
33
+ model_name = model_info["config"]["model_name"]
34
+ block_size = model_info["config"]["block_size"]
35
+ bins = model_info["config"]["bins"]
36
+ bin_centers = model_info["config"]["bin_centers"]
37
+ zero_inflated = model_info["config"]["zero_inflated"]
38
+
39
+ clip_weight_name = model_info["config"].get("clip_weight_name", None)
40
+
41
+ num_vpt = model_info["config"].get("num_vpt", None)
42
+ vpt_drop = model_info["config"].get("vpt_drop", None)
43
+
44
+
45
+ adapter = model_info["config"].get("adapter", False)
46
+ adapter_reduction = model_info["config"].get("adapter_reduction", None)
47
+
48
+ lora = model_info["config"].get("lora", False)
49
+ lora_rank = model_info["config"].get("lora_rank", None)
50
+ lora_alpha = model_info["config"].get("lora_alpha", None)
51
+ lora_dropout = model_info["config"].get("lora_dropout", None)
52
+
53
+ input_size = model_info["config"].get("input_size", None)
54
+ text_prompts = model_info["config"].get("text_prompts", None)
55
+
56
+ norm = model_info["config"].get("norm", "none")
57
+ act = model_info["config"].get("act", "none")
58
+
59
+ weights = model_info["weights"]
60
+
61
+ else:
62
+ assert model_name is not None, "model_name should be provided if model_info_path is not provided"
63
+ assert block_size is not None, "block_size should be provided"
64
+ assert bins is not None, "bins should be provided"
65
+ assert bin_centers is not None, "bin_centers should be provided"
66
+ weights = None
67
+
68
+ if "ViT" in model_name:
69
+ assert num_vpt is not None, f"num_vpt should be provided for ViT models, got {num_vpt}"
70
+ assert vpt_drop is not None, f"vpt_drop should be provided for ViT models, got {vpt_drop}"
71
+
72
+ if model_name.startswith("CLIP_") or model_name.startswith("CLIP-"):
73
+ assert clip_weight_name is not None, f"clip_weight_name should be provided for CLIP models, got {clip_weight_name}"
74
+ model = _clip_ebc(
75
+ model_name=model_name[5:],
76
+ weight_name=clip_weight_name,
77
+ block_size=block_size,
78
+ bins=bins,
79
+ bin_centers=bin_centers,
80
+ zero_inflated=zero_inflated,
81
+ num_vpt=num_vpt,
82
+ vpt_drop=vpt_drop,
83
+ input_size=input_size,
84
+ adapter=adapter,
85
+ adapter_reduction=adapter_reduction,
86
+ lora=lora,
87
+ lora_rank=lora_rank,
88
+ lora_alpha=lora_alpha,
89
+ lora_dropout=lora_dropout,
90
+ text_prompts=text_prompts,
91
+ norm=norm,
92
+ act=act
93
+ )
94
+ model_config = {
95
+ "model_name": model_name,
96
+ "block_size": block_size,
97
+ "bins": bins,
98
+ "bin_centers": bin_centers,
99
+ "zero_inflated": zero_inflated,
100
+ "clip_weight_name": clip_weight_name,
101
+ "num_vpt": num_vpt,
102
+ "vpt_drop": vpt_drop,
103
+ "input_size": input_size,
104
+ "adapter": adapter,
105
+ "adapter_reduction": adapter_reduction,
106
+ "lora": lora,
107
+ "lora_rank": lora_rank,
108
+ "lora_alpha": lora_alpha,
109
+ "lora_dropout": lora_dropout,
110
+ "text_prompts": model.text_prompts,
111
+ "norm": norm,
112
+ "act": act
113
+ }
114
+
115
+ else:
116
+ assert not adapter, "adapter for non-CLIP models is not implemented yet"
117
+ assert not lora, "lora for non-CLIP models is not implemented yet"
118
+ model = _ebc(
119
+ model_name=model_name,
120
+ block_size=block_size,
121
+ bins=bins,
122
+ bin_centers=bin_centers,
123
+ zero_inflated=zero_inflated,
124
+ num_vpt=num_vpt,
125
+ vpt_drop=vpt_drop,
126
+ input_size=input_size,
127
+ norm=norm,
128
+ act=act
129
+ )
130
+ model_config = {
131
+ "model_name": model_name,
132
+ "block_size": block_size,
133
+ "bins": bins,
134
+ "bin_centers": bin_centers,
135
+ "zero_inflated": zero_inflated,
136
+ "num_vpt": num_vpt,
137
+ "vpt_drop": vpt_drop,
138
+ "input_size": input_size,
139
+ "norm": norm,
140
+ "act": act
141
+ }
142
+
143
+ model.config = model_config
144
+ model_info = {"config": model_config, "weights": weights}
145
+
146
+ if weights is not None:
147
+ model.load_state_dict(weights)
148
+
149
+ if not os.path.exists(model_info_path):
150
+ torch.save(model_info, model_info_path)
151
+
152
+ return model
153
+
154
+
155
+ __all__ = ["get_model"]
models/clip_ebc/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ from .model import CLIP_EBC, _clip_ebc
2
+
3
+
4
+ __all__ = [
5
+ "CLIP_EBC",
6
+ "_clip_ebc",
7
+ ]
models/clip_ebc/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (264 Bytes).
models/clip_ebc/__pycache__/convnext.cpython-312.pyc ADDED
Binary file (8.36 kB).
models/clip_ebc/__pycache__/mobileclip.cpython-312.pyc ADDED
Binary file (7.82 kB).
models/clip_ebc/__pycache__/model.cpython-312.pyc ADDED
Binary file (12.8 kB).
models/clip_ebc/__pycache__/resnet.cpython-312.pyc ADDED
Binary file (9.74 kB).
models/clip_ebc/__pycache__/utils.cpython-312.pyc ADDED
Binary file (9.93 kB).
models/clip_ebc/__pycache__/vit.cpython-312.pyc ADDED
Binary file (16.7 kB).
models/clip_ebc/__pycache__/vit_siglip.cpython-312.pyc ADDED
Binary file (13.5 kB).
models/clip_ebc/convnext.py ADDED
@@ -0,0 +1,199 @@
1
+ from torch import nn, Tensor
2
+ import open_clip
3
+ from peft import get_peft_model, LoraConfig
4
+
5
+ from ..utils import ConvRefine, ConvAdapter
6
+ from ..utils import ConvUpsample, _get_norm_layer, _get_activation
7
+
8
+
9
+ convnext_names_and_weights = {
10
+ "convnext_base": ["laion400m_s13b_b51k"], # 107.49M
11
+ "convnext_base_w": ["laion2b_s13b_b82k", "laion2b_s13b_b82k_augreg", "laion_aesthetic_s13b_b82k"], # 107.75M
12
+ "convnext_base_w_320": ["laion_aesthetic_s13b_b82k", "laion_aesthetic_s13b_b82k_augreg"], # 107.75M
13
+ "convnext_large_d": ["laion2b_s26b_b102k_augreg"], # 217.46M
14
+ "convnext_large_d_320": ["laion2b_s29b_b131k_ft", "laion2b_s29b_b131k_ft_soup"], # 217.46M
15
+ "convnext_xxlarge": ["laion2b_s34b_b82k_augreg", "laion2b_s34b_b82k_augreg_rewind", "laion2b_s34b_b82k_augreg_soup"] # 896.88M
16
+ }
17
+
18
+ refiner_channels = {
19
+ "convnext_base": 1024,
20
+ "convnext_base_w": 1024,
21
+ "convnext_base_w_320": 1024,
22
+ "convnext_large_d": 1536,
23
+ "convnext_large_d_320": 1536,
24
+ "convnext_xxlarge": 3072,
25
+ }
26
+
27
+ refiner_groups = {
28
+ "convnext_base": 1,
29
+ "convnext_base_w": 1,
30
+ "convnext_base_w_320": 1,
31
+ "convnext_large_d": refiner_channels["convnext_large_d"] // 512, # 3
32
+ "convnext_large_d_320": refiner_channels["convnext_large_d_320"] // 512, # 3
33
+ "convnext_xxlarge": refiner_channels["convnext_xxlarge"] // 512, # 6
34
+ }
35
+
36
+
37
+
38
+ class ConvNeXt(nn.Module):
39
+ def __init__(
40
+ self,
41
+ model_name: str,
42
+ weight_name: str,
43
+ block_size: int = 16,
44
+ adapter: bool = False,
45
+ adapter_reduction: int = 4,
46
+ norm: str = "none",
47
+ act: str = "none"
48
+ ) -> None:
49
+ super(ConvNeXt, self).__init__()
50
+ assert model_name in convnext_names_and_weights, f"Model name should be one of {list(convnext_names_and_weights.keys())}, but got {model_name}."
51
+ assert weight_name in convnext_names_and_weights[model_name], f"Pretrained should be one of {convnext_names_and_weights[model_name]}, but got {weight_name}."
52
+ assert block_size in [32, 16, 8], f"block_size should be one of [32, 16, 8], got {block_size}"
53
+ self.model_name, self.weight_name = model_name, weight_name
54
+ self.block_size = block_size
55
+
56
+ model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
57
+
58
+ self.adapter = adapter
59
+ if adapter:
60
+ self.adapter_reduction = adapter_reduction
61
+ for param in model.parameters():
62
+ param.requires_grad = False
63
+
64
+ self.stem = model.trunk.stem
65
+ self.depth = len(model.trunk.stages)
66
+ for idx, stage in enumerate(model.trunk.stages):
67
+ setattr(self, f"stage{idx}", stage)
68
+ if adapter:
69
+ setattr(self, f"adapter{idx}", ConvAdapter(
70
+ in_channels=stage.blocks[-1].mlp.fc2.out_features,
71
+ bottleneck_channels=stage.blocks[-1].mlp.fc2.out_features // adapter_reduction,
72
+ ) if idx < self.depth - 1 else nn.Identity()) # No adapter for the last stage
73
+
74
+ if self.model_name in ["convnext_base", "convnext_base_w", "convnext_base_w_320", "convnext_xxlarge"]:
75
+ self.in_features, self.out_features = model.head.proj.in_features, model.head.proj.out_features
76
+ else: # "convnext_large_d", "convnext_large_d_320":
77
+ self.in_features, self.out_features = model.head.mlp.fc1.in_features, model.head.mlp.fc2.out_features
78
+
79
+ if norm == "bn":
80
+ norm_layer = nn.BatchNorm2d
81
+ elif norm == "ln":
82
+ norm_layer = nn.LayerNorm
83
+ else:
84
+ norm_layer = _get_norm_layer(model)
85
+
86
+ if act == "relu":
87
+ activation = nn.ReLU(inplace=True)
88
+ elif act == "gelu":
89
+ activation = nn.GELU()
90
+ else:
91
+ activation = _get_activation(model)
92
+
93
+ if block_size == 32:
94
+ self.refiner = ConvRefine(
95
+ in_channels=self.in_features,
96
+ out_channels=self.in_features,
97
+ norm_layer=norm_layer,
98
+ activation=activation,
99
+ groups=refiner_groups[self.model_name],
100
+ )
101
+ elif block_size == 16:
102
+ self.refiner = ConvUpsample(
103
+ in_channels=self.in_features,
104
+ out_channels=self.in_features,
105
+ norm_layer=norm_layer,
106
+ activation=activation,
107
+ groups=refiner_groups[self.model_name],
108
+ )
109
+ else: # block_size == 8
110
+ self.refiner = nn.Sequential(
111
+ ConvUpsample(
112
+ in_channels=self.in_features,
113
+ out_channels=self.in_features,
114
+ norm_layer=norm_layer,
115
+ activation=activation,
116
+ groups=refiner_groups[self.model_name],
117
+ ),
118
+ ConvUpsample(
119
+ in_channels=self.in_features,
120
+ out_channels=self.in_features,
121
+ norm_layer=norm_layer,
122
+ activation=activation,
123
+ groups=refiner_groups[self.model_name],
124
+ ),
125
+ )
126
+
127
+ def train(self, mode: bool = True):
128
+ if self.adapter and mode:
129
+ # training:
130
+ self.stem.eval()
131
+
132
+ for idx in range(self.depth):
133
+ getattr(self, f"stage{idx}").eval()
134
+ getattr(self, f"adapter{idx}").train()
135
+
136
+ self.refiner.train()
137
+
138
+ else:
139
+ # evaluation:
140
+ for module in self.children():
141
+ module.train(mode)
142
+
143
+ def forward(self, x: Tensor) -> Tensor:
144
+ x = self.stem(x)
145
+
146
+ for idx in range(self.depth):
147
+ x = getattr(self, f"stage{idx}")(x)
148
+ if self.adapter:
149
+ x = getattr(self, f"adapter{idx}")(x)
150
+
151
+ x = self.refiner(x)
152
+ return x
153
+
154
+
155
+ def _convnext(
156
+ model_name: str,
157
+ weight_name: str,
158
+ block_size: int = 16,
159
+ adapter: bool = False,
160
+ adapter_reduction: int = 4,
161
+ lora: bool = False,
162
+ lora_rank: int = 16,
163
+ lora_alpha: float = 32.0,
164
+ lora_dropout: float = 0.1,
165
+ norm: str = "none",
166
+ act: str = "none"
167
+ ) -> ConvNeXt:
168
+ assert not (lora and adapter), "Lora and adapter cannot be used together."
169
+ model = ConvNeXt(
170
+ model_name=model_name,
171
+ weight_name=weight_name,
172
+ block_size=block_size,
173
+ adapter=adapter,
174
+ adapter_reduction=adapter_reduction,
175
+ norm=norm,
176
+ act=act
177
+ )
178
+
179
+ if lora:
180
+ target_modules = []
181
+ for name, module in model.named_modules():
182
+ if isinstance(module, (nn.Linear, nn.Conv2d)) and "refiner" not in name:
183
+ target_modules.append(name)
184
+
185
+ lora_config = LoraConfig(
186
+ r=lora_rank,
187
+ lora_alpha=lora_alpha,
188
+ lora_dropout=lora_dropout,
189
+ bias="none",
190
+ target_modules=target_modules,
191
+ )
192
+ model = get_peft_model(model, lora_config)
193
+
194
+ # Unfreeze refiner
195
+ for name, module in model.named_modules():
196
+ if "refiner" in name:
197
+ module.requires_grad_(True)
198
+
199
+ return model
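
A minimal usage sketch of the ConvNeXt backbone above (assuming `open_clip` can fetch the `laion2b_s13b_b82k` weights): with `adapter=True` the CLIP trunk stays frozen, only the adapters and the refiner are trainable, and the output feature map has stride equal to `block_size`.

```python
# Sketch only: build the frozen CLIP-ConvNeXt backbone with adapters and check
# that the output stride matches the requested block_size (16 here).
import torch
from models.clip_ebc.convnext import _convnext

backbone = _convnext("convnext_base_w", "laion2b_s13b_b82k", block_size=16, adapter=True)
backbone.eval()
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 448, 448))
print(feats.shape)  # (1, backbone.in_features, 448 // 16, 448 // 16)
```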
models/clip_ebc/mobileclip.py ADDED
@@ -0,0 +1,197 @@
1
+ from torch import nn, Tensor
2
+ import open_clip
3
+ from peft import get_peft_model, LoraConfig
4
+
5
+ from ..utils import ConvRefine, ConvUpsample, ConvAdapter
6
+ from ..utils import _get_norm_layer, _get_activation
7
+
8
+
9
+ mobileclip_names_and_weights = {
10
+ "MobileCLIP-S1": ["datacompdr"],
11
+ "MobileCLIP-S2": ["datacompdr"],
12
+ }
13
+
14
+
15
+ refiner_channels = {
16
+ "MobileCLIP-S1": 1024,
17
+ "MobileCLIP-S2": 1280,
18
+ }
19
+
20
+ refiner_groups = {
21
+ "MobileCLIP-S1": 2,
22
+ "MobileCLIP-S2": 2,
23
+ }
24
+
25
+
26
+ class MobileCLIP(nn.Module):
27
+ def __init__(
28
+ self,
29
+ model_name: str,
30
+ weight_name: str,
31
+ block_size: int = 16,
32
+ adapter: bool = False,
33
+ adapter_reduction: int = 4,
34
+ norm: str = "none",
35
+ act: str = "none"
36
+ ) -> None:
37
+ super().__init__()
38
+ assert model_name in mobileclip_names_and_weights, f"Model name should be one of {list(mobileclip_names_and_weights.keys())}, but got {model_name}."
39
+ assert weight_name in mobileclip_names_and_weights[model_name], f"Pretrained should be one of {mobileclip_names_and_weights[model_name]}, but got {weight_name}."
40
+ assert block_size in [32, 16, 8], f"block_size should be one of [32, 16, 8], got {block_size}"
41
+ self.model_name, self.weight_name = model_name, weight_name
42
+ self.block_size = block_size
43
+
44
+ model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
45
+
46
+ self.adapter = adapter
47
+ if adapter:
48
+ for param in model.parameters():
49
+ param.requires_grad = False
50
+
51
+ self.stem = model.trunk.stem
52
+ self.stages = model.trunk.stages
53
+
54
+ self.depth = len(model.trunk.stages)
55
+ for idx, stage in enumerate(model.trunk.stages):
56
+ if adapter:
57
+ setattr(self, f"adapter{idx}", ConvAdapter(
58
+ in_channels=stage.blocks[-1].mlp.fc2.out_channels,
59
+ bottleneck_channels=stage.blocks[-1].mlp.fc2.out_channels // adapter_reduction,
60
+ ))
61
+
62
+ self.final_conv = model.trunk.final_conv
63
+
64
+ self.in_features, self.out_features = model.trunk.head.fc.in_features, model.trunk.head.fc.out_features
65
+
66
+ # refine_block = LightConvRefine if model_name == "MobileCLIP-S1" else ConvRefine
67
+ # upsample_block = LightConvUpsample if model_name == "MobileCLIP-S1" else ConvUpsample
68
+
69
+ if norm == "bn":
70
+ norm_layer = nn.BatchNorm2d
71
+ elif norm == "ln":
72
+ norm_layer = nn.LayerNorm
73
+ else:
74
+ norm_layer = _get_norm_layer(model)
75
+
76
+ if act == "relu":
77
+ activation = nn.ReLU(inplace=True)
78
+ elif act == "gelu":
79
+ activation = nn.GELU()
80
+ else:
81
+ activation = _get_activation(model)
82
+
83
+ if block_size == 32:
84
+ self.refiner = ConvRefine(
85
+ in_channels=self.in_features,
86
+ out_channels=self.in_features,
87
+ norm_layer=norm_layer,
88
+ activation=activation,
89
+ groups=refiner_groups[model_name],
90
+ )
91
+ elif block_size == 16:
92
+ self.refiner = ConvUpsample(
93
+ in_channels=self.in_features,
94
+ out_channels=self.in_features,
95
+ norm_layer=norm_layer,
96
+ activation=activation,
97
+ groups=refiner_groups[self.model_name],
98
+ )
99
+ else: # block_size == 8
100
+ self.refiner = nn.Sequential(
101
+ ConvUpsample(
102
+ in_channels=self.in_features,
103
+ out_channels=self.in_features,
104
+ norm_layer=norm_layer,
105
+ activation=activation,
106
+ groups=refiner_groups[self.model_name],
107
+ ),
108
+ ConvUpsample(
109
+ in_channels=self.in_features,
110
+ out_channels=self.in_features,
111
+ norm_layer=norm_layer,
112
+ activation=activation,
113
+ groups=refiner_groups[self.model_name],
114
+ ),
115
+ )
116
+
117
+ def train(self, mode: bool = True):
118
+ if self.adapter and mode:
119
+ # training:
120
+ self.stem.eval()
121
+
122
+ for idx in range(self.depth):
123
+ self.stages[idx].eval()
124
+ getattr(self, f"adapter{idx}").train()
125
+
126
+ self.final_conv.eval()
127
+ self.refiner.train()
128
+
129
+ else:
130
+ # evaluation:
131
+ for module in self.children():
132
+ module.train(mode)
133
+
134
+ def forward(self, x: Tensor) -> Tensor:
135
+ x = self.stem(x)
136
+
137
+ for idx in range(self.depth):
138
+ x = self.stages[idx](x)
139
+ if self.adapter:
140
+ x = getattr(self, f"adapter{idx}")(x)
141
+
142
+ x = self.final_conv(x)
143
+
144
+ x = self.refiner(x)
145
+ return x
146
+
147
+
148
+ def _mobileclip(
149
+ model_name: str,
150
+ weight_name: str,
151
+ block_size: int = 16,
152
+ adapter: bool = False,
153
+ adapter_reduction: int = 4,
154
+ lora: bool = False,
155
+ lora_rank: int = 16,
156
+ lora_alpha: float = 32.0,
157
+ lora_dropout: float = 0.1,
158
+ norm: str = "none",
159
+ act: str = "none"
160
+ ) -> MobileCLIP:
161
+ assert not (lora and adapter), "Lora and adapter cannot be used together."
162
+ model = MobileCLIP(
163
+ model_name=model_name,
164
+ weight_name=weight_name,
165
+ block_size=block_size,
166
+ adapter=adapter,
167
+ adapter_reduction=adapter_reduction,
168
+ norm=norm,
169
+ act=act
170
+ )
171
+
172
+ if lora:
173
+ target_modules = []
174
+ for name, module in model.named_modules():
175
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
176
+ target_modules.append(name)
177
+
178
+ lora_config = LoraConfig(
179
+ r=lora_rank,
180
+ lora_alpha=lora_alpha,
181
+ lora_dropout=lora_dropout,
182
+ bias="none",
183
+ target_modules=target_modules,
184
+ )
185
+ model = get_peft_model(model, lora_config)
186
+
187
+ # Unfreeze the BN layers
188
+ for name, module in model.named_modules():
+ if isinstance(module, nn.BatchNorm2d) and "refiner" not in name:
+ module.requires_grad_(True)
191
+
192
+ # Unfreeze refiner
193
+ for name, module in model.named_modules():
194
+ if "refiner" in name:
195
+ module.requires_grad_(True)
196
+
197
+ return model
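
As with the other backbones, the overridden `train()` is what keeps the frozen MobileCLIP trunk in eval mode (so its BatchNorm statistics are not updated) while the adapters and refiner are trained. A small sketch, assuming the `datacompdr` weights can be downloaded.

```python
# Sketch only: with adapters enabled, .train() leaves the frozen trunk in eval
# mode and only flips the adapters and the refiner to training mode.
from models.clip_ebc.mobileclip import _mobileclip

backbone = _mobileclip("MobileCLIP-S2", "datacompdr", block_size=16, adapter=True)
backbone.train()
print(backbone.stem.training)      # False: frozen trunk stays in eval mode
print(backbone.adapter0.training)  # True:  adapters are trained
print(backbone.refiner.training)   # True:  refiner is trained
```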
models/clip_ebc/model.py ADDED
@@ -0,0 +1,272 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from typing import List, Optional, Dict, Tuple
6
+ from copy import deepcopy
7
+
8
+ from .vit import vit_names_and_weights, _vit
9
+ from .convnext import convnext_names_and_weights, _convnext
10
+ from .resnet import resnet_names_and_weights, _resnet
11
+ from .mobileclip import mobileclip_names_and_weights, _mobileclip
12
+
13
+ from .utils import encode_text, optimize_text_prompts
14
+ from ..utils import conv1x1
15
+
16
+ supported_models_and_weights = deepcopy(vit_names_and_weights)
17
+ supported_models_and_weights.update(convnext_names_and_weights)
18
+ supported_models_and_weights.update(resnet_names_and_weights)
19
+ supported_models_and_weights.update(mobileclip_names_and_weights)
20
+
21
+
22
+ class CLIP_EBC(nn.Module):
23
+ def __init__(
24
+ self,
25
+ model_name: str,
26
+ weight_name: str,
27
+ block_size: Optional[int] = None,
28
+ bins: Optional[List[Tuple[float, float]]] = None,
29
+ bin_centers: Optional[List[float]] = None,
30
+ zero_inflated: Optional[bool] = True,
31
+ num_vpt: Optional[int] = None,
32
+ vpt_drop: Optional[float] = None,
33
+ input_size: Optional[int] = None,
34
+ adapter: Optional[bool] = False,
35
+ adapter_reduction: Optional[int] = None,
36
+ lora: Optional[bool] = False,
37
+ lora_rank: Optional[int] = None,
38
+ lora_alpha: Optional[float] = None,
39
+ lora_dropout: Optional[float] = None,
40
+ text_prompts: Optional[Dict[str, List[str]]] = None,
41
+ norm: Optional[str] = "none",
42
+ act: Optional[str] = "none",
43
+ ) -> None:
44
+ super().__init__()
45
+ if "mobileclip" in model_name.lower() or "vit" in model_name.lower():
46
+ model_name = model_name.replace("_", "-")
47
+ assert model_name in supported_models_and_weights, f"Model name should be one of {list(supported_models_and_weights.keys())}, but got {model_name}."
48
+ assert weight_name in supported_models_and_weights[model_name], f"Pretrained should be one of {supported_models_and_weights[model_name]}, but got {weight_name}."
49
+ assert len(bins) == len(bin_centers), f"Expected bins and bin_centers to have the same length, got {len(bins)} and {len(bin_centers)}"
50
+ assert len(bins) >= 2, f"Expected at least 2 bins, got {len(bins)}"
51
+ assert all(len(b) == 2 for b in bins), f"Expected bins to be a list of tuples of length 2, got {bins}"
52
+ bins = [(float(b[0]), float(b[1])) for b in bins]
53
+ assert all(bin[0] <= p <= bin[1] for bin, p in zip(bins, bin_centers)), f"Expected bin_centers to be within the range of the corresponding bin, got {bins} and {bin_centers}"
54
+
55
+ self.model_name = model_name
56
+ self.weight_name = weight_name
57
+ self.block_size = block_size
58
+ self.bins = bins
59
+ self.register_buffer("bin_centers", torch.tensor(bin_centers, dtype=torch.float32, requires_grad=False).view(1, -1, 1, 1))
60
+ self.zero_inflated = zero_inflated
61
+ self.text_prompts = text_prompts
62
+
63
+ # Image encoder
64
+ if model_name in vit_names_and_weights:
65
+ assert num_vpt is not None and num_vpt >= 0, f"Number of VPT tokens should be greater than 0, but got {num_vpt}."
66
+ vpt_drop = 0. if vpt_drop is None else vpt_drop
67
+ self.backbone = _vit(
68
+ model_name=model_name,
69
+ weight_name=weight_name,
70
+ num_vpt=num_vpt,
71
+ vpt_drop=vpt_drop,
72
+ block_size=block_size,
73
+ adapter=adapter,
74
+ adapter_reduction=adapter_reduction,
75
+ lora=lora,
76
+ lora_rank=lora_rank,
77
+ lora_alpha=lora_alpha,
78
+ lora_dropout=lora_dropout,
79
+ input_size=(input_size, input_size),
80
+ norm=norm,
81
+ act=act
82
+ )
83
+ elif model_name in convnext_names_and_weights:
84
+ self.backbone = _convnext(
85
+ model_name=model_name,
86
+ weight_name=weight_name,
87
+ block_size=block_size,
88
+ adapter=adapter,
89
+ adapter_reduction=adapter_reduction,
90
+ lora=lora,
91
+ lora_rank=lora_rank,
92
+ lora_alpha=lora_alpha,
93
+ lora_dropout=lora_dropout,
94
+ norm=norm,
95
+ act=act
96
+ )
97
+ elif model_name in resnet_names_and_weights:
98
+ self.backbone = _resnet(
99
+ model_name=model_name,
100
+ weight_name=weight_name,
101
+ block_size=block_size,
102
+ adapter=adapter,
103
+ adapter_reduction=adapter_reduction,
104
+ lora=lora,
105
+ lora_rank=lora_rank,
106
+ lora_alpha=lora_alpha,
107
+ lora_dropout=lora_dropout,
108
+ norm=norm,
109
+ act=act
110
+ )
111
+ elif model_name in mobileclip_names_and_weights:
112
+ self.backbone = _mobileclip(
113
+ model_name=model_name,
114
+ weight_name=weight_name,
115
+ block_size=block_size,
116
+ adapter=adapter,
117
+ adapter_reduction=adapter_reduction,
118
+ lora=lora,
119
+ lora_rank=lora_rank,
120
+ lora_alpha=lora_alpha,
121
+ lora_dropout=lora_dropout,
122
+ norm=norm,
123
+ act=act
124
+ )
125
+
126
+ self._build_text_feats()
127
+ self._build_head()
128
+
129
+ def _build_text_feats(self) -> None:
130
+ model_name, weight_name = self.model_name, self.weight_name
131
+ text_prompts = self.text_prompts
132
+
133
+ if text_prompts is None:
134
+ bins = [b[0] if b[0] == b[1] else b for b in self.bins] # if the bin is a single value (e.g., [0, 0]), use that value
135
+ if self.zero_inflated: # separate 0 from the rest
136
+ assert bins[0] == 0, f"Expected the first bin to be 0, got {bins[0]}."
137
+ bins_pi = [0, (1, float("inf"))]
138
+ bins_lambda = bins[1:]
139
+ pi_text_prompts = optimize_text_prompts(model_name, weight_name, bins_pi)
140
+ lambda_text_prompts = optimize_text_prompts(model_name, weight_name, bins_lambda)
141
+ self.text_prompts = {"pi": pi_text_prompts, "lambda": lambda_text_prompts}
142
+ pi_text_feats = encode_text(model_name, weight_name, pi_text_prompts)
143
+ lambda_text_feats = encode_text(model_name, weight_name, lambda_text_prompts)
144
+ pi_text_feats.requires_grad = False
145
+ lambda_text_feats.requires_grad = False
146
+ self.register_buffer("pi_text_feats", pi_text_feats)
147
+ self.register_buffer("lambda_text_feats", lambda_text_feats)
148
+
149
+ else:
150
+ text_prompts = optimize_text_prompts(model_name, weight_name, bins)
151
+ self.text_prompts = text_prompts
152
+ text_feats = encode_text(model_name, weight_name, text_prompts)
153
+ text_feats.requires_grad = False
154
+ self.register_buffer("text_feats", text_feats)
155
+
156
+ else:
157
+ if self.zero_inflated:
158
+ assert "pi" in text_prompts and "lambda" in text_prompts, f"Expected text_prompts to have keys 'pi' and 'lambda', got {text_prompts.keys()}."
159
+ pi_text_prompts = text_prompts["pi"]
160
+ lambda_text_prompts = text_prompts["lambda"]
161
+ pi_text_feats = encode_text(model_name, weight_name, pi_text_prompts)
162
+ lambda_text_feats = encode_text(model_name, weight_name, lambda_text_prompts)
163
+ pi_text_feats.requires_grad = False
164
+ lambda_text_feats.requires_grad = False
165
+ self.register_buffer("pi_text_feats", pi_text_feats)
166
+ self.register_buffer("lambda_text_feats", lambda_text_feats)
167
+
168
+ else:
169
+ text_feats = encode_text(model_name, weight_name, text_prompts)
170
+ text_feats.requires_grad = False
171
+ self.register_buffer("text_feats", text_feats)
172
+
173
+ def _build_head(self) -> None:
174
+ in_channels = self.backbone.in_features
175
+ out_channels = self.backbone.out_features
176
+ if self.zero_inflated:
177
+ self.pi_logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07), requires_grad=True)
178
+ self.lambda_logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07), requires_grad=True)
179
+
180
+ self.pi_head = conv1x1(in_channels, out_channels, bias=False)
181
+ self.lambda_head = conv1x1(in_channels, out_channels, bias=False)
182
+
183
+ else:
184
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07), requires_grad=True)
185
+ self.head = conv1x1(in_channels, out_channels, bias=False)
186
+
187
+ def forward(self, image: Tensor):
188
+ image_feats = self.backbone(image)
189
+ # image_feats = F.normalize(image_feats.permute(0, 2, 3, 1), p=2, dim=-1) # shape (B, H, W, C)
190
+
191
+ if self.zero_inflated:
192
+ pi_image_feats, lambda_image_feats = self.pi_head(image_feats), self.lambda_head(image_feats)
193
+ pi_image_feats = F.normalize(pi_image_feats.permute(0, 2, 3, 1), p=2, dim=-1) # shape (B, H, W, C)
194
+ lambda_image_feats = F.normalize(lambda_image_feats.permute(0, 2, 3, 1), p=2, dim=-1) # shape (B, H, W, C)
195
+
196
+ pi_text_feats, lambda_text_feats = self.pi_text_feats, self.lambda_text_feats
197
+ pi_logit_scale, lambda_logit_scale = self.pi_logit_scale.exp(), self.lambda_logit_scale.exp()
198
+
199
+ pi_logit_map = pi_logit_scale * pi_image_feats @ pi_text_feats.t() # (B, H, W, 2), logits per image
200
+ lambda_logit_map = lambda_logit_scale * lambda_image_feats @ lambda_text_feats.t() # (B, H, W, N - 1), logits per image
201
+
202
+ pi_logit_map = pi_logit_map.permute(0, 3, 1, 2) # (B, 2, H, W)
203
+ lambda_logit_map = lambda_logit_map.permute(0, 3, 1, 2) # (B, N - 1, H, W)
204
+
205
+ lambda_map = (lambda_logit_map.softmax(dim=1) * self.bin_centers[:, 1:]).sum(dim=1, keepdim=True) # (B, 1, H, W)
206
+
207
+ # pi_logit_map.softmax(dim=1)[:, 0] is the probability of zeros
208
+ den_map = pi_logit_map.softmax(dim=1)[:, 1:] * lambda_map # (B, 1, H, W)
209
+
210
+ if self.training:
211
+ return pi_logit_map, lambda_logit_map, lambda_map, den_map
212
+ else:
213
+ return den_map
214
+
215
+ else:
216
+ image_feats = self.head(image_feats)
217
+ image_feats = F.normalize(image_feats.permute(0, 2, 3, 1), p=2, dim=-1)
218
+
219
+ text_feats = self.text_feats
220
+ logit_scale = self.logit_scale.exp()
221
+
222
+ logit_map = logit_scale * image_feats @ text_feats.t() # (B, H, W, N), logits per image
223
+ logit_map = logit_map.permute(0, 3, 1, 2) # (B, N, H, W)
224
+
225
+ den_map = (logit_map.softmax(dim=1) * self.bin_centers).sum(dim=1, keepdim=True) # (B, 1, H, W)
226
+
227
+ if self.training:
228
+ return logit_map, den_map
229
+ else:
230
+ return den_map
231
+
232
+
233
+ def _clip_ebc(
234
+ model_name: str,
235
+ weight_name: str,
236
+ block_size: Optional[int] = None,
237
+ bins: Optional[List[Tuple[float, float]]] = None,
238
+ bin_centers: Optional[List[float]] = None,
239
+ zero_inflated: Optional[bool] = True,
240
+ num_vpt: Optional[int] = None,
241
+ vpt_drop: Optional[float] = None,
242
+ input_size: Optional[int] = None,
243
+ adapter: Optional[bool] = False,
244
+ adapter_reduction: Optional[int] = None,
245
+ lora: Optional[bool] = False,
246
+ lora_rank: Optional[int] = None,
247
+ lora_alpha: Optional[float] = None,
248
+ lora_dropout: Optional[float] = None,
249
+ text_prompts: Optional[Dict[str, List[str]]] = None,
250
+ norm: Optional[str] = "none",
251
+ act: Optional[str] = "none",
252
+ ) -> CLIP_EBC:
253
+ return CLIP_EBC(
254
+ model_name=model_name,
255
+ weight_name=weight_name,
256
+ block_size=block_size,
257
+ bins=bins,
258
+ bin_centers=bin_centers,
259
+ zero_inflated=zero_inflated,
260
+ num_vpt=num_vpt,
261
+ vpt_drop=vpt_drop,
262
+ input_size=input_size,
263
+ adapter=adapter,
264
+ adapter_reduction=adapter_reduction,
265
+ lora=lora,
266
+ lora_rank=lora_rank,
267
+ lora_alpha=lora_alpha,
268
+ lora_dropout=lora_dropout,
269
+ text_prompts=text_prompts,
270
+ norm=norm,
271
+ act=act,
272
+ )
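
The forward pass above is the zero-inflated readout: the `pi` branch scores empty vs. non-empty per block, the `lambda` branch predicts the block count under the count component, and the density map is their product. A toy numerical sketch of that readout (made-up logits, not repo code):

```python
# Toy sketch of the zero-inflated readout in CLIP_EBC.forward (random logits).
import torch

bin_centers = torch.tensor([0.0, 1.0, 2.5, 5.0]).view(1, -1, 1, 1)  # N = 4 bins
pi_logit_map = torch.randn(1, 2, 3, 3)      # zero vs. non-zero logits per block
lambda_logit_map = torch.randn(1, 3, 3, 3)  # logits over the N - 1 non-zero bins

# Expected count from the count branch: softmax-weighted average of the non-zero bin centers
lambda_map = (lambda_logit_map.softmax(dim=1) * bin_centers[:, 1:]).sum(dim=1, keepdim=True)
# P(non-zero) * expected count gives the density value per block
den_map = pi_logit_map.softmax(dim=1)[:, 1:] * lambda_map
print(den_map.shape)  # torch.Size([1, 1, 3, 3]); den_map.sum() is the image-level count
```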
models/clip_ebc/resnet.py ADDED
@@ -0,0 +1,236 @@
1
+ from torch import nn, Tensor
2
+ import open_clip
3
+ from peft import get_peft_model, LoraConfig
4
+
5
+ from ..utils import ConvRefine, ConvUpsample, ConvAdapter
6
+ from ..utils import _get_norm_layer, _get_activation
7
+
8
+
9
+ resnet_names_and_weights = {
10
+ "RN50": ["openai", "yfcc15m", "cc12m"],
11
+ "RN101": ["openai", "yfcc15m", "cc12m"],
12
+ "RN50x4": ["openai", "yfcc15m", "cc12m"],
13
+ "RN50x16": ["openai", "yfcc15m", "cc12m"],
14
+ "RN50x64": ["openai", "yfcc15m", "cc12m"],
15
+ }
16
+
17
+ refiner_channels = {
18
+ "RN50": 2048,
19
+ "RN101": 2048,
20
+ "RN50x4": 2560,
21
+ "RN50x16": 3072,
22
+ "RN50x64": 4096,
23
+ }
24
+
25
+ refiner_groups = {
26
+ "RN50": refiner_channels["RN50"] // 512, # 4
27
+ "RN101": refiner_channels["RN101"] // 512, # 4
28
+ "RN50x4": refiner_channels["RN50x4"] // 512, # 5
29
+ "RN50x16": refiner_channels["RN50x16"] // 512, # 6
30
+ "RN50x64": refiner_channels["RN50x64"] // 512, # 8
31
+ }
32
+
33
+
34
+ class ResNet(nn.Module):
35
+ def __init__(
36
+ self,
37
+ model_name: str,
38
+ weight_name: str,
39
+ block_size: int = 16,
40
+ adapter: bool = False,
41
+ adapter_reduction: int = 4,
42
+ norm: str = "none",
43
+ act: str = "none"
44
+ ) -> None:
45
+ super(ResNet, self).__init__()
46
+ assert model_name in resnet_names_and_weights, f"Model name should be one of {list(resnet_names_and_weights.keys())}, but got {model_name}."
47
+ assert weight_name in resnet_names_and_weights[model_name], f"Pretrained should be one of {resnet_names_and_weights[model_name]}, but got {weight_name}."
48
+ assert block_size in [32, 16, 8], f"block_size should be one of [32, 16, 8], got {block_size}"
49
+ self.model_name, self.weight_name = model_name, weight_name
50
+ self.block_size = block_size
51
+
52
+ model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
53
+
54
+ self.adapter = adapter
55
+ if adapter:
56
+ for param in model.parameters():
57
+ param.requires_grad = False
58
+
59
+ # Stem
60
+ self.conv1 = model.conv1
61
+ self.bn1 = model.bn1
62
+ self.act1 = model.act1
63
+ self.conv2 = model.conv2
64
+ self.bn2 = model.bn2
65
+ self.act2 = model.act2
66
+ self.conv3 = model.conv3
67
+ self.bn3 = model.bn3
68
+ self.act3 = model.act3
69
+ self.avgpool = model.avgpool
70
+ # Stem: reduction = 4
71
+
72
+ # Layers
73
+ for idx in range(1, 5):
74
+ setattr(self, f"layer{idx}", getattr(model, f"layer{idx}"))
75
+ if adapter:
76
+ setattr(self, f"adapter{idx}", ConvAdapter(
77
+ in_channels=getattr(model, f"layer{idx}")[-1].conv3.out_channels,
78
+ bottleneck_channels=getattr(model, f"layer{idx}")[-1].conv3.out_channels // adapter_reduction,
79
+ ) if idx < 4 else nn.Identity()) # No adapter for the last layer
80
+
81
+ self.in_features = model.attnpool.c_proj.weight.shape[1]
82
+ self.out_features = model.attnpool.c_proj.weight.shape[0]
83
+
84
+ if norm == "bn":
85
+ norm_layer = nn.BatchNorm2d
86
+ elif norm == "ln":
87
+ norm_layer = nn.LayerNorm
88
+ else:
89
+ norm_layer = _get_norm_layer(model)
90
+
91
+ if act == "relu":
92
+ activation = nn.ReLU(inplace=True)
93
+ elif act == "gelu":
94
+ activation = nn.GELU()
95
+ else:
96
+ activation = _get_activation(model)
97
+
98
+ if block_size == 32:
99
+ self.refiner = ConvRefine(
100
+ in_channels=self.in_features,
101
+ out_channels=self.in_features,
102
+ norm_layer=norm_layer,
103
+ activation=activation,
104
+ groups=refiner_groups[self.model_name],
105
+ )
106
+ elif block_size == 16:
107
+ self.refiner = ConvUpsample(
108
+ in_channels=self.in_features,
109
+ out_channels=self.in_features,
110
+ norm_layer=norm_layer,
111
+ activation=activation,
112
+ groups=refiner_groups[self.model_name],
113
+ )
114
+ else: # block_size == 8
115
+ self.refiner = nn.Sequential(
116
+ ConvUpsample(
117
+ in_channels=self.in_features,
118
+ out_channels=self.in_features,
119
+ norm_layer=norm_layer,
120
+ activation=activation,
121
+ groups=refiner_groups[self.model_name],
122
+ ),
123
+ ConvUpsample(
124
+ in_channels=self.in_features,
125
+ out_channels=self.in_features,
126
+ norm_layer=norm_layer,
127
+ activation=activation,
128
+ groups=refiner_groups[self.model_name],
129
+ ),
130
+ )
131
+
132
+ def train(self, mode: bool = True):
133
+ if self.adapter and mode:
134
+ # training:
135
+ self.conv1.eval()
136
+ self.bn1.eval()
137
+ self.act1.eval()
138
+ self.conv2.eval()
139
+ self.bn2.eval()
140
+ self.act2.eval()
141
+ self.conv3.eval()
142
+ self.bn3.eval()
143
+ self.act3.eval()
144
+ self.avgpool.eval()
145
+
146
+ for idx in range(1, 5):
147
+ getattr(self, f"layer{idx}").eval()
148
+ getattr(self, f"adapter{idx}").train()
149
+
150
+ self.refiner.train()
151
+
152
+ else:
153
+ # evaluation:
154
+ for module in self.children():
155
+ module.train(mode)
156
+
157
+ def stem(self, x: Tensor) -> Tensor:
158
+ x = self.act1(self.bn1(self.conv1(x)))
159
+ x = self.act2(self.bn2(self.conv2(x)))
160
+ x = self.act3(self.bn3(self.conv3(x)))
161
+ x = self.avgpool(x)
162
+ return x
163
+
164
+ def forward(self, x: Tensor) -> Tensor:
165
+ x = self.stem(x)
166
+
167
+ x = self.layer1(x)
168
+ if self.adapter:
169
+ x = self.adapter1(x)
170
+
171
+ x = self.layer2(x)
172
+ if self.adapter:
173
+ x = self.adapter2(x)
174
+
175
+ x = self.layer3(x)
176
+ if self.adapter:
177
+ x = self.adapter3(x)
178
+
179
+ x = self.layer4(x)
180
+ if self.adapter:
181
+ x = self.adapter4(x)
182
+
183
+ x = self.refiner(x)
184
+ return x
185
+
186
+
187
+ def _resnet(
188
+ model_name: str,
189
+ weight_name: str,
190
+ block_size: int = 16,
191
+ adapter: bool = False,
192
+ adapter_reduction: int = 4,
193
+ lora: bool = False,
194
+ lora_rank: int = 16,
195
+ lora_alpha: float = 32.0,
196
+ lora_dropout: float = 0.1,
197
+ norm: str = "none",
198
+ act: str = "none"
199
+ ) -> ResNet:
200
+ assert not (lora and adapter), "Lora and adapter cannot be used together."
201
+ model = ResNet(
202
+ model_name=model_name,
203
+ weight_name=weight_name,
204
+ block_size=block_size,
205
+ adapter=adapter,
206
+ adapter_reduction=adapter_reduction,
207
+ norm=norm,
208
+ act=act
209
+ )
210
+
211
+ if lora:
212
+ target_modules = []
213
+ for name, module in model.named_modules():
214
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
215
+ target_modules.append(name)
216
+
217
+ lora_config = LoraConfig(
218
+ r=lora_rank,
219
+ lora_alpha=lora_alpha,
220
+ lora_dropout=lora_dropout,
221
+ bias="none",
222
+ target_modules=target_modules,
223
+ )
224
+ model = get_peft_model(model, lora_config)
225
+
226
+ # Unfreeze BN layers
227
+ for name, module in model.named_modules():
228
+ if isinstance(module, nn.BatchNorm2d) and "refiner" not in name:
229
+ module.requires_grad_(True)
230
+
231
+ # Unfreeze refiner
232
+ for name, module in model.named_modules():
233
+ if "refiner" in name:
234
+ module.requires_grad_(True)
235
+
236
+ return model
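
Usage mirrors the ConvNeXt backbone: the refiner upsamples the stride-32 CLIP-ResNet feature map to the requested block size. A sketch assuming the OpenAI `RN50` weights are available.

```python
# Sketch only: RN50 backbone with block_size 8 -> two ConvUpsample stages.
import torch
from models.clip_ebc.resnet import _resnet

backbone = _resnet("RN50", "openai", block_size=8)
backbone.eval()
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))
print(feats.shape)  # (1, backbone.in_features, 224 // 8, 224 // 8)
```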
models/clip_ebc/utils.py ADDED
@@ -0,0 +1,137 @@
1
+ import torch
2
+ from torch import Tensor, nn
3
+ import torch.nn.functional as F
4
+ import open_clip
5
+ from tqdm import tqdm
6
+ import numpy as np
7
+ from typing import Union, Tuple, List
8
+
9
+
10
+ num_to_word = {
11
+ "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four", "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine",
12
+ "10": "ten", "11": "eleven", "12": "twelve", "13": "thirteen", "14": "fourteen", "15": "fifteen", "16": "sixteen", "17": "seventeen", "18": "eighteen", "19": "nineteen",
13
+ "20": "twenty", "21": "twenty-one", "22": "twenty-two", "23": "twenty-three", "24": "twenty-four", "25": "twenty-five", "26": "twenty-six", "27": "twenty-seven", "28": "twenty-eight", "29": "twenty-nine",
14
+ "30": "thirty", "31": "thirty-one", "32": "thirty-two", "33": "thirty-three", "34": "thirty-four", "35": "thirty-five", "36": "thirty-six", "37": "thirty-seven", "38": "thirty-eight", "39": "thirty-nine",
15
+ "40": "forty", "41": "forty-one", "42": "forty-two", "43": "forty-three", "44": "forty-four", "45": "forty-five", "46": "forty-six", "47": "forty-seven", "48": "forty-eight", "49": "forty-nine",
16
+ "50": "fifty", "51": "fifty-one", "52": "fifty-two", "53": "fifty-three", "54": "fifty-four", "55": "fifty-five", "56": "fifty-six", "57": "fifty-seven", "58": "fifty-eight", "59": "fifty-nine",
17
+ "60": "sixty", "61": "sixty-one", "62": "sixty-two", "63": "sixty-three", "64": "sixty-four", "65": "sixty-five", "66": "sixty-six", "67": "sixty-seven", "68": "sixty-eight", "69": "sixty-nine",
18
+ "70": "seventy", "71": "seventy-one", "72": "seventy-two", "73": "seventy-three", "74": "seventy-four", "75": "seventy-five", "76": "seventy-six", "77": "seventy-seven", "78": "seventy-eight", "79": "seventy-nine",
19
+ "80": "eighty", "81": "eighty-one", "82": "eighty-two", "83": "eighty-three", "84": "eighty-four", "85": "eighty-five", "86": "eighty-six", "87": "eighty-seven", "88": "eighty-eight", "89": "eighty-nine",
20
+ "90": "ninety", "91": "ninety-one", "92": "ninety-two", "93": "ninety-three", "94": "ninety-four", "95": "ninety-five", "96": "ninety-six", "97": "ninety-seven", "98": "ninety-eight", "99": "ninety-nine",
21
+ "100": "one hundred"
22
+ }
23
+
24
+ prefixes = [
25
+ "",
26
+ "A photo of", "A block of", "An image of", "A picture of",
27
+ "There are",
28
+ "The image contains", "The photo contains", "The picture contains",
29
+ "The image shows", "The photo shows", "The picture shows",
30
+ ]
31
+ arabic_numeral = [True, False]
32
+ compares = [
33
+ "more than", "greater than", "higher than", "larger than", "bigger than", "greater than or equal to",
34
+ "at least", "no less than", "not less than", "not fewer than", "not lower than", "not smaller than", "not less than or equal to",
35
+ "over", "above", "beyond", "exceeding", "surpassing",
36
+ ]
37
+ suffixes = [
38
+ "people", "persons", "individuals", "humans", "faces", "heads", "figures", "",
39
+ ]
40
+
41
+
42
+ def num2word(num: Union[int, str]) -> str:
43
+ """
44
+ Convert the input number to the corresponding English word. For example, 1 -> "one", 2 -> "two", etc.
45
+ """
46
+ num = str(int(num))
47
+ return num_to_word.get(num, num)
48
+
49
+
50
+ def format_count(
51
+ bins: List[Union[float, Tuple[float, float]]],
52
+ ) -> List[List[str]]:
53
+ text_prompts = []
54
+ for prefix in prefixes:
55
+ for numeral in arabic_numeral:
56
+ for compare in compares:
57
+ for suffix in suffixes:
58
+ prompts = []
59
+ for bin in bins:
60
+ if isinstance(bin, (int, float)): # count is a single number
61
+ count = int(bin)
62
+ if count == 0 or count == 1:
63
+ count = num2word(count) if not numeral else count
64
+ prefix_ = "There is" if prefix == "There are" else prefix
65
+ suffix_ = "person" if suffix == "people" else suffix[:-1]
66
+ prompt = f"{prefix_} {count} {suffix_}"
67
+ else: # count > 1
68
+ count = num2word(count) if not numeral else count
69
+ prompt = f"{prefix} {count} {suffix}"
70
+
71
+ elif bin[1] == float("inf"): # count is (lower_bound, inf)
72
+ count = int(bin[0])
73
+ count = num2word(count) if not numeral else count
74
+ prompt = f"{prefix} {compare} {count} {suffix}"
75
+
76
+ else: # bin is (lower_bound, upper_bound)
77
+ left, right = int(bin[0]), int(bin[1])
78
+ left, right = num2word(left) if not numeral else left, num2word(right) if not numeral else right
79
+ prompt = f"{prefix} between {left} and {right} {suffix}"
80
+
81
+ # Remove starting and trailing whitespaces
82
+ prompt = prompt.strip() + "."
83
+
84
+ prompts.append(prompt)
85
+
86
+ text_prompts.append(prompts)
87
+
88
+ return text_prompts
89
+
90
+
91
+ def encode_text(
92
+ model_name: str,
93
+ weight_name: str,
94
+ text: List[str]
95
+ ) -> Tensor:
96
+ if torch.cuda.is_available():
97
+ device = torch.device("cuda")
98
+ elif torch.backends.mps.is_available():
99
+ device = torch.device("mps")
100
+ else:
101
+ device = torch.device("cpu")
102
+ text = open_clip.get_tokenizer(model_name)(text).to(device)
103
+ model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).to(device)
104
+ model.eval()
105
+ with torch.no_grad():
106
+ text_feats = model.encode_text(text)
107
+ text_feats = F.normalize(text_feats, p=2, dim=-1).detach().cpu()
108
+ return text_feats
109
+
110
+
111
+ def optimize_text_prompts(
112
+ model_name: str,
113
+ weight_name: str,
114
+ flat_bins: List[Union[float, Tuple[float, float]]],
115
+ batch_size: int = 1024,
116
+ ) -> List[str]:
117
+ text_prompts = format_count(flat_bins)
118
+
119
+ # Find the template that has the smallest average similarity of bin prompts.
120
+ print("Finding the best setup for text prompts...")
121
+ text_prompts_ = [prompt for prompts in text_prompts for prompt in prompts] # flatten the list
122
+ text_feats = []
123
+ for i in tqdm(range(0, len(text_prompts_), batch_size)):
124
+ text_feats.append(encode_text(model_name, weight_name, text_prompts_[i: min(i + batch_size, len(text_prompts_))]))
125
+ text_feats = torch.cat(text_feats, dim=0)
126
+
127
+ sims = []
128
+ for idx, prompts in enumerate(text_prompts):
129
+ text_feats_ = text_feats[idx * len(prompts): (idx + 1) * len(prompts)]
130
+ sim = torch.mm(text_feats_, text_feats_.T)
131
+ sim = sim[~torch.eye(sim.shape[0], dtype=bool)].mean().item()
132
+ sims.append(sim)
133
+
134
+ optimal_prompts = text_prompts[np.argmin(sims)]
135
+ sim = sims[np.argmin(sims)]
136
+ print(f"Found the best text prompts: {optimal_prompts} (similarity: {sim:.2f})")
137
+ return optimal_prompts
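
`format_count` enumerates every prefix × numeral × comparator × suffix combination, and `optimize_text_prompts` keeps the single template whose per-bin prompts are least similar to each other in CLIP text space. A small illustration with example bins (not taken from the repo's configs):

```python
# Illustration only: candidate prompt templates for the bins [0, 1, 2, 3, (4, inf)].
from models.clip_ebc.utils import format_count

templates = format_count([0, 1, 2, 3, (4, float("inf"))])
# One candidate template reads roughly:
#   ["There is zero person.", "There is one person.", "There are two people.",
#    "There are three people.", "There are more than four people."]
# optimize_text_prompts() then encodes every candidate and keeps the template with
# the lowest mean pairwise cosine similarity between its bin prompts.
print(len(templates))  # len(prefixes) * 2 * len(compares) * len(suffixes) candidates
```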
models/clip_ebc/vit.py ADDED
@@ -0,0 +1,372 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ import math
4
+ from einops import rearrange
5
+ import open_clip
6
+ from peft import get_peft_model, LoraConfig
7
+ from typing import Optional, Tuple
8
+
9
+ from ..utils import interpolate_pos_embed, ViTAdapter
10
+ # from ..utils import TransformerRefine, TransformerDownsample, TransformerUpsample
11
+ from ..utils import ConvRefine, ConvDownsample, ConvUpsample
12
+ from ..utils import _get_norm_layer, _get_activation
13
+
14
+
15
+ vit_names_and_weights = {
16
+ "ViT-B-32": [
17
+ "openai",
18
+ "laion400m_e31", "laion400m_e32", "laion2b_e16", "laion2b_s34b_b79k",
19
+ "datacomp_xl_s13b_b90k", "datacomp_m_s128m_b4k", "datacomp_s_s13m_b4k",
20
+ "commonpool_m_clip_s128m_b4k", "commonpool_m_laion_s128m_b4k", "commonpool_m_image_s128m_b4k", "commonpool_m_text_s128m_b4k", "commonpool_m_basic_s128m_b4k", "commonpool_m_s128m_b4k",
21
+ "commonpool_s_clip_s13m_b4k", "commonpool_s_laion_s13m_b4k", "commonpool_s_image_s13m_b4k", "commonpool_s_text_s13m_b4k", "commonpool_s_basic_s13m_b4k", "commonpool_s_s13m_b4k",
22
+ ],
23
+ "ViT-B-32-256": ["datacomp_s34b_b86k"],
24
+ "ViT-B-16": [
25
+ "openai",
26
+ "laion400m_e31", "laion400m_e32", "laion2b_s34b_b88k",
27
+ "datacomp_xl_s13b_b90k", "datacomp_l_s1b_b8k",
28
+ "commonpool_l_clip_s1b_b8k", "commonpool_l_laion_s1b_b8k", "commonpool_l_image_s1b_b8k", "commonpool_l_text_s1b_b8k", "commonpool_l_basic_s1b_b8k", "commonpool_l_s1b_b8k",
29
+ "dfn2b"
30
+ ],
31
+ "ViT-L-14": [
32
+ "openai",
33
+ "laion400m_e31", "laion400m_e32", "laion2b_s32b_b82k",
34
+ "datacomp_xl_s13b_b90k",
35
+ "commonpool_xl_clip_s13b_b90k", "commonpool_xl_laion_s13b_b90k", "commonpool_xl_s13b_b90k"
36
+ ],
37
+ "ViT-L-14-336": ["openai"],
38
+ "ViT-H-14": ["laion2b_s32b_b79k"],
39
+ "ViT-g-14": ["laion2b_s12b_b42k", "laion2b_s34b_b88k"],
40
+ "ViT-bigG-14": ["laion2b_s39b_b160k"],
41
+ }
42
+
43
+
44
+ refiner_channels = {
45
+ "ViT-B-32": 768,
46
+ "ViT-B-32-256": 768,
47
+ "ViT-B-16": 768,
48
+ "ViT-L-14": 1024,
49
+ "ViT-L-14-336": 1024,
50
+ "ViT-H-14": 1280,
51
+ "ViT-g-14": 1408,
52
+ "ViT-bigG-14": 1664,
53
+ }
54
+
55
+ refiner_groups = {
56
+ "ViT-B-32": 1,
57
+ "ViT-B-32-256": 1,
58
+ "ViT-B-16": 1,
59
+ "ViT-L-14": 1,
60
+ "ViT-L-14-336": 1,
61
+ "ViT-H-14": 1,
62
+ "ViT-g-14": refiner_channels["ViT-g-14"] // 704, # 2
63
+ "ViT-bigG-14": refiner_channels["ViT-bigG-14"] // 416, # 4
64
+ }
65
+
66
+
67
+
68
+ class ViT(nn.Module):
69
+ def __init__(
70
+ self,
71
+ model_name: str,
72
+ weight_name: str,
73
+ block_size: int = 16,
74
+ num_vpt: int = 32,
75
+ vpt_drop: float = 0.0,
76
+ adapter: bool = False,
77
+ adapter_reduction: int = 4,
78
+ input_size: Optional[Tuple[int, int]] = None,
79
+ norm: str = "none",
80
+ act: str = "none"
81
+ ) -> None:
82
+ super(ViT, self).__init__()
83
+ assert model_name in vit_names_and_weights, f"Model name should be one of {list(vit_names_and_weights.keys())}, but got {model_name}."
84
+ assert weight_name in vit_names_and_weights[model_name], f"Pretrained should be one of {vit_names_and_weights[model_name]}, but got {weight_name}."
85
+ if adapter:
86
+ assert num_vpt is None or num_vpt == 0, "num_vpt should be None or 0 when using adapter."
87
+ assert vpt_drop is None or vpt_drop == 0.0, "vpt_drop should be None or 0.0 when using adapter."
88
+ else:
89
+ assert num_vpt > 0, f"Number of VPT tokens should be greater than 0, but got {num_vpt}."
90
+ assert 0.0 <= vpt_drop < 1.0, f"VPT dropout should be in [0.0, 1.0), but got {vpt_drop}."
91
+
92
+ self.model_name, self.weight_name = model_name, weight_name
93
+ self.block_size = block_size
94
+ self.num_vpt = num_vpt
95
+ self.vpt_drop = vpt_drop
96
+ self.adapter = adapter
97
+
98
+ model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
99
+
100
+ # Always freeze the parameters of the model
101
+ for param in model.parameters():
102
+ param.requires_grad = False
103
+
104
+ # Setup the model
105
+ self.input_size = input_size if input_size is not None else model.image_size
106
+ self.pretrain_size = model.image_size
107
+ self.patch_size = model.patch_size
108
+ self.class_embedding = model.class_embedding
109
+ self.positional_embedding = model.positional_embedding
110
+ self.embed_dim = model.class_embedding.shape[-1]
111
+
112
+ self.conv1 = model.conv1
113
+ self.ln_pre = model.ln_pre
114
+ self.resblocks = model.transformer.resblocks
115
+ self.num_layers = len(self.resblocks)
116
+ self.ln_post = model.ln_post
117
+
118
+ # Setup VPT tokens
119
+ val = math.sqrt(6. / float(3 * self.patch_size[0] + self.embed_dim))
120
+ for idx in range(self.num_layers):
121
+ if self.adapter:
122
+ setattr(self, f"adapter{idx}", ViTAdapter(
123
+ in_channels=self.embed_dim,
124
+ bottleneck_channels=self.embed_dim // adapter_reduction,
125
+ ))
126
+ else:
127
+ setattr(self, f"vpt_{idx}", nn.Parameter(torch.empty(self.num_vpt, self.embed_dim)))
128
+ nn.init.uniform_(getattr(self, f"vpt_{idx}"), -val, val)
129
+ setattr(self, f"vpt_drop_{idx}", nn.Dropout(self.vpt_drop))
130
+
131
+ # Adjust the positional embedding to match the new input size
132
+ self._adjust_pos_embed()
133
+
134
+ in_features, out_features = model.proj.shape
135
+ self.in_features = in_features
136
+ self.out_features = out_features
137
+
138
+ patch_size = self.patch_size[0]
139
+ if patch_size in [16, 32]:
140
+ assert block_size in [8, 16, 32], f"Patch size is 32, but got block size {block_size}."
141
+ else: # patch_size == 14
142
+ assert block_size in [7, 14, 28], f"Patch size is 14, but got block size {block_size}."
143
+
144
+ if norm == "bn":
145
+ norm_layer = nn.BatchNorm2d
146
+ elif norm == "ln":
147
+ norm_layer = nn.LayerNorm
148
+ else:
149
+ norm_layer = _get_norm_layer(model)
150
+
151
+ if act == "relu":
152
+ activation = nn.ReLU(inplace=True)
153
+ elif act == "gelu":
154
+ activation = nn.GELU()
155
+ else:
156
+ activation = _get_activation(model)
157
+
158
+ if block_size == patch_size:
159
+ self.refiner = ConvRefine(
160
+ in_channels=self.in_features,
161
+ out_channels=self.in_features,
162
+ norm_layer=norm_layer,
163
+ activation=activation,
164
+ groups=refiner_groups[self.model_name],
165
+ )
166
+
167
+ elif block_size < patch_size: # upsample
168
+ if block_size == 8 and patch_size == 32:
169
+ self.refiner = nn.Sequential(
170
+ ConvUpsample(
171
+ in_channels=self.in_features,
172
+ out_channels=self.in_features,
173
+ norm_layer=norm_layer,
174
+ activation=activation,
175
+ groups=refiner_groups[self.model_name],
176
+ ),
177
+ ConvUpsample(
178
+ in_channels=self.in_features,
179
+ out_channels=self.in_features,
180
+ norm_layer=norm_layer,
181
+ activation=activation,
182
+ groups=refiner_groups[self.model_name],
183
+ ),
184
+ )
185
+ else:
186
+ self.refiner = ConvUpsample(
187
+ in_channels=self.in_features,
188
+ out_channels=self.in_features,
189
+ norm_layer=norm_layer,
190
+ activation=activation,
191
+ groups=refiner_groups[self.model_name],
192
+ )
193
+
194
+ else: # downsample
195
+ assert block_size // patch_size == 2, f"Block size {block_size} should be 2 times the patch size {patch_size}."
196
+ self.refiner = ConvDownsample(
197
+ in_channels=self.in_features,
198
+ out_channels=self.in_features,
199
+ norm_layer=norm_layer,
200
+ activation=activation,
201
+ groups=refiner_groups[self.model_name],
202
+ )
203
+
204
+ def _adjust_pos_embed(self) -> Tensor:
205
+ """
206
+ Adjust the positional embedding to match the spatial resolution of the feature map.
207
+
208
+ Args:
209
+ orig_h, orig_w: The original spatial resolution of the image.
210
+ new_h, new_w: The new spatial resolution of the image.
211
+ """
212
+ self.positional_embedding = nn.Parameter(self._interpolate_pos_embed(self.pretrain_size[0], self.pretrain_size[1], self.input_size[0], self.input_size[1]), requires_grad=False)
213
+
214
+ def _interpolate_pos_embed(self, orig_h: int, orig_w: int, new_h: int, new_w: int) -> Tensor:
215
+ """
216
+ Interpolate the positional embedding to match the spatial resolution of the feature map.
217
+
218
+ Args:
219
+ orig_h, orig_w: The original spatial resolution of the image.
220
+ new_h, new_w: The new spatial resolution of the image.
221
+ """
222
+ if (orig_h, orig_w) == (new_h, new_w):
223
+ return self.positional_embedding
224
+
225
+ orig_h_patches, orig_w_patches = orig_h // self.patch_size[0], orig_w // self.patch_size[1]
226
+ new_h_patches, new_w_patches = new_h // self.patch_size[0], new_w // self.patch_size[1]
227
+ class_pos_embed, patch_pos_embed = self.positional_embedding[:1, :], self.positional_embedding[1:, :]
228
+ patch_pos_embed = rearrange(patch_pos_embed, "(h w) d -> d h w", h=orig_h_patches, w=orig_w_patches)
229
+ patch_pos_embed = interpolate_pos_embed(patch_pos_embed, size=(new_h_patches, new_w_patches))
230
+ patch_pos_embed = rearrange(patch_pos_embed, "d h w -> (h w) d")
231
+ pos_embed = torch.cat((class_pos_embed, patch_pos_embed), dim=0)
232
+ return pos_embed
233
+
234
+ def train(self, mode: bool = True):
235
+ if mode:
236
+ # training:
237
+ self.conv1.eval()
238
+ self.ln_pre.eval()
239
+ self.resblocks.eval()
240
+ self.ln_post.eval()
241
+
242
+ for idx in range(self.num_layers):
243
+ if self.adapter:
+ getattr(self, f"adapter{idx}").train()
+ else:
+ getattr(self, f"vpt_drop_{idx}").train()
244
+
245
+ self.refiner.train()
246
+
247
+ else:
248
+ # evaluation:
249
+ for module in self.children():
250
+ module.train(mode)
251
+
252
+ def _prepare_vpt(self, layer: int, batch_size: int, device: torch.device) -> Tensor:
253
+ vpt = getattr(self, f"vpt_{layer}").unsqueeze(0).expand(batch_size, -1, -1).to(device) # (batch_size, num_vpt, embed_dim)
254
+ vpt = getattr(self, f"vpt_drop_{layer}")(vpt)
255
+
256
+ return vpt
257
+
258
+ def _forward_patch_embed(self, x: Tensor) -> Tensor:
259
+ # This step performs 1) embed x into patches; 2) append the class token; 3) add positional embeddings.
260
+ assert len(x.shape) == 4, f"Expected input to have shape (batch_size, 3, height, width), but got {x.shape}"
261
+ batch_size, _, height, width = x.shape
262
+
263
+ # Step 1: Embed x into patches
264
+ x = self.conv1(x)
265
+
266
+ # Step 2: Append the class token
267
+ class_embedding = self.class_embedding.expand(batch_size, 1, -1)
268
+ x = rearrange(x, "b d h w -> b (h w) d")
269
+ x = torch.cat([class_embedding, x], dim=1)
270
+
271
+ # Step 3: Add positional embeddings
272
+ pos_embed = self._interpolate_pos_embed(orig_h=self.input_size[0], orig_w=self.input_size[1], new_h=height, new_w=width).expand(batch_size, -1, -1)
273
+ x = x + pos_embed
274
+
275
+ x = self.ln_pre(x)
276
+ return x
277
+
278
+ def _forward_vpt(self, x: Tensor, idx: int) -> Tensor:
279
+ batch_size = x.shape[0]
280
+ device = x.device
281
+
282
+ # Assemble
283
+ vpt = self._prepare_vpt(idx, batch_size, device)
284
+ x = torch.cat([
285
+ x[:, :1, :], # class token
286
+ vpt,
287
+ x[:, 1:, :] # patches
288
+ ], dim=1)
289
+
290
+ # Forward
291
+ x = self.resblocks[idx](x)
292
+
293
+ # Disassemble
294
+ x = torch.cat([
295
+ x[:, :1, :], # class token
296
+ x[:, 1 + self.num_vpt:, :] # patches
297
+ ], dim=1)
298
+
299
+ return x
300
+
301
+ def _forward_adapter(self, x: Tensor, idx: int) -> Tensor:
302
+ return getattr(self, f"adapter{idx}")(x)
303
+
304
+ def forward_encoder(self, x: Tensor) -> Tensor:
305
+ x = self._forward_patch_embed(x)
306
+ for idx in range(self.num_layers):
307
+ x = self._forward_adapter(x, idx) if self.adapter else self._forward_vpt(x, idx)
308
+ x = self.ln_post(x)
309
+ return x
310
+
311
+ def forward(self, x: Tensor) -> Tensor:
312
+ orig_h, orig_w = x.shape[-2:]
313
+ num_patches_h, num_patches_w = orig_h // self.patch_size[0], orig_w // self.patch_size[1]
314
+ x = self.forward_encoder(x)
315
+ x = x[:, 1:, :] # remove the class token
316
+ x = rearrange(x, "b (h w) d -> b d h w", h=num_patches_h, w=num_patches_w)
317
+
318
+ x = self.refiner(x)
319
+ return x
320
+
321
+
322
+ def _vit(
323
+ model_name: str,
324
+ weight_name: str,
325
+ block_size: int = 16,
326
+ num_vpt: int = 32,
327
+ vpt_drop: float = 0.1,
328
+ adapter: bool = False,
329
+ adapter_reduction: int = 4,
330
+ lora: bool = False,
331
+ lora_rank: int = 16,
332
+ lora_alpha: float = 32.0,
333
+ lora_dropout: float = 0.1,
334
+ input_size: Optional[Tuple[int, int]] = None,
335
+ norm: str = "none",
336
+ act: str = "none"
337
+ ) -> ViT:
338
+ assert not (lora and adapter), "LoRA and adapter cannot be used together."
339
+ model = ViT(
340
+ model_name=model_name,
341
+ weight_name=weight_name,
342
+ block_size=block_size,
343
+ num_vpt=num_vpt,
344
+ vpt_drop=vpt_drop,
345
+ adapter=adapter,
346
+ adapter_reduction=adapter_reduction,
347
+ input_size=input_size,
348
+ norm=norm,
349
+ act=act
350
+ )
351
+
352
+ if lora:
353
+ target_modules = []
354
+ for name, module in model.named_modules():
355
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.MultiheadAttention)) and "refiner" not in name:
356
+ target_modules.append(name)
357
+
358
+ lora_config = LoraConfig(
359
+ r=lora_rank,
360
+ lora_alpha=lora_alpha,
361
+ lora_dropout=lora_dropout,
362
+ bias="none",
363
+ target_modules=target_modules,
364
+ )
365
+ model = get_peft_model(model, lora_config)
366
+
367
+ # Unfreeze refiner
368
+ for name, module in model.named_modules():
369
+ if "refiner" in name:
370
+ module.requires_grad_(True)
371
+
372
+ return model
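
A usage sketch of the VPT-tuned ViT backbone (assuming the OpenAI `ViT-B-16` weights can be fetched): the transformer stays frozen, only the per-layer prompt tokens and the refiner receive gradients, and positional embeddings are interpolated whenever the input resolution differs from the pretraining one.

```python
# Sketch only: frozen CLIP ViT-B/16 with 32 visual prompt tokens per layer.
import torch
from models.clip_ebc.vit import _vit

backbone = _vit("ViT-B-16", "openai", block_size=16, num_vpt=32, vpt_drop=0.1,
                input_size=(224, 224))
backbone.eval()
trainable = [n for n, p in backbone.named_parameters() if p.requires_grad]
print(all("vpt_" in n or "refiner" in n for n in trainable))  # True
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))
print(feats.shape)  # (1, backbone.in_features, 14, 14) for 16x16 patches
```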
models/ebc/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .model import EBC, _ebc
+
+ __all__ = ["EBC", "_ebc"]