Upload folder using huggingface_hub
Browse files- mdiff4str_base/best.pth +3 -0
- mdiff4str_base/config.yml +206 -0
- mdiff4str_base/train.log +0 -0
- mdiff4str_base_ch/best.pth +3 -0
- mdiff4str_base_ch/config.yml +200 -0
- mdiff4str_base_ch/train.log +0 -0
- mdiff4str_base_syn/best.pth +3 -0
- mdiff4str_base_syn/config.yml +207 -0
- mdiff4str_base_syn/train.log +0 -0
- mdiff4str_small/best.pth +3 -0
- mdiff4str_small/config.yml +200 -0
- mdiff4str_small/train.log +0 -0
- mdiff4str_small_ch/best.pth +3 -0
- mdiff4str_small_ch/config.yml +200 -0
- mdiff4str_small_ch/train.log +0 -0
- readme.md +135 -0
mdiff4str_base/best.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b23ecc56cf750a7661697e3d36c826ae35f25eb3457b43ff7880058b60b83cb
|
| 3 |
+
size 135534174
|
mdiff4str_base/config.yml
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 40
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_llada_test_lr00005_fs_reflect_onedecoder_semiar_withoutmask_sample3/
|
| 7 |
+
save_epoch_step:
|
| 8 |
+
- 10
|
| 9 |
+
- 1
|
| 10 |
+
eval_batch_step:
|
| 11 |
+
- 0
|
| 12 |
+
- 500
|
| 13 |
+
eval_epoch_step:
|
| 14 |
+
- 0
|
| 15 |
+
- 1
|
| 16 |
+
cal_metric_during_train: false
|
| 17 |
+
pretrained_model: null
|
| 18 |
+
checkpoints: null
|
| 19 |
+
use_tensorboard: false
|
| 20 |
+
infer_img: ./img_44_1.jpg
|
| 21 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 22 |
+
max_text_length: 25
|
| 23 |
+
use_space_char: false
|
| 24 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_nrtr.txt
|
| 25 |
+
use_amp: true
|
| 26 |
+
grad_clip_val: 20.0
|
| 27 |
+
distributed: true
|
| 28 |
+
Optimizer:
|
| 29 |
+
name: AdamW
|
| 30 |
+
lr: 0.0005
|
| 31 |
+
weight_decay: 0.05
|
| 32 |
+
filter_bias_and_bn: true
|
| 33 |
+
LRScheduler:
|
| 34 |
+
name: OneCycleLR
|
| 35 |
+
warmup_epoch: 1.5
|
| 36 |
+
cycle_momentum: false
|
| 37 |
+
Architecture:
|
| 38 |
+
model_type: rec
|
| 39 |
+
algorithm: NRTR
|
| 40 |
+
in_channels: 3
|
| 41 |
+
Transform: null
|
| 42 |
+
Encoder:
|
| 43 |
+
name: SVTRv2LNConvTwo33
|
| 44 |
+
use_pos_embed: false
|
| 45 |
+
dims:
|
| 46 |
+
- 128
|
| 47 |
+
- 256
|
| 48 |
+
- 384
|
| 49 |
+
depths:
|
| 50 |
+
- 6
|
| 51 |
+
- 6
|
| 52 |
+
- 6
|
| 53 |
+
num_heads:
|
| 54 |
+
- 4
|
| 55 |
+
- 8
|
| 56 |
+
- 12
|
| 57 |
+
mixer:
|
| 58 |
+
- - Conv
|
| 59 |
+
- Conv
|
| 60 |
+
- Conv
|
| 61 |
+
- Conv
|
| 62 |
+
- Conv
|
| 63 |
+
- Conv
|
| 64 |
+
- - Conv
|
| 65 |
+
- Conv
|
| 66 |
+
- FGlobal
|
| 67 |
+
- Global
|
| 68 |
+
- Global
|
| 69 |
+
- Global
|
| 70 |
+
- - Global
|
| 71 |
+
- Global
|
| 72 |
+
- Global
|
| 73 |
+
- Global
|
| 74 |
+
- Global
|
| 75 |
+
- Global
|
| 76 |
+
local_k:
|
| 77 |
+
- - 5
|
| 78 |
+
- 5
|
| 79 |
+
- - 5
|
| 80 |
+
- 5
|
| 81 |
+
- - -1
|
| 82 |
+
- -1
|
| 83 |
+
sub_k:
|
| 84 |
+
- - 1
|
| 85 |
+
- 1
|
| 86 |
+
- - 2
|
| 87 |
+
- 1
|
| 88 |
+
- - -1
|
| 89 |
+
- -1
|
| 90 |
+
last_stage: false
|
| 91 |
+
feat2d: false
|
| 92 |
+
Decoder:
|
| 93 |
+
name: MDiffDecoder
|
| 94 |
+
num_decoder_layers: 6
|
| 95 |
+
nhead: 6
|
| 96 |
+
max_len: 25
|
| 97 |
+
parallel_decoding: false
|
| 98 |
+
autoregressive_decoding: false
|
| 99 |
+
low_confidence_decoding: false
|
| 100 |
+
random_mask_decoding: false
|
| 101 |
+
semi_autoregressive_decoding: true
|
| 102 |
+
cloze_mask_decoding: false
|
| 103 |
+
sampler_step: 3
|
| 104 |
+
sample_k: 3
|
| 105 |
+
Loss:
|
| 106 |
+
name: NoneLoss
|
| 107 |
+
PostProcess:
|
| 108 |
+
name: ARLabelDecode
|
| 109 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 110 |
+
use_space_char: false
|
| 111 |
+
Metric:
|
| 112 |
+
name: RecMetric
|
| 113 |
+
main_indicator: acc
|
| 114 |
+
is_filter: true
|
| 115 |
+
Train:
|
| 116 |
+
dataset:
|
| 117 |
+
name: RatioDataSetTVResize
|
| 118 |
+
ds_width: true
|
| 119 |
+
padding: false
|
| 120 |
+
data_dir_list:
|
| 121 |
+
- /data/Union14M-L-LMDB-Filtered/filter_train_challenging
|
| 122 |
+
- /data/Union14M-L-LMDB-Filtered/filter_train_hard
|
| 123 |
+
- /data/Union14M-L-LMDB-Filtered/filter_train_medium
|
| 124 |
+
- /data/Union14M-L-LMDB-Filtered/filter_train_normal
|
| 125 |
+
- /data/Union14M-L-LMDB-Filtered/filter_train_easy
|
| 126 |
+
transforms:
|
| 127 |
+
- DecodeImagePIL:
|
| 128 |
+
img_mode: RGB
|
| 129 |
+
- PARSeqAugPIL: null
|
| 130 |
+
- LLaDALabelEncode:
|
| 131 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 132 |
+
use_space_char: false
|
| 133 |
+
max_text_length: 25
|
| 134 |
+
train_all_layer: true
|
| 135 |
+
sample_num: 3
|
| 136 |
+
- KeepKeys:
|
| 137 |
+
keep_keys:
|
| 138 |
+
- image
|
| 139 |
+
- label
|
| 140 |
+
- reflect_ids
|
| 141 |
+
- noisy_batch
|
| 142 |
+
- masked_indices
|
| 143 |
+
- p_mask
|
| 144 |
+
- length
|
| 145 |
+
sampler:
|
| 146 |
+
name: RatioSampler
|
| 147 |
+
scales:
|
| 148 |
+
- - 128
|
| 149 |
+
- 32
|
| 150 |
+
first_bs: 256
|
| 151 |
+
fix_bs: false
|
| 152 |
+
divided_factor:
|
| 153 |
+
- 4
|
| 154 |
+
- 16
|
| 155 |
+
is_training: true
|
| 156 |
+
loader:
|
| 157 |
+
shuffle: true
|
| 158 |
+
batch_size_per_card: 256
|
| 159 |
+
drop_last: true
|
| 160 |
+
max_ratio: 4
|
| 161 |
+
num_workers: 4
|
| 162 |
+
Eval:
|
| 163 |
+
dataset:
|
| 164 |
+
name: RatioDataSetTVResize
|
| 165 |
+
ds_width: true
|
| 166 |
+
padding: false
|
| 167 |
+
data_dir_list:
|
| 168 |
+
- /data/evaluation/CUTE80
|
| 169 |
+
- /data/evaluation/IC13_857
|
| 170 |
+
- /data/evaluation/IC15_1811
|
| 171 |
+
- /data/evaluation/IIIT5k_3000
|
| 172 |
+
- /data/evaluation/SVT
|
| 173 |
+
- /data/evaluation/SVTP
|
| 174 |
+
transforms:
|
| 175 |
+
- DecodeImagePIL:
|
| 176 |
+
img_mode: RGB
|
| 177 |
+
- ARLabelEncode:
|
| 178 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 179 |
+
use_space_char: false
|
| 180 |
+
max_text_length: 25
|
| 181 |
+
- KeepKeys:
|
| 182 |
+
keep_keys:
|
| 183 |
+
- image
|
| 184 |
+
- label
|
| 185 |
+
- length
|
| 186 |
+
sampler:
|
| 187 |
+
name: RatioSampler
|
| 188 |
+
scales:
|
| 189 |
+
- - 128
|
| 190 |
+
- 32
|
| 191 |
+
first_bs: 256
|
| 192 |
+
fix_bs: false
|
| 193 |
+
divided_factor:
|
| 194 |
+
- 4
|
| 195 |
+
- 16
|
| 196 |
+
is_training: false
|
| 197 |
+
loader:
|
| 198 |
+
shuffle: false
|
| 199 |
+
drop_last: false
|
| 200 |
+
batch_size_per_card: 256
|
| 201 |
+
max_ratio: 4
|
| 202 |
+
num_workers: 4
|
| 203 |
+
filename: svtrv2_llada_fs_reflect_onedecoder_reflectwithoutmask_sample3
|
| 204 |
+
config: configs/rec/llada/svtrv2_llada_fs_reflect_onedecoder_reflectwithoutmask_sample3.yml
|
| 205 |
+
local_rank: '2'
|
| 206 |
+
eval: true
|
mdiff4str_base/train.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mdiff4str_base_ch/best.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b1172a656542780530c968a00a11fcc29b665f21051e07ac75dd41d4f7c40084
|
| 3 |
+
size 155591006
|
mdiff4str_base_ch/config.yml
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 100
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/ch/svtrv2_llada_test_lr00005_fs_reflect_onedecoder_semiar_withoutmask_sample3_ch/
|
| 7 |
+
save_epoch_step:
|
| 8 |
+
- 10
|
| 9 |
+
- 1
|
| 10 |
+
eval_batch_step:
|
| 11 |
+
- 0
|
| 12 |
+
- 2000
|
| 13 |
+
eval_epoch_step:
|
| 14 |
+
- 0
|
| 15 |
+
- 1
|
| 16 |
+
cal_metric_during_train: false
|
| 17 |
+
pretrained_model: null
|
| 18 |
+
checkpoints: null
|
| 19 |
+
use_tensorboard: false
|
| 20 |
+
infer_img: ./img_44_1.jpg
|
| 21 |
+
character_dict_path: ./tools/utils/ppocr_keys_v1.txt
|
| 22 |
+
max_text_length: 25
|
| 23 |
+
use_space_char: false
|
| 24 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_nrtr.txt
|
| 25 |
+
use_amp: true
|
| 26 |
+
grad_clip_val: 20.0
|
| 27 |
+
distributed: true
|
| 28 |
+
Optimizer:
|
| 29 |
+
name: AdamW
|
| 30 |
+
lr: 0.0005
|
| 31 |
+
weight_decay: 0.05
|
| 32 |
+
filter_bias_and_bn: true
|
| 33 |
+
LRScheduler:
|
| 34 |
+
name: OneCycleLR
|
| 35 |
+
warmup_epoch: 5
|
| 36 |
+
cycle_momentum: false
|
| 37 |
+
Architecture:
|
| 38 |
+
model_type: rec
|
| 39 |
+
algorithm: NRTR
|
| 40 |
+
in_channels: 3
|
| 41 |
+
Transform: null
|
| 42 |
+
Encoder:
|
| 43 |
+
name: SVTRv2LNConvTwo33
|
| 44 |
+
use_pos_embed: false
|
| 45 |
+
dims:
|
| 46 |
+
- 128
|
| 47 |
+
- 256
|
| 48 |
+
- 384
|
| 49 |
+
depths:
|
| 50 |
+
- 6
|
| 51 |
+
- 6
|
| 52 |
+
- 6
|
| 53 |
+
num_heads:
|
| 54 |
+
- 4
|
| 55 |
+
- 8
|
| 56 |
+
- 12
|
| 57 |
+
mixer:
|
| 58 |
+
- - Conv
|
| 59 |
+
- Conv
|
| 60 |
+
- Conv
|
| 61 |
+
- Conv
|
| 62 |
+
- Conv
|
| 63 |
+
- Conv
|
| 64 |
+
- - Conv
|
| 65 |
+
- Conv
|
| 66 |
+
- FGlobal
|
| 67 |
+
- Global
|
| 68 |
+
- Global
|
| 69 |
+
- Global
|
| 70 |
+
- - Global
|
| 71 |
+
- Global
|
| 72 |
+
- Global
|
| 73 |
+
- Global
|
| 74 |
+
- Global
|
| 75 |
+
- Global
|
| 76 |
+
local_k:
|
| 77 |
+
- - 5
|
| 78 |
+
- 5
|
| 79 |
+
- - 5
|
| 80 |
+
- 5
|
| 81 |
+
- - -1
|
| 82 |
+
- -1
|
| 83 |
+
sub_k:
|
| 84 |
+
- - 1
|
| 85 |
+
- 1
|
| 86 |
+
- - 2
|
| 87 |
+
- 1
|
| 88 |
+
- - -1
|
| 89 |
+
- -1
|
| 90 |
+
last_stage: false
|
| 91 |
+
feat2d: false
|
| 92 |
+
Decoder:
|
| 93 |
+
name: MDiffDecoder
|
| 94 |
+
num_decoder_layers: 6
|
| 95 |
+
nhead: 6
|
| 96 |
+
max_len: 25
|
| 97 |
+
parallel_decoding: false
|
| 98 |
+
autoregressive_decoding: false
|
| 99 |
+
low_confidence_decoding: false
|
| 100 |
+
random_mask_decoding: false
|
| 101 |
+
semi_autoregressive_decoding: true
|
| 102 |
+
cloze_mask_decoding: false
|
| 103 |
+
sampler_step: 3
|
| 104 |
+
sample_k: 3
|
| 105 |
+
Loss:
|
| 106 |
+
name: NoneLoss
|
| 107 |
+
PostProcess:
|
| 108 |
+
name: ARLabelDecode
|
| 109 |
+
character_dict_path: ./tools/utils/ppocr_keys_v1.txt
|
| 110 |
+
use_space_char: false
|
| 111 |
+
Metric:
|
| 112 |
+
name: RecMetric
|
| 113 |
+
main_indicator: acc
|
| 114 |
+
is_filter: false
|
| 115 |
+
Train:
|
| 116 |
+
dataset:
|
| 117 |
+
name: RatioDataSetTVResize
|
| 118 |
+
ds_width: true
|
| 119 |
+
padding: false
|
| 120 |
+
data_dir_list:
|
| 121 |
+
- ../benchmark_bctr/benchmark_bctr_train/document_train
|
| 122 |
+
- ../benchmark_bctr/benchmark_bctr_train/handwriting_train
|
| 123 |
+
- ../benchmark_bctr/benchmark_bctr_train/scene_train
|
| 124 |
+
- ../benchmark_bctr/benchmark_bctr_train/web_train
|
| 125 |
+
transforms:
|
| 126 |
+
- DecodeImagePIL:
|
| 127 |
+
img_mode: RGB
|
| 128 |
+
- PARSeqAugPIL: null
|
| 129 |
+
- LLaDALabelEncode:
|
| 130 |
+
character_dict_path: ./tools/utils/ppocr_keys_v1.txt
|
| 131 |
+
use_space_char: false
|
| 132 |
+
max_text_length: 25
|
| 133 |
+
train_all_layer: true
|
| 134 |
+
sample_num: 3
|
| 135 |
+
- KeepKeys:
|
| 136 |
+
keep_keys:
|
| 137 |
+
- image
|
| 138 |
+
- label
|
| 139 |
+
- reflect_ids
|
| 140 |
+
- noisy_batch
|
| 141 |
+
- masked_indices
|
| 142 |
+
- p_mask
|
| 143 |
+
- length
|
| 144 |
+
sampler:
|
| 145 |
+
name: RatioSampler
|
| 146 |
+
scales:
|
| 147 |
+
- - 128
|
| 148 |
+
- 32
|
| 149 |
+
first_bs: 128
|
| 150 |
+
fix_bs: false
|
| 151 |
+
divided_factor:
|
| 152 |
+
- 4
|
| 153 |
+
- 16
|
| 154 |
+
is_training: true
|
| 155 |
+
loader:
|
| 156 |
+
shuffle: true
|
| 157 |
+
batch_size_per_card: 128
|
| 158 |
+
drop_last: true
|
| 159 |
+
max_ratio: 8
|
| 160 |
+
num_workers: 4
|
| 161 |
+
Eval:
|
| 162 |
+
dataset:
|
| 163 |
+
name: RatioDataSetTVResize
|
| 164 |
+
ds_width: true
|
| 165 |
+
padding: false
|
| 166 |
+
data_dir_list:
|
| 167 |
+
- ../benchmark_bctr/benchmark_bctr_test/scene_test
|
| 168 |
+
transforms:
|
| 169 |
+
- DecodeImagePIL:
|
| 170 |
+
img_mode: RGB
|
| 171 |
+
- ARLabelEncode:
|
| 172 |
+
character_dict_path: ./tools/utils/ppocr_keys_v1.txt
|
| 173 |
+
use_space_char: false
|
| 174 |
+
max_text_length: 25
|
| 175 |
+
- KeepKeys:
|
| 176 |
+
keep_keys:
|
| 177 |
+
- image
|
| 178 |
+
- label
|
| 179 |
+
- length
|
| 180 |
+
sampler:
|
| 181 |
+
name: RatioSampler
|
| 182 |
+
scales:
|
| 183 |
+
- - 128
|
| 184 |
+
- 32
|
| 185 |
+
first_bs: 128
|
| 186 |
+
fix_bs: false
|
| 187 |
+
divided_factor:
|
| 188 |
+
- 4
|
| 189 |
+
- 16
|
| 190 |
+
is_training: false
|
| 191 |
+
loader:
|
| 192 |
+
shuffle: false
|
| 193 |
+
drop_last: false
|
| 194 |
+
batch_size_per_card: 128
|
| 195 |
+
max_ratio: 8
|
| 196 |
+
num_workers: 4
|
| 197 |
+
filename: svtrv2_llada_fs_reflect_onedecoder_reflectwithoutmask_sample3_ch
|
| 198 |
+
config: configs/rec/llada/svtrv2_llada_fs_reflect_onedecoder_reflectwithoutmask_sample3_ch.yml
|
| 199 |
+
local_rank: '4'
|
| 200 |
+
eval: true
|
mdiff4str_base_ch/train.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mdiff4str_base_syn/best.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e29a9c4ab640f44ce9bca6816a5e9e7664541bd2646916af277070aa5665fe6
|
| 3 |
+
size 135534174
|
mdiff4str_base_syn/config.yml
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 60
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/syn/svtrv2_llada_test_lr00005_fs_reflect_onedecoder_semiar_withoutmask_sample3_syn_lr00005_gpu8/
|
| 7 |
+
save_epoch_step:
|
| 8 |
+
- 10
|
| 9 |
+
- 1
|
| 10 |
+
eval_batch_step:
|
| 11 |
+
- 0
|
| 12 |
+
- 500
|
| 13 |
+
eval_epoch_step:
|
| 14 |
+
- 0
|
| 15 |
+
- 1
|
| 16 |
+
cal_metric_during_train: false
|
| 17 |
+
pretrained_model: null
|
| 18 |
+
checkpoints: null
|
| 19 |
+
use_tensorboard: false
|
| 20 |
+
infer_img: ./img_44_1.jpg
|
| 21 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 22 |
+
max_text_length: 25
|
| 23 |
+
use_space_char: false
|
| 24 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_nrtr.txt
|
| 25 |
+
use_amp: true
|
| 26 |
+
grad_clip_val: 20.0
|
| 27 |
+
distributed: true
|
| 28 |
+
Optimizer:
|
| 29 |
+
name: AdamW
|
| 30 |
+
lr: 0.0005
|
| 31 |
+
weight_decay: 0.05
|
| 32 |
+
filter_bias_and_bn: true
|
| 33 |
+
LRScheduler:
|
| 34 |
+
name: OneCycleLR
|
| 35 |
+
warmup_epoch: 3
|
| 36 |
+
cycle_momentum: false
|
| 37 |
+
Architecture:
|
| 38 |
+
model_type: rec
|
| 39 |
+
algorithm: NRTR
|
| 40 |
+
in_channels: 3
|
| 41 |
+
Transform: null
|
| 42 |
+
Encoder:
|
| 43 |
+
name: SVTRv2LNConvTwo33
|
| 44 |
+
use_pos_embed: false
|
| 45 |
+
dims:
|
| 46 |
+
- 128
|
| 47 |
+
- 256
|
| 48 |
+
- 384
|
| 49 |
+
depths:
|
| 50 |
+
- 6
|
| 51 |
+
- 6
|
| 52 |
+
- 6
|
| 53 |
+
num_heads:
|
| 54 |
+
- 4
|
| 55 |
+
- 8
|
| 56 |
+
- 12
|
| 57 |
+
mixer:
|
| 58 |
+
- - Conv
|
| 59 |
+
- Conv
|
| 60 |
+
- Conv
|
| 61 |
+
- Conv
|
| 62 |
+
- Conv
|
| 63 |
+
- Conv
|
| 64 |
+
- - Conv
|
| 65 |
+
- Conv
|
| 66 |
+
- FGlobal
|
| 67 |
+
- Global
|
| 68 |
+
- Global
|
| 69 |
+
- Global
|
| 70 |
+
- - Global
|
| 71 |
+
- Global
|
| 72 |
+
- Global
|
| 73 |
+
- Global
|
| 74 |
+
- Global
|
| 75 |
+
- Global
|
| 76 |
+
local_k:
|
| 77 |
+
- - 5
|
| 78 |
+
- 5
|
| 79 |
+
- - 5
|
| 80 |
+
- 5
|
| 81 |
+
- - -1
|
| 82 |
+
- -1
|
| 83 |
+
sub_k:
|
| 84 |
+
- - 1
|
| 85 |
+
- 1
|
| 86 |
+
- - 2
|
| 87 |
+
- 1
|
| 88 |
+
- - -1
|
| 89 |
+
- -1
|
| 90 |
+
last_stage: false
|
| 91 |
+
feat2d: false
|
| 92 |
+
Decoder:
|
| 93 |
+
name: MDiffDecoder
|
| 94 |
+
num_decoder_layers: 6
|
| 95 |
+
nhead: 6
|
| 96 |
+
max_len: 25
|
| 97 |
+
parallel_decoding: false
|
| 98 |
+
autoregressive_decoding: false
|
| 99 |
+
low_confidence_decoding: false
|
| 100 |
+
random_mask_decoding: false
|
| 101 |
+
semi_autoregressive_decoding: true
|
| 102 |
+
cloze_mask_decoding: false
|
| 103 |
+
sampler_step: 3
|
| 104 |
+
sample_k: 3
|
| 105 |
+
Loss:
|
| 106 |
+
name: NoneLoss
|
| 107 |
+
PostProcess:
|
| 108 |
+
name: ARLabelDecode
|
| 109 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 110 |
+
use_space_char: false
|
| 111 |
+
Metric:
|
| 112 |
+
name: RecMetric
|
| 113 |
+
main_indicator: acc
|
| 114 |
+
is_filter: true
|
| 115 |
+
Train:
|
| 116 |
+
dataset:
|
| 117 |
+
name: RatioDataSetTVResize
|
| 118 |
+
ds_width: true
|
| 119 |
+
padding: false
|
| 120 |
+
syn: true
|
| 121 |
+
data_dir_list:
|
| 122 |
+
- ../Union14M-L-LMDB-Filtered/filter_train_challenging
|
| 123 |
+
- ../Union14M-L-LMDB-Filtered/filter_train_hard
|
| 124 |
+
- ../Union14M-L-LMDB-Filtered/filter_train_medium
|
| 125 |
+
- ../Union14M-L-LMDB-Filtered/filter_train_normal
|
| 126 |
+
- ../Union14M-L-LMDB-Filtered/filter_train_easy
|
| 127 |
+
transforms:
|
| 128 |
+
- DecodeImagePIL:
|
| 129 |
+
img_mode: RGB
|
| 130 |
+
- PARSeqAugPIL: null
|
| 131 |
+
- LLaDALabelEncode:
|
| 132 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 133 |
+
use_space_char: false
|
| 134 |
+
max_text_length: 25
|
| 135 |
+
train_all_layer: true
|
| 136 |
+
sample_num: 3
|
| 137 |
+
- KeepKeys:
|
| 138 |
+
keep_keys:
|
| 139 |
+
- image
|
| 140 |
+
- label
|
| 141 |
+
- reflect_ids
|
| 142 |
+
- noisy_batch
|
| 143 |
+
- masked_indices
|
| 144 |
+
- p_mask
|
| 145 |
+
- length
|
| 146 |
+
sampler:
|
| 147 |
+
name: RatioSampler
|
| 148 |
+
scales:
|
| 149 |
+
- - 128
|
| 150 |
+
- 32
|
| 151 |
+
first_bs: 256
|
| 152 |
+
fix_bs: false
|
| 153 |
+
divided_factor:
|
| 154 |
+
- 4
|
| 155 |
+
- 16
|
| 156 |
+
is_training: true
|
| 157 |
+
loader:
|
| 158 |
+
shuffle: true
|
| 159 |
+
batch_size_per_card: 256
|
| 160 |
+
drop_last: true
|
| 161 |
+
max_ratio: 4
|
| 162 |
+
num_workers: 4
|
| 163 |
+
Eval:
|
| 164 |
+
dataset:
|
| 165 |
+
name: RatioDataSetTVResize
|
| 166 |
+
ds_width: true
|
| 167 |
+
padding: false
|
| 168 |
+
data_dir_list:
|
| 169 |
+
- ../evaluation/CUTE80
|
| 170 |
+
- ../evaluation/IC13_857
|
| 171 |
+
- ../evaluation/IC15_1811
|
| 172 |
+
- ../evaluation/IIIT5k_3000
|
| 173 |
+
- ../evaluation/SVT
|
| 174 |
+
- ../evaluation/SVTP
|
| 175 |
+
transforms:
|
| 176 |
+
- DecodeImagePIL:
|
| 177 |
+
img_mode: RGB
|
| 178 |
+
- ARLabelEncode:
|
| 179 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 180 |
+
use_space_char: false
|
| 181 |
+
max_text_length: 25
|
| 182 |
+
- KeepKeys:
|
| 183 |
+
keep_keys:
|
| 184 |
+
- image
|
| 185 |
+
- label
|
| 186 |
+
- length
|
| 187 |
+
sampler:
|
| 188 |
+
name: RatioSampler
|
| 189 |
+
scales:
|
| 190 |
+
- - 128
|
| 191 |
+
- 32
|
| 192 |
+
first_bs: 256
|
| 193 |
+
fix_bs: false
|
| 194 |
+
divided_factor:
|
| 195 |
+
- 4
|
| 196 |
+
- 16
|
| 197 |
+
is_training: false
|
| 198 |
+
loader:
|
| 199 |
+
shuffle: false
|
| 200 |
+
drop_last: false
|
| 201 |
+
batch_size_per_card: 256
|
| 202 |
+
max_ratio: 4
|
| 203 |
+
num_workers: 4
|
| 204 |
+
filename: svtrv2_llada_fs_reflect_onedecoder_reflectwithoutmask_sample3_syn
|
| 205 |
+
config: configs/rec/llada/svtrv2_llada_fs_reflect_onedecoder_reflectwithoutmask_sample3_syn.yml
|
| 206 |
+
local_rank: '5'
|
| 207 |
+
eval: true
|
mdiff4str_base_syn/train.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mdiff4str_small/best.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebc96f11e5739eae98491f7c8c140157ed12b35dafa220511ef9d399cc20a937
|
| 3 |
+
size 83324786
|
mdiff4str_small/config.yml
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 40
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/u14m_filter/svtrv2_llada_test_lr00005_fs_reflect_onedecoder_semiar_fs_sample3_3633/
|
| 7 |
+
save_epoch_step:
|
| 8 |
+
- 10
|
| 9 |
+
- 1
|
| 10 |
+
eval_batch_step:
|
| 11 |
+
- 0
|
| 12 |
+
- 500
|
| 13 |
+
eval_epoch_step:
|
| 14 |
+
- 0
|
| 15 |
+
- 1
|
| 16 |
+
cal_metric_during_train: false
|
| 17 |
+
pretrained_model: null
|
| 18 |
+
checkpoints: null
|
| 19 |
+
use_tensorboard: false
|
| 20 |
+
infer_img: ./img_44_1.jpg
|
| 21 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 22 |
+
max_text_length: 25
|
| 23 |
+
use_space_char: false
|
| 24 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_nrtr.txt
|
| 25 |
+
use_amp: true
|
| 26 |
+
grad_clip_val: 20.0
|
| 27 |
+
distributed: true
|
| 28 |
+
Optimizer:
|
| 29 |
+
name: AdamW
|
| 30 |
+
lr: 0.0005
|
| 31 |
+
weight_decay: 0.05
|
| 32 |
+
filter_bias_and_bn: true
|
| 33 |
+
LRScheduler:
|
| 34 |
+
name: OneCycleLR
|
| 35 |
+
warmup_epoch: 1.5
|
| 36 |
+
cycle_momentum: false
|
| 37 |
+
Architecture:
|
| 38 |
+
model_type: rec
|
| 39 |
+
algorithm: NRTR
|
| 40 |
+
in_channels: 3
|
| 41 |
+
Transform: null
|
| 42 |
+
Encoder:
|
| 43 |
+
name: SVTRv2LNConvTwo33
|
| 44 |
+
use_pos_embed: false
|
| 45 |
+
dims:
|
| 46 |
+
- 128
|
| 47 |
+
- 256
|
| 48 |
+
- 384
|
| 49 |
+
depths:
|
| 50 |
+
- 3
|
| 51 |
+
- 6
|
| 52 |
+
- 3
|
| 53 |
+
num_heads:
|
| 54 |
+
- 4
|
| 55 |
+
- 8
|
| 56 |
+
- 12
|
| 57 |
+
mixer:
|
| 58 |
+
- - Conv
|
| 59 |
+
- Conv
|
| 60 |
+
- Conv
|
| 61 |
+
- - Conv
|
| 62 |
+
- Conv
|
| 63 |
+
- FGlobal
|
| 64 |
+
- Global
|
| 65 |
+
- Global
|
| 66 |
+
- Global
|
| 67 |
+
- - Global
|
| 68 |
+
- Global
|
| 69 |
+
- Global
|
| 70 |
+
local_k:
|
| 71 |
+
- - 5
|
| 72 |
+
- 5
|
| 73 |
+
- - 5
|
| 74 |
+
- 5
|
| 75 |
+
- - -1
|
| 76 |
+
- -1
|
| 77 |
+
sub_k:
|
| 78 |
+
- - 1
|
| 79 |
+
- 1
|
| 80 |
+
- - 2
|
| 81 |
+
- 1
|
| 82 |
+
- - -1
|
| 83 |
+
- -1
|
| 84 |
+
last_stage: false
|
| 85 |
+
feat2d: false
|
| 86 |
+
Decoder:
|
| 87 |
+
name: MDiffDecoder
|
| 88 |
+
num_decoder_layers: 3
|
| 89 |
+
nhead: 6
|
| 90 |
+
max_len: 25
|
| 91 |
+
parallel_decoding: false
|
| 92 |
+
autoregressive_decoding: false
|
| 93 |
+
low_confidence_decoding: false
|
| 94 |
+
random_mask_decoding: false
|
| 95 |
+
semi_autoregressive_decoding: true
|
| 96 |
+
cloze_mask_decoding: false
|
| 97 |
+
sampler_step: 3
|
| 98 |
+
sample_k: 3
|
| 99 |
+
Loss:
|
| 100 |
+
name: NoneLoss
|
| 101 |
+
PostProcess:
|
| 102 |
+
name: ARLabelDecode
|
| 103 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 104 |
+
use_space_char: false
|
| 105 |
+
Metric:
|
| 106 |
+
name: RecMetric
|
| 107 |
+
main_indicator: acc
|
| 108 |
+
is_filter: true
|
| 109 |
+
Train:
|
| 110 |
+
dataset:
|
| 111 |
+
name: RatioDataSetTVResize
|
| 112 |
+
ds_width: true
|
| 113 |
+
padding: false
|
| 114 |
+
data_dir_list:
|
| 115 |
+
- ../Union14M-L-LMDB-Filtered/filter_train_challenging
|
| 116 |
+
- ../Union14M-L-LMDB-Filtered/filter_train_hard
|
| 117 |
+
- ../Union14M-L-LMDB-Filtered/filter_train_medium
|
| 118 |
+
- ../Union14M-L-LMDB-Filtered/filter_train_normal
|
| 119 |
+
- ../Union14M-L-LMDB-Filtered/filter_train_easy
|
| 120 |
+
transforms:
|
| 121 |
+
- DecodeImagePIL:
|
| 122 |
+
img_mode: RGB
|
| 123 |
+
- PARSeqAugPIL: null
|
| 124 |
+
- LLaDALabelEncode:
|
| 125 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 126 |
+
use_space_char: false
|
| 127 |
+
max_text_length: 25
|
| 128 |
+
train_all_layer: true
|
| 129 |
+
sample_num: 3
|
| 130 |
+
- KeepKeys:
|
| 131 |
+
keep_keys:
|
| 132 |
+
- image
|
| 133 |
+
- label
|
| 134 |
+
- reflect_ids
|
| 135 |
+
- noisy_batch
|
| 136 |
+
- masked_indices
|
| 137 |
+
- p_mask
|
| 138 |
+
- length
|
| 139 |
+
sampler:
|
| 140 |
+
name: RatioSampler
|
| 141 |
+
scales:
|
| 142 |
+
- - 128
|
| 143 |
+
- 32
|
| 144 |
+
first_bs: 256
|
| 145 |
+
fix_bs: false
|
| 146 |
+
divided_factor:
|
| 147 |
+
- 4
|
| 148 |
+
- 16
|
| 149 |
+
is_training: true
|
| 150 |
+
loader:
|
| 151 |
+
shuffle: true
|
| 152 |
+
batch_size_per_card: 256
|
| 153 |
+
drop_last: true
|
| 154 |
+
max_ratio: 4
|
| 155 |
+
num_workers: 4
|
| 156 |
+
Eval:
|
| 157 |
+
dataset:
|
| 158 |
+
name: RatioDataSetTVResize
|
| 159 |
+
ds_width: true
|
| 160 |
+
padding: false
|
| 161 |
+
data_dir_list:
|
| 162 |
+
- ../evaluation/CUTE80
|
| 163 |
+
- ../evaluation/IC13_857
|
| 164 |
+
- ../evaluation/IC15_1811
|
| 165 |
+
- ../evaluation/IIIT5k_3000
|
| 166 |
+
- ../evaluation/SVT
|
| 167 |
+
- ../evaluation/SVTP
|
| 168 |
+
transforms:
|
| 169 |
+
- DecodeImagePIL:
|
| 170 |
+
img_mode: RGB
|
| 171 |
+
- ARLabelEncode:
|
| 172 |
+
character_dict_path: ./tools/utils/EN_symbol_dict.txt
|
| 173 |
+
use_space_char: false
|
| 174 |
+
max_text_length: 25
|
| 175 |
+
- KeepKeys:
|
| 176 |
+
keep_keys:
|
| 177 |
+
- image
|
| 178 |
+
- label
|
| 179 |
+
- length
|
| 180 |
+
sampler:
|
| 181 |
+
name: RatioSampler
|
| 182 |
+
scales:
|
| 183 |
+
- - 128
|
| 184 |
+
- 32
|
| 185 |
+
first_bs: 256
|
| 186 |
+
fix_bs: false
|
| 187 |
+
divided_factor:
|
| 188 |
+
- 4
|
| 189 |
+
- 16
|
| 190 |
+
is_training: false
|
| 191 |
+
loader:
|
| 192 |
+
shuffle: false
|
| 193 |
+
drop_last: false
|
| 194 |
+
batch_size_per_card: 256
|
| 195 |
+
max_ratio: 4
|
| 196 |
+
num_workers: 4
|
| 197 |
+
filename: svtrv2_llada_fs_reflect_onedecoder_fs_sample3_3633
|
| 198 |
+
config: configs/rec/llada/svtrv2_llada_fs_reflect_onedecoder_fs_sample3_3633.yml
|
| 199 |
+
local_rank: '1'
|
| 200 |
+
eval: true
|
mdiff4str_small/train.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mdiff4str_small_ch/best.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a587f6a7322aa26b064da83e5b04cdbb38afe47f92b39431b2eb9ab77d490546
|
| 3 |
+
size 103381810
|
mdiff4str_small_ch/config.yml
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Global:
|
| 2 |
+
device: gpu
|
| 3 |
+
epoch_num: 100
|
| 4 |
+
log_smooth_window: 20
|
| 5 |
+
print_batch_step: 10
|
| 6 |
+
output_dir: ./output/rec/ch/svtrv2_llada_test_lr00005_fs_reflect_onedecoder_semiar_withoutmask_sample3_3633_ch/
|
| 7 |
+
save_epoch_step:
|
| 8 |
+
- 10
|
| 9 |
+
- 1
|
| 10 |
+
eval_batch_step:
|
| 11 |
+
- 0
|
| 12 |
+
- 2000
|
| 13 |
+
eval_epoch_step:
|
| 14 |
+
- 0
|
| 15 |
+
- 1
|
| 16 |
+
cal_metric_during_train: false
|
| 17 |
+
pretrained_model: null
|
| 18 |
+
checkpoints: null
|
| 19 |
+
use_tensorboard: false
|
| 20 |
+
infer_img: ./img_44_1.jpg
|
| 21 |
+
character_dict_path: ./tools/utils/ppocr_keys_v1.txt
|
| 22 |
+
max_text_length: 25
|
| 23 |
+
use_space_char: false
|
| 24 |
+
save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_nrtr.txt
|
| 25 |
+
use_amp: true
|
| 26 |
+
grad_clip_val: 20.0
|
| 27 |
+
distributed: true
|
| 28 |
+
Optimizer:
|
| 29 |
+
name: AdamW
|
| 30 |
+
lr: 0.0005
|
| 31 |
+
weight_decay: 0.05
|
| 32 |
+
filter_bias_and_bn: true
|
| 33 |
+
LRScheduler:
|
| 34 |
+
name: OneCycleLR
|
| 35 |
+
warmup_epoch: 5
|
| 36 |
+
cycle_momentum: false
|
| 37 |
+
Architecture:
|
| 38 |
+
model_type: rec
|
| 39 |
+
algorithm: NRTR
|
| 40 |
+
in_channels: 3
|
| 41 |
+
Transform: null
|
| 42 |
+
Encoder:
|
| 43 |
+
name: SVTRv2LNConvTwo33
|
| 44 |
+
use_pos_embed: false
|
| 45 |
+
dims:
|
| 46 |
+
- 128
|
| 47 |
+
- 256
|
| 48 |
+
- 384
|
| 49 |
+
depths:
|
| 50 |
+
- 3
|
| 51 |
+
- 6
|
| 52 |
+
- 3
|
| 53 |
+
num_heads:
|
| 54 |
+
- 4
|
| 55 |
+
- 8
|
| 56 |
+
- 12
|
| 57 |
+
mixer:
|
| 58 |
+
- - Conv
|
| 59 |
+
- Conv
|
| 60 |
+
- Conv
|
| 61 |
+
- Conv
|
| 62 |
+
- Conv
|
| 63 |
+
- Conv
|
| 64 |
+
- - Conv
|
| 65 |
+
- Conv
|
| 66 |
+
- FGlobal
|
| 67 |
+
- Global
|
| 68 |
+
- Global
|
| 69 |
+
- Global
|
| 70 |
+
- - Global
|
| 71 |
+
- Global
|
| 72 |
+
- Global
|
| 73 |
+
- Global
|
| 74 |
+
- Global
|
| 75 |
+
- Global
|
| 76 |
+
local_k:
|
| 77 |
+
- - 5
|
| 78 |
+
- 5
|
| 79 |
+
- - 5
|
| 80 |
+
- 5
|
| 81 |
+
- - -1
|
| 82 |
+
- -1
|
| 83 |
+
sub_k:
|
| 84 |
+
- - 1
|
| 85 |
+
- 1
|
| 86 |
+
- - 2
|
| 87 |
+
- 1
|
| 88 |
+
- - -1
|
| 89 |
+
- -1
|
| 90 |
+
last_stage: false
|
| 91 |
+
feat2d: false
|
| 92 |
+
Decoder:
|
| 93 |
+
name: MDiffDecoder
|
| 94 |
+
num_decoder_layers: 3
|
| 95 |
+
nhead: 6
|
| 96 |
+
max_len: 25
|
| 97 |
+
parallel_decoding: false
|
| 98 |
+
autoregressive_decoding: false
|
| 99 |
+
low_confidence_decoding: false
|
| 100 |
+
random_mask_decoding: false
|
| 101 |
+
semi_autoregressive_decoding: true
|
| 102 |
+
cloze_mask_decoding: false
|
| 103 |
+
sampler_step: 3
|
| 104 |
+
sample_k: 3
|
| 105 |
+
Loss:
|
| 106 |
+
name: NoneLoss
|
| 107 |
+
PostProcess:
|
| 108 |
+
name: ARLabelDecode
|
| 109 |
+
character_dict_path: ./tools/utils/ppocr_keys_v1.txt
|
| 110 |
+
use_space_char: false
|
| 111 |
+
Metric:
|
| 112 |
+
name: RecMetric
|
| 113 |
+
main_indicator: acc
|
| 114 |
+
is_filter: false
|
| 115 |
+
Train:
|
| 116 |
+
dataset:
|
| 117 |
+
name: RatioDataSetTVResize
|
| 118 |
+
ds_width: true
|
| 119 |
+
padding: false
|
| 120 |
+
data_dir_list:
|
| 121 |
+
- ../benchmark_bctr/benchmark_bctr_train/document_train
|
| 122 |
+
- ../benchmark_bctr/benchmark_bctr_train/handwriting_train
|
| 123 |
+
- ../benchmark_bctr/benchmark_bctr_train/scene_train
|
| 124 |
+
- ../benchmark_bctr/benchmark_bctr_train/web_train
|
| 125 |
+
transforms:
|
| 126 |
+
- DecodeImagePIL:
|
| 127 |
+
img_mode: RGB
|
| 128 |
+
- PARSeqAugPIL: null
|
| 129 |
+
- LLaDALabelEncode:
|
| 130 |
+
character_dict_path: ./tools/utils/ppocr_keys_v1.txt
|
| 131 |
+
use_space_char: false
|
| 132 |
+
max_text_length: 25
|
| 133 |
+
train_all_layer: true
|
| 134 |
+
sample_num: 3
|
| 135 |
+
- KeepKeys:
|
| 136 |
+
keep_keys:
|
| 137 |
+
- image
|
| 138 |
+
- label
|
| 139 |
+
- reflect_ids
|
| 140 |
+
- noisy_batch
|
| 141 |
+
- masked_indices
|
| 142 |
+
- p_mask
|
| 143 |
+
- length
|
| 144 |
+
sampler:
|
| 145 |
+
name: RatioSampler
|
| 146 |
+
scales:
|
| 147 |
+
- - 128
|
| 148 |
+
- 32
|
| 149 |
+
first_bs: 128
|
| 150 |
+
fix_bs: false
|
| 151 |
+
divided_factor:
|
| 152 |
+
- 4
|
| 153 |
+
- 16
|
| 154 |
+
is_training: true
|
| 155 |
+
loader:
|
| 156 |
+
shuffle: true
|
| 157 |
+
batch_size_per_card: 128
|
| 158 |
+
drop_last: true
|
| 159 |
+
max_ratio: 8
|
| 160 |
+
num_workers: 4
|
| 161 |
+
Eval:
|
| 162 |
+
dataset:
|
| 163 |
+
name: RatioDataSetTVResize
|
| 164 |
+
ds_width: true
|
| 165 |
+
padding: false
|
| 166 |
+
data_dir_list:
|
| 167 |
+
- ../benchmark_bctr/benchmark_bctr_test/scene_test
|
| 168 |
+
transforms:
|
| 169 |
+
- DecodeImagePIL:
|
| 170 |
+
img_mode: RGB
|
| 171 |
+
- ARLabelEncode:
|
| 172 |
+
character_dict_path: ./tools/utils/ppocr_keys_v1.txt
|
| 173 |
+
use_space_char: false
|
| 174 |
+
max_text_length: 25
|
| 175 |
+
- KeepKeys:
|
| 176 |
+
keep_keys:
|
| 177 |
+
- image
|
| 178 |
+
- label
|
| 179 |
+
- length
|
| 180 |
+
sampler:
|
| 181 |
+
name: RatioSampler
|
| 182 |
+
scales:
|
| 183 |
+
- - 128
|
| 184 |
+
- 32
|
| 185 |
+
first_bs: 128
|
| 186 |
+
fix_bs: false
|
| 187 |
+
divided_factor:
|
| 188 |
+
- 4
|
| 189 |
+
- 16
|
| 190 |
+
is_training: false
|
| 191 |
+
loader:
|
| 192 |
+
shuffle: false
|
| 193 |
+
drop_last: false
|
| 194 |
+
batch_size_per_card: 128
|
| 195 |
+
max_ratio: 8
|
| 196 |
+
num_workers: 4
|
| 197 |
+
filename: svtrv2_llada_fs_reflect_onedecoder_reflectwithoutmask_sample3_ch_3633
|
| 198 |
+
config: configs/rec/llada/svtrv2_llada_fs_reflect_onedecoder_reflectwithoutmask_sample3_ch_3633.yml
|
| 199 |
+
local_rank: '4'
|
| 200 |
+
eval: true
|
mdiff4str_small_ch/train.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
readme.md
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MDiff4STR
|
| 2 |
+
|
| 3 |
+
- [MDiff4STR](#mdiff4str)
|
| 4 |
+
- [1. Introduction](#1-introduction)
|
| 5 |
+
- [1.1 Models and Results](#11-models-and-results)
|
| 6 |
+
- [2. Environment](#2-environment)
|
| 7 |
+
- [3. Model Training / Evaluation](#3-model-training--evaluation)
|
| 8 |
+
- [Dataset Preparation](#dataset-preparation)
|
| 9 |
+
- [Training](#training)
|
| 10 |
+
- [Evaluation](#evaluation)
|
| 11 |
+
- [Inference](#inference)
|
| 12 |
+
- [Latency Measurement](#latency-measurement)
|
| 13 |
+
- [Citation](#citation)
|
| 14 |
+
|
| 15 |
+
<a name="1"></a>
|
| 16 |
+
|
| 17 |
+
## 1. Introduction
|
| 18 |
+
|
| 19 |
+
Paper:
|
| 20 |
+
|
| 21 |
+
> [MDiff4STR: Mask Diffusion Model for Scene Text Recognition](https://arxiv.org/abs/2512.01422)
|
| 22 |
+
> Yongkun Du, Miaomiao Zhao, Songlin Fan, Zhineng Chen\*, Caiyan Jia, Yu-Gang Jiang
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
<a name="model"></a>
|
| 26 |
+
Mask Diffusion Models (MDMs) have recently emerged as a promising alternative to auto-regressive models (ARMs) for vision-language tasks, owing to their flexible balance of efficiency and accuracy. In this paper, for the first time, we introduce MDMs into the Scene Text Recognition (STR) task. We show that vanilla MDM lags behind ARMs in terms of accuracy, although it improves recognition efficiency. To bridge this gap, we propose MDiff4STR, a Mask Diffusion model enhanced with two key improvement strategies tailored for STR. Specifically, we identify two key challenges in applying MDMs to STR: noising gap between training and inference, and overconfident predictions during inference. Both significantly hinder the performance of MDMs. To mitigate the first issue, we develop six noising strategies that better align training with inference behavior. For the second, we propose a token-replacement noise mechanism that provides a non-mask noise type, encouraging the model to reconsider and revise overly confident but incorrect predictions. We conduct extensive evaluations of MDiff4STR on both standard and challenging STR benchmarks, covering diverse scenarios including irregular, artistic, occluded, and Chinese text, as well as whether the use of pretraining. Across these settings, MDiff4STR consistently outperforms popular STR models, surpassing state-of-the-art ARMs in accuracy, while maintaining fast inference with only three denoising steps.
|
| 27 |
+
|
| 28 |
+
### 1.1 Models and Results
|
| 29 |
+
|
| 30 |
+
The accuracy (%) and model files of MDiff4STR on the public dataset of scene text recognition are as follows:
|
| 31 |
+
|
| 32 |
+
Download all Configs, Models, and Logs from [HuggingFace Model](https://huggingface.co/topdu/MDiff4STR).
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
- Test on Common Benchmarks from [PARSeq](https://github.com/baudm/parseq):
|
| 36 |
+
|
| 37 |
+
| Model | Training Data | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
|
| 38 |
+
| :------: | :----------------------------------------------------------: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :-----------------------------------------------------------------------: |
|
| 39 |
+
| MDiff4STR-B | Synthetic datasets (MJ+ST) | 97.7 | 94.0 | 97.3 | 88.1 | 91.2 | 95.8 | 94.02 | TODO |
|
| 40 |
+
| MDiff4STR-S | [Union14M-L-Filter](../../../docs/svtrv2.md#dataset-details) | 99.0 | 98.3 | 98.5 | 89.5 | 92.9 | 98.6 | 96.13 | [HuggingFace Model](https://huggingface.co/topdu/MDiff4STR) |
|
| 41 |
+
| MDiff4STR-B | [Union14M-L-Filter](../../../docs/svtrv2.md#dataset-details) | 99.2 | 98.0 | 98.7 | 91.1 | 93.5 | 99.0 | 96.57 | [HuggingFace Model](https://huggingface.co/topdu/MDiff4STR) |
|
| 42 |
+
|
| 43 |
+
- Test on Union14M-L benchmark from [Union14M](https://github.com/Mountchicken/Union14M/).
|
| 44 |
+
|
| 45 |
+
| Model | Training Data | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
|
| 46 |
+
| :------: | :----------------------------------------------------------: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---------------------: |
|
| 47 |
+
| MDiff4STR-B | Synthetic datasets (MJ+ST) | 74.6 | 25.2 | 57.6 | 69.7 | 77.9 | 68.0 | 66.9 | 62.83 | Same as the above table |
|
| 48 |
+
| MDiff4STR-S | [Union14M-L-Filter](../../../docs/svtrv2.md#dataset-details) | 88.3 | 84.6 | 76.5 | 84.3 | 83.3 | 85.4 | 83.5 | 83.70 | Same as the above table |
|
| 49 |
+
| MDiff4STR-B | [Union14M-L-Filter](../../../docs/svtrv2.md#dataset-details) | 90.6 | 89.0 | 79.3 | 86.1 | 86.2 | 86.7 | 85.1 | 86.14 | Same as the above table |
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
- Training and test on Chinese dataset, from [Chinese Benchmark](https://github.com/FudanVI/benchmarking-chinese-text-recognition).
|
| 53 |
+
|
| 54 |
+
| Model | Scene | Web | Document | Handwriting | Avg | Config&Model&Log |
|
| 55 |
+
| :------: | :---: | :--: | :------: | :---------: | :---: | :-----------------------------------------------------------------------------------------------------: |
|
| 56 |
+
| MDiff4STR-S | 81.1 | 81.2 | 99.3 | 65.0 | 81.64 | [Google drive](https://drive.google.com/drive/folders/1X3hqArfvRIRtuYLHDtSQheQmDc_oXpY6?usp=drive_link) |
|
| 57 |
+
| MDiff4STR-B | 83.5 | 83.3 | 99.5 | 67.0 | 83.31 | [Google drive](https://drive.google.com/drive/folders/1ZDECKXf8zZFhcKKKpvicg43Ho85uDZkF?usp=drive_link) |
|
| 58 |
+
|
| 59 |
+
<a name="2"></a>
|
| 60 |
+
|
| 61 |
+
## 2. Environment
|
| 62 |
+
|
| 63 |
+
- [PyTorch](http://pytorch.org/) version >= 1.13.0
|
| 64 |
+
- Python version >= 3.7
|
| 65 |
+
|
| 66 |
+
```shell
|
| 67 |
+
git clone -b develop https://github.com/Topdu/OpenOCR.git
|
| 68 |
+
cd OpenOCR
|
| 69 |
+
# Ubuntu 20.04 Cuda 11.8
|
| 70 |
+
conda create -n openocr python==3.8
|
| 71 |
+
conda activate openocr
|
| 72 |
+
conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=11.8 -c pytorch -c nvidia
|
| 73 |
+
pip install -r requirements.txt
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
<a name="3"></a>
|
| 77 |
+
|
| 78 |
+
## 3. Model Training / Evaluation
|
| 79 |
+
|
| 80 |
+
### Dataset Preparation
|
| 81 |
+
|
| 82 |
+
Referring to [Downloading Datasets](../../../docs/svtrv2.md#downloading-datasets)
|
| 83 |
+
|
| 84 |
+
### Training
|
| 85 |
+
|
| 86 |
+
```shell
|
| 87 |
+
# First stage
|
| 88 |
+
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 tools/train_rec.py --c configs/rec/svtrv2/svtrv2_rctc.yml
|
| 89 |
+
|
| 90 |
+
# Second stage
|
| 91 |
+
CUDA_VISIBLE_DEVICES=4,5,6,7 python -m torch.distributed.launch --master_port=23332 --nproc_per_node=4 tools/train_rec.py --c configs/rec/svtrv2/svtrv2_smtr_gtc_rctc.yml --o Global.pretrained_model=./output/rec/u14m_filter/svtrv2_rctc/best.pth
|
| 92 |
+
|
| 93 |
+
# For Multi RTX 4090
|
| 94 |
+
NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --master_port=23333 --nproc_per_node=4 tools/train_rec.py --c configs/rec/svtrv2/svtrv2_rctc.yml
|
| 95 |
+
# 20 epochs run for about 6 hours
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
### Evaluation
|
| 99 |
+
|
| 100 |
+
```shell
|
| 101 |
+
# short text: Common, Union14M-Benchmark, OST
|
| 102 |
+
python tools/eval_rec_all_en.py --c configs/rec/svtrv2/svtrv2_smtr_gtc_rctc_infer.yml
|
| 103 |
+
|
| 104 |
+
# long text: LTB
|
| 105 |
+
python tools/eval_rec_all_long.py --c configs/rec/svtrv2/svtrv2_smtr_gtc_rctc_infer.yml --o Eval.loader.max_ratio=20
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
After a successful run, the results are saved in a csv file in `output_dir` in the config file.
|
| 109 |
+
|
| 110 |
+
### Inference
|
| 111 |
+
|
| 112 |
+
```shell
|
| 113 |
+
python tools/infer_rec.py --c configs/rec/svtrv2/svtrv2_smtr_gtc_rctc_infer.yml --o Global.infer_img=/path/img_fold or /path/img_file
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
### Latency Measurement
|
| 117 |
+
|
| 118 |
+
First, download the IIIT5K images from [Google Drive](https://drive.google.com/drive/folders/1Po1LSBQb87DxGJuAgLNxhsJ-pdXxpIfS?usp=drive_link). Then, run the following command:
|
| 119 |
+
|
| 120 |
+
```shell
|
| 121 |
+
python tools/infer_rec.py --c configs/rec/SVTRv2/svtrv2_smtr_gtc_rctc_infer.yml --o Global.infer_img=../iiit5k_test_image
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
## Citation
|
| 125 |
+
|
| 126 |
+
If you find our method useful for your research, please cite:
|
| 127 |
+
|
| 128 |
+
```bibtex
|
| 129 |
+
@inproceedings{Du2025MDiff4STR,
|
| 130 |
+
title={MDiff4STR: Mask Diffusion Model for Scene Text Recognition},
|
| 131 |
+
author={Yongkun Du and Miaomiao Zhao and Songlin Fan and Zhineng Chen and Caiyan Jia and Yu-Gang Jiang},
|
| 132 |
+
booktitle={AAAI Oral},
|
| 133 |
+
year={2025},
|
| 134 |
+
}
|
| 135 |
+
```
|