sergabrr committed on
Commit 6875c03 · verified · 1 Parent(s): 84354c6

Upload 12 files
.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,531 @@
---
license: mit
language:
- en
- ru
metrics:
- accuracy
- f1
- recall
library_name: transformers
pipeline_tag: sentence-similarity
tags:
- mteb
- retrieval
- retriever
- pruned
- e5
- sentence-transformers
- feature-extraction
- sentence-similarity
model-index:
- name: e5-large-en-ru
  results:
  - task:
      type: Classification
    dataset:
      type: mteb/amazon_counterfactual
      name: MTEB AmazonCounterfactualClassification (en)
      config: en
      split: test
      revision: e8379541af4e31359cca9fbcf4b00f2671dba205
    metrics:
    - type: accuracy
      value: 79.5671641791045
    - type: ap
      value: 44.011060753169424
    - type: f1
      value: 73.76504135120175
  - task:
      type: Reranking
    dataset:
      type: mteb/askubuntudupquestions-reranking
      name: MTEB AskUbuntuDupQuestions
      config: default
      split: test
      revision: 2000358ca161889fa9c082cb41daa8dcfb161a54
    metrics:
    - type: map
      value: 57.69669466706412
    - type: mrr
      value: 70.61370531592138
  - task:
      type: STS
    dataset:
      type: mteb/biosses-sts
      name: MTEB BIOSSES
      config: default
      split: test
      revision: d3fb88f8f02e40887cd149695127462bbcf29b4a
    metrics:
    - type: cos_sim_pearson
      value: 86.36465960226795
    - type: cos_sim_spearman
      value: 84.57602350761223
    - type: euclidean_pearson
      value: 84.31391364490506
    - type: euclidean_spearman
      value: 84.57602350761223
    - type: manhattan_pearson
      value: 84.15796224236456
    - type: manhattan_spearman
      value: 84.3645729064343
  - task:
      type: Reranking
    dataset:
      type: mteb/mind_small
      name: MTEB MindSmallReranking
      config: default
      split: test
      revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69
    metrics:
    - type: map
      value: 31.105698873583098
    - type: mrr
      value: 32.163780846856206
  - task:
      type: STS
    dataset:
      type: mteb/sickr-sts
      name: MTEB SICK-R
      config: default
      split: test
      revision: a6ea5a8cab320b040a23452cc28066d9beae2cee
    metrics:
    - type: cos_sim_pearson
      value: 83.75973907678062
    - type: cos_sim_spearman
      value: 80.54994608351296
    - type: euclidean_pearson
      value: 80.58496551316748
    - type: euclidean_spearman
      value: 80.54993996457814
    - type: manhattan_pearson
      value: 80.49280884070782
    - type: manhattan_spearman
      value: 80.41230093993471
  - task:
      type: STS
    dataset:
      type: mteb/sts12-sts
      name: MTEB STS12
      config: default
      split: test
      revision: a0d554a64d88156834ff5ae9920b964011b16384
    metrics:
    - type: cos_sim_pearson
      value: 87.345503928209
    - type: cos_sim_spearman
      value: 80.4634619001261
    - type: euclidean_pearson
      value: 84.2666575030677
    - type: euclidean_spearman
      value: 80.46347579495351
    - type: manhattan_pearson
      value: 84.14370038922885
    - type: manhattan_spearman
      value: 80.36565043629274
  - task:
      type: STS
    dataset:
      type: mteb/sts13-sts
      name: MTEB STS13
      config: default
      split: test
      revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca
    metrics:
    - type: cos_sim_pearson
      value: 75.14644787456163
    - type: cos_sim_spearman
      value: 75.88443166051762
    - type: euclidean_pearson
      value: 76.19117255044588
    - type: euclidean_spearman
      value: 75.88443166051762
    - type: manhattan_pearson
      value: 76.00450128624708
    - type: manhattan_spearman
      value: 75.69943934692938
  - task:
      type: STS
    dataset:
      type: mteb/sts14-sts
      name: MTEB STS14
      config: default
      split: test
      revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375
    metrics:
    - type: cos_sim_pearson
      value: 77.60763524019471
    - type: cos_sim_spearman
      value: 77.2591077818027
    - type: euclidean_pearson
      value: 77.14021401348042
    - type: euclidean_spearman
      value: 77.25911027186999
    - type: manhattan_pearson
      value: 76.87139081109731
    - type: manhattan_spearman
      value: 76.98379627773018
  - task:
      type: STS
    dataset:
      type: mteb/sts15-sts
      name: MTEB STS15
      config: default
      split: test
      revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3
    metrics:
    - type: cos_sim_pearson
      value: 88.18321035966198
    - type: cos_sim_spearman
      value: 89.0469892725742
    - type: euclidean_pearson
      value: 88.05085809092137
    - type: euclidean_spearman
      value: 89.04698194601134
    - type: manhattan_pearson
      value: 88.03620967628684
    - type: manhattan_spearman
      value: 89.02859425307943
  - task:
      type: STS
    dataset:
      type: mteb/sts16-sts
      name: MTEB STS16
      config: default
      split: test
      revision: 4d8694f8f0e0100860b497b999b3dbed754a0513
    metrics:
    - type: cos_sim_pearson
      value: 82.39166503459249
    - type: cos_sim_spearman
      value: 83.71826060604693
    - type: euclidean_pearson
      value: 82.70145770530107
    - type: euclidean_spearman
      value: 83.71826045549452
    - type: manhattan_pearson
      value: 82.56870669205291
    - type: manhattan_spearman
      value: 83.55353737670136
  - task:
      type: STS
    dataset:
      type: mteb/sts17-crosslingual-sts
      name: MTEB STS17 (en-en)
      config: en-en
      split: test
      revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
    metrics:
    - type: cos_sim_pearson
      value: 89.58290721169323
    - type: cos_sim_spearman
      value: 89.25956993522081
    - type: euclidean_pearson
      value: 89.4716703635447
    - type: euclidean_spearman
      value: 89.25956993522081
    - type: manhattan_pearson
      value: 89.4475864648432
    - type: manhattan_spearman
      value: 89.14694174575615
  - task:
      type: Reranking
    dataset:
      type: mteb/scidocs-reranking
      name: MTEB SciDocsRR
      config: default
      split: test
      revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab
    metrics:
    - type: map
      value: 81.4879065181404
    - type: mrr
      value: 94.81295937178291
  - task:
      type: PairClassification
    dataset:
      type: mteb/sprintduplicatequestions-pairclassification
      name: MTEB SprintDuplicateQuestions
      config: default
      split: test
      revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46
    metrics:
    - type: cos_sim_accuracy
      value: 99.73960396039604
    - type: cos_sim_ap
      value: 92.70840767967965
    - type: cos_sim_f1
      value: 86.90890990542557
    - type: cos_sim_precision
      value: 86.5213082259663
    - type: cos_sim_recall
      value: 87.3
    - type: dot_accuracy
      value: 99.73960396039604
    - type: dot_ap
      value: 92.70828452993575
    - type: dot_f1
      value: 86.90890990542557
    - type: dot_precision
      value: 86.5213082259663
    - type: dot_recall
      value: 87.3
    - type: euclidean_accuracy
      value: 99.73960396039604
    - type: euclidean_ap
      value: 92.7084093403562
    - type: euclidean_f1
      value: 86.90890990542557
    - type: euclidean_precision
      value: 86.5213082259663
    - type: euclidean_recall
      value: 87.3
    - type: manhattan_accuracy
      value: 99.74059405940594
    - type: manhattan_ap
      value: 92.7406819850299
    - type: manhattan_f1
      value: 87.01234567901234
    - type: manhattan_precision
      value: 85.95121951219512
    - type: manhattan_recall
      value: 88.1
    - type: max_accuracy
      value: 99.74059405940594
    - type: max_ap
      value: 92.7406819850299
    - type: max_f1
      value: 87.01234567901234
  - task:
      type: Reranking
    dataset:
      type: mteb/stackoverflowdupquestions-reranking
      name: MTEB StackOverflowDupQuestions
      config: default
      split: test
      revision: e185fbe320c72810689fc5848eb6114e1ef5ec69
    metrics:
    - type: map
      value: 48.566931484512196
    - type: mrr
      value: 49.23111100500807
  - task:
      type: PairClassification
    dataset:
      type: mteb/twittersemeval2015-pairclassification
      name: MTEB TwitterSemEval2015
      config: default
      split: test
      revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1
    metrics:
    - type: cos_sim_accuracy
      value: 86.27287357692079
    - type: cos_sim_ap
      value: 74.20855854505362
    - type: cos_sim_f1
      value: 69.09903201787044
    - type: cos_sim_precision
      value: 65.22961574507966
    - type: cos_sim_recall
      value: 73.45646437994723
    - type: dot_accuracy
      value: 86.27287357692079
    - type: dot_ap
      value: 74.20853189774614
    - type: dot_f1
      value: 69.09903201787044
    - type: dot_precision
      value: 65.22961574507966
    - type: dot_recall
      value: 73.45646437994723
    - type: euclidean_accuracy
      value: 86.27287357692079
    - type: euclidean_ap
      value: 74.20857455896677
    - type: euclidean_f1
      value: 69.09903201787044
    - type: euclidean_precision
      value: 65.22961574507966
    - type: euclidean_recall
      value: 73.45646437994723
    - type: manhattan_accuracy
      value: 86.2192287059665
    - type: manhattan_ap
      value: 74.0513280969461
    - type: manhattan_f1
      value: 69.13344473621389
    - type: manhattan_precision
      value: 63.12118570183086
    - type: manhattan_recall
      value: 76.41160949868075
    - type: max_accuracy
      value: 86.27287357692079
    - type: max_ap
      value: 74.20857455896677
    - type: max_f1
      value: 69.13344473621389
  - task:
      type: PairClassification
    dataset:
      type: mteb/twitterurlcorpus-pairclassification
      name: MTEB TwitterURLCorpus
      config: default
      split: test
      revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf
    metrics:
    - type: cos_sim_accuracy
      value: 89.16055419722902
    - type: cos_sim_ap
      value: 86.03614264194854
    - type: cos_sim_f1
      value: 78.89855695205357
    - type: cos_sim_precision
      value: 73.74656938215409
    - type: cos_sim_recall
      value: 84.82445334154605
    - type: dot_accuracy
      value: 89.16055419722902
    - type: dot_ap
      value: 86.03614225282097
    - type: dot_f1
      value: 78.89855695205357
    - type: dot_precision
      value: 73.74656938215409
    - type: dot_recall
      value: 84.82445334154605
    - type: euclidean_accuracy
      value: 89.16055419722902
    - type: euclidean_ap
      value: 86.0361548355667
    - type: euclidean_f1
      value: 78.89855695205357
    - type: euclidean_precision
      value: 73.74656938215409
    - type: euclidean_recall
      value: 84.82445334154605
    - type: manhattan_accuracy
      value: 89.11786393448985
    - type: manhattan_ap
      value: 86.00799361972808
    - type: manhattan_f1
      value: 78.84721152788472
    - type: manhattan_precision
      value: 75.26776338816941
    - type: manhattan_recall
      value: 82.78410840776101
    - type: max_accuracy
      value: 89.16055419722902
    - type: max_ap
      value: 86.0361548355667
    - type: max_f1
      value: 78.89855695205357
---

# E5-large-en-ru

## Model info

This is a vocabulary-pruned version of [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large).

It keeps only the Russian and English tokens.

### Size

| | intfloat/multilingual-e5-large | d0rj/e5-large-en-ru |
| --- | --- | --- |
| Model size (MB) | 2135.82 | 1394.8 |
| Params (count) | 559,890,946 | 365,638,146 |
| Word embedding params (count) | 256,002,048 | 61,749,248 |
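
The size reduction comes almost entirely from shrinking the token-embedding matrix. Below is a minimal sketch of how this kind of vocabulary pruning can be done; it is not the exact script used for this model, and `corpus_sample` stands in for the large English + Russian corpus that would be tokenized to decide which ids to keep.

```python
import torch
from transformers import AutoModel, AutoTokenizer

src = 'intfloat/multilingual-e5-large'
tokenizer = AutoTokenizer.from_pretrained(src)
model = AutoModel.from_pretrained(src)

# Collect the token ids that actually occur in the target languages
# (in practice, a large English + Russian corpus, not two sentences).
corpus_sample = [
    'query: Where was the first trolleybus created?',
    'query: Где был создан первый троллейбус?',
]
ids_seen = {i for text in corpus_sample for i in tokenizer(text)['input_ids']}
kept_ids = sorted(set(tokenizer.all_special_ids) | ids_seen)

# Slice the embedding matrix down to the kept rows.
old_embeddings = model.get_input_embeddings().weight.data
new_embeddings = torch.nn.Embedding(len(kept_ids), old_embeddings.size(1))
new_embeddings.weight.data.copy_(old_embeddings[kept_ids])

model.set_input_embeddings(new_embeddings)
model.config.vocab_size = len(kept_ids)
# The sentencepiece vocabulary must also be rebuilt so that token ids
# match the new embedding rows before saving the tokenizer and model.
```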

### Performance

Performance on the SberQuAD dev benchmark is essentially unchanged.

| Metric on SberQuAD (4122 questions) | intfloat/multilingual-e5-large | d0rj/e5-large-en-ru |
| --- | --- | --- |
| recall@3 | 0.787239204269772 | **0.7882096069868996** |
| map@3 | 0.7230713245997101 | **0.723192624939351** |
| mrr@3 | 0.7241630276564784 | **0.7243651948892132** |
| recall@5 | 0.8277535177098496 | **0.8284813197476953** |
| map@5 | 0.7301603186155587 | **0.7302573588872716** |
| mrr@5 | 0.7334667637069385 | **0.7335718906679607** |
| recall@10 | **0.8716642406598738** | 0.871421639980592 |
| map@10 | **0.7314774917730316** | 0.7313000338687417 |
| mrr@10 | **0.7392223685527911** | 0.7391814537556898 |
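
For reference, the per-question quantities behind these metrics can be computed as in the sketch below (one common normalization for AP@k); the exact evaluation script used for the numbers above may differ.

```python
def recall_at_k(relevant: set, ranked: list, k: int) -> float:
    # Fraction of the relevant passages that appear in the top-k results.
    return len(set(ranked[:k]) & relevant) / len(relevant)

def reciprocal_rank_at_k(relevant: set, ranked: list, k: int) -> float:
    # 1 / rank of the first relevant passage within the top-k, else 0.
    for rank, doc_id in enumerate(ranked[:k], start=1):
        if doc_id in relevant:
            return 1.0 / rank
    return 0.0

def average_precision_at_k(relevant: set, ranked: list, k: int) -> float:
    # Mean of the precision values at the ranks where relevant passages occur.
    hits, score = 0, 0.0
    for rank, doc_id in enumerate(ranked[:k], start=1):
        if doc_id in relevant:
            hits += 1
            score += hits / rank
    return score / min(len(relevant), k)

# recall@k / mrr@k / map@k are these values averaged over all 4122 questions.
```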

## Usage

- Use **dot product** distance for retrieval.

- Use the "query: " and "passage: " prefixes, respectively, for asymmetric tasks such as passage retrieval in open QA or ad-hoc information retrieval.

- Use the "query: " prefix for symmetric tasks such as semantic similarity, bitext mining, and paraphrase retrieval.

- Use the "query: " prefix if you want to use embeddings as features, e.g. for linear-probing classification or clustering.

### transformers

#### Direct usage

```python
import torch.nn.functional as F
from torch import Tensor
from transformers import XLMRobertaTokenizer, XLMRobertaModel


def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


input_texts = [
    'query: How does a corporate website differ from a business card website?',
    'query: Где был создан первый троллейбус?',
    'passage: The first trolleybus was created in Germany by engineer Werner von Siemens, probably influenced by the idea of his brother, Dr. Wilhelm Siemens, who lived in England, expressed on May 18, 1881 at the twenty-second meeting of the Royal Scientific Society. The electrical circuit was carried out by an eight-wheeled cart (Kontaktwagen) rolling along two parallel contact wires. The wires were located quite close to each other, and in strong winds they often overlapped, which led to short circuits. An experimental trolleybus line with a length of 540 m (591 yards), opened by Siemens & Halske in the Berlin suburb of Halensee, operated from April 29 to June 13, 1882.',
    'passage: Корпоративный сайт — содержит полную информацию о компании-владельце, услугах/продукции, событиях в жизни компании. Отличается от сайта-визитки и представительского сайта полнотой представленной информации, зачастую содержит различные функциональные инструменты для работы с контентом (поиск и фильтры, календари событий, фотогалереи, корпоративные блоги, форумы). Может быть интегрирован с внутренними информационными системами компании-владельца (КИС, CRM, бухгалтерскими системами). Может содержать закрытые разделы для тех или иных групп пользователей — сотрудников, дилеров, контрагентов и пр.',
]

tokenizer = XLMRobertaTokenizer.from_pretrained('d0rj/e5-large-en-ru', use_cache=False)
model = XLMRobertaModel.from_pretrained('d0rj/e5-large-en-ru', use_cache=False)

batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:2] @ embeddings[2:].T) * 100
print(scores.tolist())
# [[68.59542846679688, 81.75910949707031], [80.36100769042969, 64.77748107910156]]
```

#### Pipeline

```python
from transformers import pipeline


pipe = pipeline('feature-extraction', model='d0rj/e5-large-en-ru')
embeddings = pipe(input_texts, return_tensors=True)
embeddings[0].size()
# torch.Size([1, 17, 1024])
```

### sentence-transformers

```python
from sentence_transformers import SentenceTransformer


sentences = [
    'query: Что такое круглые тензоры?',
    'passage: Abstract: we introduce a novel method for compressing round tensors based on their inherent radial symmetry. We start by generalising PCA and eigen decomposition on round tensors...',
]

model = SentenceTransformer('d0rj/e5-large-en-ru')
embeddings = model.encode(sentences, convert_to_tensor=True)
embeddings.size()
# torch.Size([2, 1024])
```
config.json ADDED
@@ -0,0 +1,28 @@
{
  "_name_or_path": "d0rj/e5-large-en-ru",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.30.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 60302
}
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
{
  "__version__": {
    "sentence_transformers": "2.2.2",
    "transformers": "4.30.1",
    "pytorch": "1.12.1"
  }
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d0a7ecf8d203a2b7931d6ed58a4235464429fd8c41c1937056d257850ed1a950
size 1462599688
modules.json ADDED
@@ -0,0 +1,20 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "2",
    "path": "2_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]
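
These entries describe the sentence-transformers pipeline saved with the model: the transformer encoder, mean pooling, and L2 normalization. A rough sketch of assembling the same pipeline explicitly, assuming the standard sentence-transformers modules API (loading `SentenceTransformer('d0rj/e5-large-en-ru')` already does this from the saved files):

```python
from sentence_transformers import SentenceTransformer, models

# Mirror the three entries in modules.json: Transformer -> Pooling -> Normalize.
word = models.Transformer('d0rj/e5-large-en-ru')
pooling = models.Pooling(word.get_word_embedding_dimension(), pooling_mode='mean')
normalize = models.Normalize()
model = SentenceTransformer(modules=[word, pooling, normalize])
```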
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a900d8829b407aaadc83b6315504ba1acdfde420b5e2288c706a0215c6b11ddb
size 1462678449
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 514,
  "do_lower_case": false
}
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:34e9d938fbab77d2bb4acd48953c66bb7ad2d0b675b4c0911e6ed25caf20acd6
size 1270564
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
{
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "unk_token": "<unk>"
}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
{
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": {
    "__type": "AddedToken",
    "content": "<mask>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "sp_model_kwargs": {},
  "tokenizer_class": "XLMRobertaTokenizer",
  "unk_token": "<unk>"
}