In [None]:
# pip -q install sentencepiece
# pip -q install numpy
# pip -q install sentence_transformers
# pip -q install datasets
import sentencepiece as spm
import numpy as np
from datasets import load_dataset
from collections import Counter
from sentence_transformers import SentenceTransformer

In [2]:
# Pretrained sentence-embedding model; its encode() output is what gets
# binarized and tokenized in the cells below.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
# Fetch the GoEmotions dataset and keep only the text column of its train split.
dataset = load_dataset("go_emotions")
train_split = dataset["train"]
texts = train_split["text"]

# Quick look at the first ten entries.
print(texts[:10])

["My favourite food is anything I didn't have to cook myself.", 'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead', 'WHY THE FUCK IS BAYLESS ISOING', 'To make her feel threatened', 'Dirty Southern Wankers', "OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe PlAyOfFs! Dumbass Broncos fans circa December 2015.", 'Yes I heard abt the f bombs! That has to be why. Thanks for your reply:) until then hubby and I will anxiously wait 😝', 'We need more boards and to create a bit more space for [NAME]. Then we’ll be good.', 'Damn youtube and outrage drama is super lucrative for reddit', 'It might be linked to the trust factor of your friend.']


In [None]:
def binarize(embeddings, sensitivity=0.1):
	"""Threshold an array of embedding values into 0/1 flags.

	Args:
		embeddings: NumPy array of values to threshold (any shape).
		sensitivity: Cut-off value; entries greater than or equal to it
			become 1, everything else becomes 0.

	Returns:
		Integer array with the same shape as ``embeddings``.
	"""
	at_or_above = embeddings >= sensitivity
	return np.where(at_or_above, 1, 0)

def preprocess(strings):
	"""Render rows of values as newline-separated text.

	Args:
		strings: Iterable of rows; each row is itself an iterable of
			values (for example the 0/1 rows produced by ``binarize``).

	Returns:
		A single ``str``: every row's items are converted with ``str()``
		and concatenated, and the rows are joined with a newline
		character. There is no trailing newline; empty input gives "".
	"""
	# The original body iterated over an undefined name (`processed_string`)
	# and raised NameError; the rows come from the `strings` parameter.
	return "\n".join("".join(map(str, row)) for row in strings)

# Obtain sentence embeddings
embeddings = model.encode(texts)
# Threshold each embedding value to 0/1 using binarize's default sensitivity.
binary_hashes = binarize(embeddings)
# Turn the 0/1 rows into text via preprocess (presumably one line per input text — verify).
binary_string = preprocess(binary_hashes)
# Preview only the first 500 characters instead of dumping the whole string.
print(binary_string[:500])

In [None]:
# Dump the bit-string text to passage.txt in the working directory.
with open("passage.txt", "w") as passage_file:
	passage_file.write(binary_string)

In [2]:
# Training options documentation: https://github.com/google/sentencepiece/blob/master/doc/options.md
# Author's note: training takes 3 hours to complete on GTX 1650 mobile
trainer_options = dict(
	input='passage.txt',
	model_prefix='384_bit_comp',
	# 256 usable pieces plus 3 ids set aside for <unk>, <s>, </s>
	vocab_size=256 + 3,
	character_coverage=1.00,
	max_sentencepiece_length=384,
	model_type='unigram',
)
spm.SentencePieceTrainer.train(**trainer_options)

In [6]:
# Load the tokenizer model file that matches the training cell's model_prefix ('384_bit_comp').
# NOTE(review): the variable is called "bpe", but the training cell uses model_type='unigram'.
bpe_processor = spm.SentencePieceProcessor(model_file='384_bit_comp.model')

def encode_id(bit_text):
	"""Tokenize a bit string and pack the piece ids into a hex string.

	Each piece id is shifted down by 3 (the ids the training cell sets
	aside for <unk>, <s>, </s>) so that it fits in one byte, then written
	as two lowercase hex digits.

	Args:
		bit_text: String to tokenize with ``bpe_processor``.

	Returns:
		Hex string with exactly two characters per piece; "" when the
		tokenizer yields no pieces.

	Raises:
		ValueError: If any shifted id falls outside 0..255 (for example an
			unknown piece, whose id maps below zero), because it cannot be
			stored in a single byte.
	"""
	reserved_ids = 3
	pieces = bpe_processor.encode_as_pieces(bit_text)
	byte_ids = [bpe_processor.piece_to_id(piece) - reserved_ids for piece in pieces]

	# The original check used any(), which passed as soon as a single id was
	# small enough and failed on empty input; every id must fit in a byte.
	out_of_range = [id_ for id_ in byte_ids if not 0 <= id_ <= 255]
	if out_of_range:
		raise ValueError(f"piece ids do not fit in one byte: {out_of_range}")

	return "".join(format(id_, "02x") for id_ in byte_ids)

def decode_id(hex_string):
	"""Turn a hex string of one-byte ids back into tokenizer pieces.

	Args:
		hex_string: Hex text with two digits per piece, each byte being a
			piece id shifted down by 3.

	Returns:
		List of piece strings looked up through ``bpe_processor``, in order.

	Raises:
		ValueError: If ``hex_string`` is not valid hexadecimal
			(raised by ``bytes.fromhex``).
	"""
	reserved_ids = 3
	# Iterating over bytes yields plain Python ints, so adding the offset
	# cannot wrap around. The previous uint8 array arithmetic turned bytes
	# 0xfd..0xff into ids 0..2 instead of 256..258.
	return [
		bpe_processor.id_to_piece(byte + reserved_ids)
		for byte in bytes.fromhex(hex_string)
	]

# Encode text
# Sample input: one string made of '0'/'1' characters.
new_sentence = "000000000000000000000010000000000000000000000000000000100010010000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000100000000000000000000000000100000000000000000000000000000000000000100000000000001000000000000000000000000001000001000"
# Pieces straight from the tokenizer; used as the reference for the round trip.
encoded_tokens = bpe_processor.encode_as_pieces(new_sentence)
# Pack the pieces into hex, then unpack the hex back into pieces.
encoded_ids = encode_id(new_sentence)
decoded_tokens = decode_id(encoded_ids)

print("length:", len(encoded_tokens))
print("encoded_tokens:", encoded_tokens)
print("encoded_ids:", encoded_ids)
# Round-trip check: the decoded pieces should equal the tokenizer's own pieces.
print("same?:", encoded_tokens == decoded_tokens)

# How often each piece occurs in this one input.
count = Counter(encoded_tokens)
print("count:", count)

length: 13
encoded_tokens: ['▁0000000', '0000000000000001000000000000000000000', '00000000001000100', '1000000', '00000000000000000000000000000001000000000000000000000000000000000000000000000000000000', '00000000000000000001000000000000000000000000000000000', '0000000000000000000000000000000001000', '00000000000000000000000100000000000000000', '00000000010', '0000000000000000000000000000000000000100', '00000000000100000000000000000', '00000000010', '00001000']
encoded_ids: 1ab2ed09d7a9617206894e0608
same?: True
count: Counter({'00000000010': 2, '▁0000000': 1, '0000000000000001000000000000000000000': 1, '00000000001000100': 1, '1000000': 1, '00000000000000000000000000000001000000000000000000000000000000000000000000000000000000': 1, '00000000000000000001000000000000000000000000000000000': 1, '0000000000000000000000000000000001000': 1, '00000000000000000000000100000000000000000': 1, '0000000000000000000000000000000000000100': 1, '00000000000100000000000000000': 1, '00001000': 1})
