Spaces:

cuongnguyen910
/

topic-clustering-global-dashboard

Build error

App Files Files Community

topic-clustering-global-dashboard / merge_topic.py

cuongnguyen910

Upload folder using huggingface_hub

5120311 verified over 1 year ago

raw

history blame contribute delete

4.4 kB

	import json
	import os

	# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

	from function.topic_clustering import model, AgglomerativeClustering

	def check_duplicate_title_domain(docs):
	    """Drop documents whose domain + title combination was already seen.

	    The first occurrence of every combination is kept and the input order
	    is preserved. A missing ``domain`` or ``title`` key counts as an empty
	    string.

	    Args:
	        docs: sequence of dict-like documents (anything exposing ``.get``).

	    Returns:
	        A new list with the first document of each domain/title
	        combination; ``docs`` itself is left untouched.
	    """
	    seen_keys = set()
	    lst_filter_docs = []
	    for doc in docs:
	        # Same key format as before: "<domain> <title>".
	        key = f"{doc.get('domain', '')} {doc.get('title','')}"
	        if key in seen_keys:
	            continue
	        # Every document is checked, including the very first one; the
	        # previous index-based scan started at 1 and so never removed
	        # duplicates of docs[0].
	        seen_keys.add(key)
	        lst_filter_docs.append(doc)
	    return lst_filter_docs

	def main(req):
	    """Merge per-date topic clusters into ranked global clusters.

	    Each topic found under ``req['preprocess'][*]['topic']`` is labelled
	    with the title and snippet of its first document. The labels are
	    clustered with ``sbert_clustering`` and the topics that share a label
	    cluster are merged into a single document list.

	    Args:
	        req: request dict. ``req['type']`` is required; the value
	            ``'monthly'`` allows up to 50 result clusters, anything else
	            up to 20. ``req['preprocess']`` (optional) is an iterable of
	            dicts whose optional ``'topic'`` entry maps a topic id to a
	            list of document dicts.

	    Returns:
	        dict mapping a rank (0 = largest ``num_docs``) to the merged
	        document list of that cluster. Every returned document has its
	        ``'num_docs'`` overwritten with the merged cluster total, so the
	        input documents are mutated in place.

	    Raises:
	        KeyError: if ``req`` has no ``'type'`` entry.

	    Side effects:
	        Prints the label clustering and writes the result to ``zzz.json``
	        in the current working directory.
	    """
	    # Named request_type so the builtin `type` is not shadowed.
	    request_type = req['type']
	    MAX_CLUSTER = 50 if request_type == 'monthly' else 20
	    MAX_NUM_DOC_PER_CLUSTER = 50
	    threshold = 0.4

	    lst_labels = []
	    lst_topics = []
	    for date_clusters in req.get('preprocess', []):
	        topic = date_clusters.get('topic', [])
	        if not topic:
	            continue
	        for topic_id in topic:
	            topic_docs = topic[topic_id]
	            if not topic_docs:
	                # An empty topic has no first document to label it with
	                # (indexing [0] used to raise IndexError here); skip it.
	                continue
	            lst_topics.append(topic_docs)
	            first_doc = topic_docs[0]
	            lst_labels.append('. '.join([first_doc.get('title',''), first_doc.get('snippet','')]))

	    final_clusters = []
	    # lst_labels and lst_topics are index-aligned, so the ids returned by
	    # the clustering index straight into lst_topics.
	    label_clusters = sbert_clustering(lst_labels, distance_threshold=threshold,return_ids=True)

	    print(label_clusters)

	    if label_clusters:
	        for topic_ids in label_clusters.values():
	            merge_clusters = []
	            num_docs = 0
	            for topic_id in topic_ids:
	                topic_docs = lst_topics[topic_id]
	                # A topic counts as one document unless its first doc
	                # already carries a 'num_docs' total.
	                num_docs += topic_docs[0].get('num_docs',1)
	                merge_clusters.extend(topic_docs)

	            # Newest first (stable for equal timestamps), then de-duplicate
	            # and cap the cluster size.
	            merge_clusters = sorted(merge_clusters, key=lambda x: x.get('created_time',0), reverse=True)
	            merge_clusters = check_duplicate_title_domain(merge_clusters)
	            merge_clusters = merge_clusters[:MAX_NUM_DOC_PER_CLUSTER]
	            for doc in merge_clusters:
	                doc['num_docs'] = num_docs
	            final_clusters.append(merge_clusters)

	    # Largest clusters first; every doc of a cluster holds the same total,
	    # so the first doc is representative.
	    final_clusters.sort(key=lambda cluster: cluster[0]['num_docs'], reverse=True)
	    final_result = dict(enumerate(final_clusters[:MAX_CLUSTER]))

	    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
	    # opened as UTF-8 explicitly instead of relying on the locale default.
	    with open('zzz.json','w', encoding='utf-8') as f:
	        json.dump(final_result, f, ensure_ascii=False)
	    return final_result

	def get_sbert_embedding(lst_sentence):
	    """Encode ``lst_sentence`` with the module-level ``model``.

	    Thin wrapper around ``model.encode``; the result is returned unchanged.
	    NOTE(review): ``model`` comes from ``function.topic_clustering`` and is
	    presumably a sentence-embedding model -- confirm the returned type there.
	    """
	    return model.encode(lst_sentence)

	def sbert_clustering(lst_sentence, distance_threshold=0.25, return_ids = False):
	    """Group sentences by agglomerative clustering of their embeddings.

	    Underscores in every sentence are replaced by spaces before embedding.
	    Clustering uses cosine distance with complete linkage and no fixed
	    cluster count; ``distance_threshold`` decides where merging stops.

	    Args:
	        lst_sentence: list of strings to cluster.
	        distance_threshold: linkage distance passed to the clusterer.
	        return_ids: when true, clusters hold the positions of the
	            sentences in ``lst_sentence`` instead of the (underscore-
	            replaced) sentences themselves.

	    Returns:
	        ``None`` for empty input; otherwise a dict mapping cluster label to
	        its members, ordered from the largest cluster to the smallest
	        (ties keep ascending label order).
	    """
	    lst_sentence = [sen.replace('_',' ') for sen in lst_sentence]
	    if not lst_sentence:
	        return None
	    if len(lst_sentence) == 1:
	        # A single sample cannot be clustered; it forms cluster 0 alone.
	        return {0: [0]} if return_ids else {0: lst_sentence}

	    embs = get_sbert_embedding(lst_sentence)

	    clusteror_kwargs = dict(
	        n_clusters=None,
	        compute_full_tree=True,
	        linkage='complete',
	        distance_threshold=distance_threshold,
	    )
	    # NOTE(review): assumes AgglomerativeClustering is scikit-learn's class
	    # re-exported by function.topic_clustering. Its `affinity` keyword was
	    # renamed to `metric` (deprecated in 1.2, removed in 1.4), so prefer the
	    # new name and fall back for releases that only know the old one.
	    try:
	        hyer_clusteror = AgglomerativeClustering(metric='cosine', **clusteror_kwargs)
	    except TypeError:
	        hyer_clusteror = AgglomerativeClustering(affinity='cosine', **clusteror_kwargs)
	    hyer_clusteror.fit(embs)

	    # Pre-create every label so the key order is 0..n_clusters_-1, then fill
	    # the clusters in one pass over the assigned labels.
	    clusters = {label: [] for label in range(hyer_clusteror.n_clusters_)}
	    for idx, label in enumerate(hyer_clusteror.labels_):
	        clusters[int(label)].append(idx if return_ids else lst_sentence[idx])

	    # Stable sort: biggest clusters first.
	    return dict(sorted(clusters.items(), key=lambda item: -len(item[1])))

	if __name__ == '__main__':
	    # Script entry point: load a request from input_merge.json in the
	    # current working directory and run the merge on it.
	    # JSON text is read as UTF-8 explicitly rather than with the
	    # platform's locale encoding.
	    with open("input_merge.json",'r', encoding='utf-8') as f:
	        req = json.load(f)
	    main(req)