import os
import re
import torch
import gradio as gr
from tqdm import tqdm
from datasets import load_dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer

# Automatically detect GPU or use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Default model path
model_tokenizer_path = "zehui127/Omni-DNA-Multitask"

# Load tokenizer and model with trusted remote code
tokenizer = AutoTokenizer.from_pretrained(model_tokenizer_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_tokenizer_path, trust_remote_code=True).to(device)

# List of available tasks
tasks = ['H3', 'H4', 'H3K9ac', 'H3K14ac', 'H4ac', 'H3K4me1', 'H3K4me2', 'H3K4me3', 'H3K36me3', 'H3K79me3']

# Map the predicted token to a human-readable statement
mapping = {
    '1': 'It is a',
    '0': 'It is not a',
    'No valid prediction': 'Cannot be determined whether or not it is a',
}

def preprocess_response(response, mask_token="[MASK]"):
    """Extracts the portion of the response after the [MASK] token."""
    if mask_token in response:
        response = response.split(mask_token, 1)[1]
    # Drop leading whitespace and any echoed nucleotide characters (A/T/G/C)
    response = re.sub(r'^[\sATGC]+', '', response)
    return response

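# Illustrative sketch (assumption, not a captured model output): on a hypothetical
# decoded reply such as "ACGTACGTH3[MASK] 1", preprocess_response keeps only what
# follows "[MASK]" and strips leading whitespace/nucleotides, returning "1".
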
def generate(dna_sequence, task_type, sample_num=1):
    """
    Generates a prediction for the input DNA sequence and selected task.

    Args:
        dna_sequence (str): The input DNA sequence.
        task_type (str): The selected task type.
        sample_num (int): Maximum number of new tokens to generate.

    Returns:
        str: Predicted function label.
    """
    if task_type is None:
        task_type = 'H3'
    # Build the prompt: sequence, task name, then a [MASK] token for the model to fill in
    dna_sequence = dna_sequence + task_type + "[MASK]"
    tokenized_message = tokenizer(
        [dna_sequence], return_tensors='pt', return_token_type_ids=False, add_special_tokens=True
    ).to(device)
    # Greedy decoding (do_sample=False) of the answer token(s)
    response = model.generate(**tokenized_message, max_new_tokens=sample_num, do_sample=False)
    reply = tokenizer.batch_decode(response, skip_special_tokens=False)[0].replace(" ", "")
    pred = extract_label(reply, task_type)
    return f"{mapping[pred]} {task_type}"

def extract_label(message, task_type):
    """Extracts the prediction label from the model's response."""
    # The predicted label follows the [MASK] token in the decoded output
    parts = message.split('[MASK]', 1)
    if len(parts) < 2:
        return "No valid prediction"
    match = re.search(r'\d+', parts[1])
    return match.group() if match else "No valid prediction"

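# Illustrative sketch (assumed output format, not a captured response):
#   extract_label("ACGTACGTH3K4me3[MASK]1", "H3K4me3")  ->  "1"
#   extract_label("ACGTACGT", "H3")                     ->  "No valid prediction"
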
# Gradio interface
interface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Input DNA Sequence", placeholder="Enter a DNA sequence"),
        gr.Dropdown(choices=tasks, label="Select Task Type"),
    ],
    outputs=gr.Textbox(label="Predicted Type"),
    title="Omni-DNA Multitask Prediction",
    description="Select a DNA-related task and input a sequence to generate function predictions.",
)

if __name__ == "__main__":
    interface.launch()
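
# Illustrative usage (assumption, not part of the original app): generate() can also be
# called directly, bypassing the Gradio UI. The sequence below is a placeholder, not a
# biologically validated input.
#   print(generate("ACGTTGCA" * 10, "H3K4me3"))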