Spaces:

Lxz20071231
/

xingzhi-lu-epq

Running on CPU Upgrade

App Files Files Community

Lxz20071231 committed about 1 month ago

Commit

da7367a

verified ·

1 Parent(s): 41ea31b

Initial upload

Browse files

Files changed (29) hide show

app.py +263 -0
bayes.py +44 -0
bayes/class_0.joblib +3 -0
bayes/class_1.joblib +3 -0
bayes/class_10.joblib +3 -0
bayes/class_11.joblib +3 -0
bayes/class_12.joblib +3 -0
bayes/class_13.joblib +3 -0
bayes/class_14.joblib +3 -0
bayes/class_15.joblib +3 -0
bayes/class_16.joblib +3 -0
bayes/class_17.joblib +3 -0
bayes/class_18.joblib +3 -0
bayes/class_19.joblib +3 -0
bayes/class_2.joblib +3 -0
bayes/class_20.joblib +3 -0
bayes/class_21.joblib +3 -0
bayes/class_22.joblib +3 -0
bayes/class_23.joblib +3 -0
bayes/class_3.joblib +3 -0
bayes/class_4.joblib +3 -0
bayes/class_5.joblib +3 -0
bayes/class_6.joblib +3 -0
bayes/class_7.joblib +3 -0
bayes/class_8.joblib +3 -0
bayes/class_9.joblib +3 -0
bayes/vectorizer.joblib +3 -0
lstm.py +174 -0
requirements.txt +8 -0

app.py ADDED Viewed

	@@ -0,0 +1,263 @@

+import gradio as gr
+from transformers import pipeline
+from collections import defaultdict
+import re
+import string
+from textblob import TextBlob
+from bayes import NaiveBayesMultiClass
+import fasttext
+from huggingface_hub import hf_hub_download
+from lstm import LSTMPipeline, LSTMMultiClassClassifier
+# Sub-topic codes the classifiers predict. The position of each code in this
+# list is used as its class index (see `id2label` and `get_tags` below).
+topics = [
+    '1.1',
+    '1.2',
+    '1.3',
+    '1.4',
+    '1.5',
+    '1.6',
+    '1.7',
+    '1.8',
+    '2.1',
+    '2.2',
+    '2.3',
+    '3.1',
+    '3.2',
+    '3.3',
+    '3.4',
+    '4.1',
+    '4.2',
+    '4.3',
+    '4.4',
+    '4.5',
+    '5.1',
+    '5.2',
+    '6.1',
+    '6.2',
+]
+# Display names keyed by syllabus code: the top-level sections ('1'..'6')
+# plus every sub-topic code listed in `topics`.
+topics_full = {
+    '1': 'Motion, forces and energy',
+    '1.1': 'Physical quantities and measurement techniques',
+    '1.2': 'Motion',
+    '1.3': 'Mass and weight',
+    '1.4': 'Density',
+    '1.5': 'Forces',
+    '1.6': 'Momentum',
+    '1.7': 'Energy, work and power',
+    '1.8': 'Pressure',
+    '2': 'Thermal physics',
+    '2.1': 'Kinetic particle model of matter',
+    '2.2': 'Thermal properties and temperature',
+    '2.3': 'Transfer of thermal energy',
+    '3': 'Waves',
+    '3.1': 'General properties of waves',
+    '3.2': 'Light',
+    '3.3': 'Electromagnetic spectrum',
+    '3.4': 'Sound',
+    '4': 'Electricity and magnetism',
+    '4.1': 'Simple phenomena of magnetism',
+    '4.2': 'Electrical quantities',
+    '4.3': 'Electric circuits',
+    '4.4': 'Electrical safety',
+    '4.5': 'Electromagnetic effects',
+    '5': 'Nuclear physics',
+    '5.1': 'The nuclear model of the atom',
+    '5.2': 'Radioactivity',
+    '6': 'Space physics',
+    '6.1': 'Earth and the Solar System',
+    '6.2': 'Stars and the Universe',
+}
+# Resolve `model.bin` from the facebook/fasttext-en-vectors Hub repo to a local
+# path and load it with fasttext; `embedder` is later handed to the LSTM pipeline.
+embedding_model_path = hf_hub_download(
+    repo_id="facebook/fasttext-en-vectors",
+    filename="model.bin"
+)
+embedder = fasttext.load_model(embedding_model_path)
+# Lower-case English stop words dropped by `remove_stopwords`.
+# NOTE(review): presumably copied from NLTK's English stop-word list — confirm.
+stopword = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
+            "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they",
+            "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those",
+            "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
+            "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at",
+            "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above",
+            "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then",
+            "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most",
+            "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t",
+            "can", "will", "just", "don", "should", "now"]
+# Characters deleted by `remove_punctuations` (the ASCII punctuation set).
+punctuations = string.punctuation
+def to_lower(text: str) -> str:
+    """Return `text` converted to lower case."""
+    lowered = text.lower()
+    return lowered
+def remove_html_tags(text: str) -> str:
+    """Delete every `<...>` span (shortest match, not crossing newlines) from `text`."""
+    return re.sub('<.*?>', r'', text)
+def remove_punctuations(text: str) -> str:
+    """Delete every character listed in `punctuations` from `text`."""
+    deletion_table = str.maketrans('', '', punctuations)
+    return text.translate(deletion_table)
+def correct_spellings(text: str) -> str:
+    """Run TextBlob's `correct()` over `text` and return the corrected string."""
+    corrected_blob = TextBlob(text).correct()
+    return corrected_blob.string
+def remove_stopwords(text: str) -> str:
+    """Drop whitespace-separated words found in `stopword`; rejoin the rest with single spaces."""
+    kept = []
+    for token in text.split():
+        if token in stopword:
+            continue
+        kept.append(token)
+    return " ".join(kept)
+def clean(text: str) -> str:
+    """Normalise raw text for the Naive Bayes model.
+    Steps, in order: lower-case, strip HTML tags, strip punctuation,
+    correct spelling, remove stop words.
+    """
+    cleaned = text
+    for step in (to_lower, remove_html_tags, remove_punctuations,
+                 correct_spellings, remove_stopwords):
+        cleaned = step(cleaned)
+    return cleaned
+# Transformer classifier loaded from the Hub, paired with the DistilBERT tokenizer.
+# `return_all_scores=True` makes the pipeline return a score for every label of
+# each input; `get_tags` pairs those scores with `topics` by position, so do not
+# switch to an option that reorders the scores without updating `get_tags`.
+bert = pipeline(
+    "text-classification",
+    model="Lxz20071231/igcse-physics-bert",
+    tokenizer="distilbert-base-uncased",
+    return_all_scores=True,
+    function_to_apply="sigmoid",
+    truncation=True
+)
+n_topics = len(topics)
+# Derive the class-index -> code mapping from `topics` itself instead of a
+# hard-coded range(24), so it cannot drift if the topic list changes.
+id2label = dict(enumerate(topics))
+# LSTMPipeline.__init__ loads "Lxz20071231/igcse-physics-lstm" itself and passes
+# it as `model=` next to **kwargs; supplying `model=` here as well raises
+# "TypeError: ... got multiple values for keyword argument 'model'".
+lstm = LSTMPipeline(embedder=embedder, id2label=id2label, device=-1)
+# One Naive Bayes classifier per topic, restored from the joblib files in bayes/.
+bayes = NaiveBayesMultiClass(topics)
+bayes.load('bayes/')
+def get_tags(probs, threshold=0.5, labels=None):
+    """Turn per-label scores into lists of label codes.
+    Args:
+        probs: one sequence of scores per input text. Each score may be a
+            mapping with a 'score' key (what the transformers pipeline yields)
+            or a bare number / 0-d tensor (what LSTMPipeline.postprocess yields).
+        threshold: a label is kept when its score is >= this value.
+        labels: label codes, positionally aligned with the scores; defaults to
+            the module-level `topics`.
+    Returns:
+        A list with one list of selected labels per input, in `labels` order.
+    """
+    if labels is None:
+        labels = topics
+    def score_of(entry):
+        # Indexing a bare probability with ['score'] fails, so only dict-like
+        # entries are unpacked; everything else is treated as the score itself.
+        return entry['score'] if isinstance(entry, dict) else float(entry)
+    return [
+        [label for entry, label in zip(line, labels) if score_of(entry) >= threshold]
+        for line in probs
+    ]
+def get_tags_multiple_bert(texts, threshold=0.5):
+    """Classify a batch of texts with the `bert` pipeline; return one tag list per text."""
+    return get_tags(bert(texts), threshold)
+def get_tags_bayes(text):
+    """Return the topic codes the Naive Bayes model predicts for `text` after `clean`."""
+    cleaned = clean(text)
+    return bayes.predict(cleaned, True)
+def get_tags_cnn(text, threshold=0.5):
+    """Placeholder for the CNN classifier: ignores its arguments and returns no tags."""
+    no_tags = []
+    return no_tags
+def get_tags_lstm(text, threshold=0.5):
+    """Return the tags the `lstm` pipeline assigns to a single text."""
+    per_text = get_tags(lstm(text), threshold)
+    return per_text[0]
+def get_tags_bert(text, threshold=0.5):
+    """Return the tags the BERT pipeline assigns to a single text."""
+    # Wrap the text in a one-element batch and unwrap the single result.
+    return get_tags_multiple_bert([text], threshold)[0]
+def expand(tags):
+    """Return `tags` plus the top-level section code of each tag.
+    The section is the part before the first '.', the same rule
+    `format_as_markdown` uses to group codes, so a multi-digit section such as
+    '10.2' maps to '10' (taking only the first character would give '1').
+    Returns:
+        A de-duplicated list in plain string sort order.
+    """
+    with_primary = set()
+    for tag in tags:
+        with_primary.add(tag.split('.')[0])
+        with_primary.add(tag)
+    return sorted(with_primary)
+def format_as_markdown(predictions: dict) -> str:
+    """Render predicted topics as a Markdown outline grouped by top-level section.
+    Args:
+        predictions: mapping of syllabus code (e.g. '1', '1.2') to topic name.
+    Returns:
+        Markdown text with one heading per section followed by its sub-topics
+        in numeric order, or a "no topics" notice when `predictions` is empty.
+    """
+    if not predictions:
+        return "_No topics detected._"
+    grouped = defaultdict(list)
+    for code, topic in predictions.items():
+        grouped[code.split('.')[0]].append((code, topic))
+    # Collect fragments and join once instead of growing a string in the loop.
+    parts = ["### 📝 Predicted IGCSE Physics Topics\n"]
+    for main_code in sorted(grouped, key=float):
+        # Look the section title up without raising: fall back to the name the
+        # caller supplied, then to the bare code. (An eagerly evaluated
+        # `topics_full[main_code]` default would raise KeyError in exactly the
+        # case the default is meant to cover.)
+        main_title = topics_full.get(main_code) or predictions.get(main_code, main_code)
+        parts.append(f"\n#### {main_code}. {main_title}\n")
+        subtopics = [st for st in grouped[main_code] if st[0] != main_code]
+        if subtopics:
+            for code, name in sorted(subtopics, key=lambda st: [float(n) for n in st[0].split('.')]):
+                # Codes with more than one dot are indented four spaces per extra level.
+                indent = " " * (4 * (code.count('.') - 1))
+                parts.append(f"{indent}- **{code}**: {name}\n")
+        else:
+            # Only the section itself was predicted: list it as its own bullet.
+            parts.append(f"- **{main_code}**: {main_title}\n")
+    return "".join(parts)
+def classify_text(classifier, text, threshold, output_format):
+    """Classify `text` with the chosen model and shape the result for the UI.
+    Returns a 3-tuple: the JSON panel value, an update toggling the JSON panel's
+    visibility, and an update for the Markdown panel.
+    """
+    thresholded_taggers = {
+        'Transformer': get_tags_bert,
+        'CNN': get_tags_cnn,
+        'LSTM': get_tags_lstm,
+    }
+    tagger = thresholded_taggers.get(classifier)
+    if tagger is None:
+        # Any other choice falls through to Naive Bayes, which takes no threshold.
+        raw_tags = get_tags_bayes(text)
+    else:
+        raw_tags = tagger(text, threshold)
+    predictions = {}
+    for tag in expand(raw_tags):
+        if tag in topics_full:
+            predictions[tag] = topics_full[tag]
+    if output_format != "JSON":
+        md = format_as_markdown(predictions)
+        return {}, gr.update(visible=False), gr.update(value=md, visible=True)
+    return predictions, gr.update(visible=True), gr.update(visible=False)
+# Build the Gradio UI: model/input controls in the left column, results in the right.
+with gr.Blocks(theme="default") as demo:
+    gr.Markdown("# 🔬 IGCSE Physics Topic Classifier")
+    gr.Markdown(
+        "This model classifies IGCSE Physics questions or passages into syllabus topics. "
+        "Adjust the confidence threshold and choose your preferred output format."
+    )
+    with gr.Row(equal_height=True):
+        # Left column — Input
+        with gr.Column(scale=1):
+            # The selected label is passed verbatim to classify_text as `classifier`.
+            classifier = gr.Radio(
+                ["Naïve Bayes", "CNN", "LSTM", "Transformer"],
+                value="Transformer",
+                label="Processing model",
+                info="Choose which model to use to process texts",
+            )
+            text_input = gr.Textbox(
+                lines=8,
+                placeholder="Enter a physics question or concept...",
+                label="Input Text",
+            )
+            threshold = gr.Slider(0, 1, value=0.5, step=0.05,
+                                  label="Confidence Threshold (not available for Naïve Bayes)")
+            output_format = gr.Radio(
+                ["Markdown", "JSON"],
+                value="Markdown",
+                label="Output Format",
+                info="Choose how to display results",
+            )
+            classify_btn = gr.Button("Classify", variant="primary")
+        # Right column — Output (dynamic)
+        with gr.Column(scale=1):
+            # Only one of the two panels is visible at a time; classify_text toggles them.
+            json_output = gr.JSON(label="Predicted Topics (JSON)", visible=False)
+            markdown_output = gr.Markdown(label="Predicted Topics (Markdown)", visible=True)
+    # classify_text returns three values: the JSON value, a visibility update for
+    # the JSON panel, and an update for the Markdown panel, which is why
+    # json_output is listed twice in `outputs`.
+    classify_btn.click(
+        fn=classify_text,
+        inputs=[classifier, text_input, threshold, output_format],
+        outputs=[json_output, json_output, markdown_output],
+    )
+if __name__ == "__main__":
+    # NOTE(review): share=True requests a public share link; presumably not
+    # needed when the app is hosted on Spaces — confirm.
+    demo.launch(share=True)

bayes.py ADDED Viewed

	@@ -0,0 +1,44 @@

+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.preprocessing import MultiLabelBinarizer
+import typing
+import joblib
+class NaiveBayesMultiClass(object):
+    """Multi-label text classifier built from one binary classifier per class.
+    `classes` fixes the class order: classifier i (file `class_{i}.joblib`)
+    decides whether a document belongs to `classes[i]`.
+    """
+    def __init__(self, classes: typing.Iterable[str]):
+        # Materialise once: `classes` may be a one-shot iterator, and it is
+        # needed both as the ordered list and to fit the binarizer.
+        self.classes = list(classes)
+        self.n_classes = len(self.classes)
+        self.enc = MultiLabelBinarizer()
+        self.enc.fit([self.classes])
+        self.vectorizer = CountVectorizer()
+        self.classifiers = []
+    def load(self, path: str):
+        """Load `vectorizer.joblib` and `class_0.joblib` .. `class_{n-1}.joblib` from `path`."""
+        import os
+        # NOTE: joblib.load unpickles its input; only point this at trusted files.
+        self.vectorizer = joblib.load(os.path.join(path, 'vectorizer.joblib'))
+        self.classifiers = [
+            joblib.load(os.path.join(path, f'class_{i}.joblib')) for i in range(self.n_classes)
+        ]
+    def predict(self, X: typing.Iterable[str] | str, get_tags=False):
+        """Predict class membership for one document or an iterable of documents.
+        Args:
+            X: a single string, or any iterable of strings (consumed once).
+            get_tags: when true, return the names of the predicted classes;
+                otherwise return the raw per-class predictions in class order.
+        Returns:
+            For a string: one result. For an iterable: a list with one result
+            per document (empty list for empty input).
+        """
+        if isinstance(X, str):
+            return self.predict([X], get_tags)[0]
+        # Materialise so generators work and the document count is known.
+        docs = list(X)
+        if not docs:
+            return []
+        features = self.vectorizer.transform(docs)
+        by_class = [self.classifiers[j].predict(features) for j in range(self.n_classes)]
+        ans = []
+        for i in range(len(docs)):
+            row = [by_class[j][i] for j in range(self.n_classes)]
+            if get_tags:
+                ans.append([cls for cls, hit in zip(self.classes, row) if hit])
+            else:
+                ans.append(row)
+        return ans
+    def __call__(self, *args, **kwargs):
+        """Alias for `predict`."""
+        return self.predict(*args, **kwargs)

bayes/class_0.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadf8b28353ce1fc4df5a786d28546d6cd9155f472f23e28ec129ad70bf9b814
+size 106071

bayes/class_1.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:895519139e6ca6ab9597b39202826d91b9b6f75858e502865877dae49ba4bb5a
+size 106071

bayes/class_10.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5efe768fbf47abfcc41456ad387172be57cc58fa2304838c3162fdb679ea27c
+size 106071

bayes/class_11.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0657dd6a86746f734ff16b3cbf50efa400401f5ec1c925dec12f5307adb9c571
+size 106071

bayes/class_12.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:47be02b397b9052706209a7c134dd7b789b3e9c389b4fc8ce6ed3b9aa6142cef
+size 106071

bayes/class_13.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44516c260c8687b0650cd8b797d1a332e10b7415ba86aa9b3ffe11880b0366cf
+size 106071

bayes/class_14.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:62e4f9daa2c9e0655acf08c53ff7a0ee902fc05f324f9131eebb0eef4d6fc4b2
+size 106071

bayes/class_15.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2860e935c0e907a5a4840d74d28e9b6fc838d1c3d9f9c765b3a8332912b2b3ac
+size 106071

bayes/class_16.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c270988af6b88e68c53ee40fd729c912930f7f58e4a42a51685ad4f4458c59c5
+size 106071

bayes/class_17.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d10b2bf71998e57e27067c346bd366b919baa395750326f580bd11bc424bada7
+size 106071

bayes/class_18.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b00f72de18850b747a2b606f22dc77b9f45df6a0057341e743d8e64fd2edc96
+size 106071

bayes/class_19.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e527c3f9fa747135b8d80befb0fa5bc72f4c5a98466e43d02d75f412e46463cf
+size 106071

bayes/class_2.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4977176edcd2392ea5af799c21adef9a1fde0f1ee42ae31acf598c428c6b5368
+size 106071

bayes/class_20.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc51fcc57f638b99f9abfed2c3136a89aea66cf8e4b235c05eef6d45437bda64
+size 106071

bayes/class_21.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f30b1c7c1ce457f4d2b60a2f1542b2f7b4ee90d2b17abab0360c307929d4fb91
+size 106071

bayes/class_22.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fa5690cacca1829378bd4de994ebc24f5f48b37655574a4e390b1d1a4d0cc302
+size 106071

bayes/class_23.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08ade54cff6c56c7c269495d041ae80972977ca0f1c6a95a6a6d40e482f2451a
+size 106071

bayes/class_3.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b08cad3484fa92c1cbd59cdf079ca2bd33876eb5d367b8fa62ecf1c6a76a28e
+size 106071

bayes/class_4.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4bdbbd1154b46adf91095e02343f2541ded76ff26005d7e29ee6ab9ca083c70f
+size 106071

bayes/class_5.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c32aab9bf1b26df655a17a0f69681d9782b21fad37309cd757d06754f2ebe07a
+size 106071

bayes/class_6.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bdec320a6f796a52bea798e490e7c345a5bd15cc0fd00c6c249441acbf8e0cd7
+size 106071

bayes/class_7.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3d0f298d010d3f4aa9651988c3930ec9132c8c6925d092a3d463842262d7f7e1
+size 106071

bayes/class_8.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7627b60bea4ef9eeb433173824ad222a94c847537e141708bc40bf044c6c4ab4
+size 106071

bayes/class_9.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e3245f68cc0e8a2927a44eccaf2a7910290a62ccf9a299a70f271b727fbd329
+size 106071

bayes/vectorizer.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2147176e765ef659fc27a653da99d8c8e07e0a513449e94bd64628b4a27d7cee
+size 40492

lstm.py ADDED Viewed

	@@ -0,0 +1,174 @@

+from torch.nn.utils.rnn import pad_sequence
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pack_padded_sequence
+from types import SimpleNamespace
+from huggingface_hub import PyTorchModelHubMixin
+from transformers import Pipeline
+def get_words(model, text: str):
+    """
+    Tokenise text line by line with the model's `get_line`, dropping the last
+    token of every line (presumably FastText's end-of-line marker — confirm).
+    """
+    per_line_tokens = [model.get_line(raw_line)[0] for raw_line in text.split("\n")]
+    return [token for tokens in per_line_tokens for token in tokens[:-1]]
+def get_vectors(model, text: str):
+    """
+    Look up the embedding of every token of text via `model[token]`.
+    """
+    return [model[token] for token in get_words(model, text)]
+def get_tensor(model, text: str):
+    """
+    Convert text → (seq_len, embedding_dim) float tensor.
+    Text with no tokens yields a single all-zero row so the sequence length
+    is never 0.
+    """
+    vectors = get_vectors(model, text)
+    if not vectors:
+        # fallback for empty text
+        return torch.zeros(1, model.get_dimension())
+    # Convert each token vector and stack, rather than calling torch.tensor on
+    # a list of per-token arrays, which is the slow path PyTorch warns about
+    # when the elements are numpy arrays.
+    return torch.stack([torch.as_tensor(v, dtype=torch.float) for v in vectors])
+def preprocess_batch(embedder, texts, return_order=False):
+    """
+    Convert a list of text strings into:
+        x_padded: (batch, seq_len, emb_dim)
+        lengths:  (batch,)
+    Both sorted by sequence length (DESC) for pack_padded_sequence.
+    Because of that sort, row k of the result does NOT in general belong to
+    texts[k]. Pass return_order=True to also get `sort_idx`, where row k comes
+    from texts[sort_idx[k]]; model outputs can then be put back in input order
+    with `outputs[torch.argsort(sort_idx)]`.
+    Returns (x_padded, lengths), or (x_padded, lengths, sort_idx) when
+    return_order is true.
+    """
+    # Convert each text → tensor
+    seq_tensors = [get_tensor(embedder, t) for t in texts]
+    # Compute lengths BEFORE padding
+    lengths = torch.tensor([seq.size(0) for seq in seq_tensors], dtype=torch.long)
+    # Sort by length (DESC); stable so equal-length texts keep their input order
+    lengths_sorted, sort_idx = torch.sort(lengths, descending=True, stable=True)
+    seq_tensors = [seq_tensors[i] for i in sort_idx.tolist()]
+    # Pad to create (batch, max_seq_len, emb_dim)
+    x_padded = pad_sequence(seq_tensors, batch_first=True)
+    if return_order:
+        return x_padded, lengths_sorted, sort_idx
+    return x_padded, lengths_sorted
+class LSTMMultiClassClassifier(nn.Module, PyTorchModelHubMixin):
+    """LSTM encoder followed by a three-layer MLP head producing one logit per class.
+    The final hidden state of the last LSTM layer (both directions concatenated
+    when bidirectional) is fed to the head. No sigmoid is applied here; callers
+    convert logits to probabilities.
+    """
+    def __init__(self, embedding_dim, hidden_dim, num_classes,
+                 num_layers=1, bidirectional=True, dropout=0.5, **kwargs):
+        super().__init__()
+        # REQUIRED for HuggingFace Pipeline
+        self.device = torch.device("cpu")
+        # Save config
+        self.config = SimpleNamespace(
+            embedding_dim=embedding_dim,
+            hidden_dim=hidden_dim,
+            num_classes=num_classes,
+            num_layers=num_layers,
+            bidirectional=bidirectional,
+            dropout=dropout
+        )
+        self.embedding_dim = embedding_dim
+        self.hidden_dim = hidden_dim
+        self.num_layers = num_layers
+        self.bidirectional = bidirectional
+        self.dropout = dropout
+        self.num_classes = num_classes
+        self.lstm = nn.LSTM(
+            input_size=embedding_dim,
+            hidden_size=hidden_dim,
+            num_layers=num_layers,
+            batch_first=True,
+            # nn.LSTM only applies dropout between stacked layers
+            dropout=dropout if num_layers > 1 else 0,
+            bidirectional=bidirectional
+        )
+        direction = 2 if bidirectional else 1
+        self.fc = nn.Sequential(
+            nn.Linear(hidden_dim * direction, 128),
+            nn.ReLU(),
+            nn.Linear(128, 128),
+            nn.ReLU(),
+            nn.Linear(128, num_classes)
+        )
+    @classmethod
+    def from_config(cls, config):
+        """Build a model from an object exposing the constructor arguments as attributes."""
+        return cls(
+            embedding_dim=config.embedding_dim,
+            hidden_dim=config.hidden_dim,
+            num_classes=config.num_classes,
+            num_layers=config.num_layers,
+            bidirectional=config.bidirectional,
+            dropout=config.dropout
+        )
+    # REQUIRED for Transformers Pipeline (updates internal device)
+    def to(self, *args, **kwargs):
+        """Move/cast the module like nn.Module.to and keep `self.device` in sync.
+        Accepts every form nn.Module.to accepts (device, dtype, keywords), not
+        just a single positional device.
+        """
+        super().to(*args, **kwargs)
+        # Read the device back from the weights so it is correct whatever form
+        # the target was given in (torch.device, string, dtype-only call, ...).
+        first_param = next(self.parameters(), None)
+        if first_param is not None:
+            self.device = first_param.device
+        return self
+    def forward(self, x, lengths):
+        """Return logits of shape (batch, num_classes).
+        Args:
+            x: padded batch, (batch, seq_len, embedding_dim).
+            lengths: true sequence lengths, (batch,); any order is accepted.
+        """
+        x = x.to(self.device)
+        # pack_padded_sequence wants the lengths on the CPU. enforce_sorted=False
+        # lets it sort internally and hand results back in the caller's order,
+        # so already-sorted and unsorted batches both work.
+        packed = pack_padded_sequence(
+            x, lengths.cpu(), batch_first=True, enforce_sorted=False
+        )
+        _, (h_n, _) = self.lstm(packed)
+        if self.bidirectional:
+            # last layer: forward and backward final states
+            h = torch.cat((h_n[-2], h_n[-1]), dim=1)
+        else:
+            h = h_n[-1]
+        return self.fc(h)
+class LSTMPipeline(Pipeline):
+    """Transformers Pipeline wrapper around LSTMMultiClassClassifier.
+    Text is embedded with `embedder` (no tokenizer is used), run through the
+    model, and returned as a tensor of per-class sigmoid probabilities.
+    Note that `preprocess_batch` sorts a batch by length, so for multi-text
+    input the rows follow that sorted order rather than the input order.
+    """
+    def __init__(self, id2label, embedder, **kwargs):
+        # Use a caller-supplied `model=` when given, otherwise load the
+        # published weights. Popping it from kwargs keeps it from reaching
+        # Pipeline.__init__ twice (a duplicate-keyword TypeError).
+        model = kwargs.pop("model", None)
+        if model is None:
+            model = LSTMMultiClassClassifier.from_pretrained(
+                "Lxz20071231/igcse-physics-lstm"
+            )
+        kwargs.setdefault("tokenizer", None)
+        super().__init__(model=model, **kwargs)
+        self.id2label = id2label
+        self.embedder = embedder
+    def preprocess(self, inputs):
+        """Embed one string or an iterable of strings into a padded batch plus lengths."""
+        if isinstance(inputs, str):
+            texts = [inputs]
+        else:
+            texts = list(inputs)
+        x, lengths = preprocess_batch(self.embedder, texts)
+        return {"x": x, "lengths": lengths}
+    def _forward(self, model_inputs):
+        """Run the model without gradient tracking and return raw logits."""
+        x = model_inputs["x"]
+        lengths = model_inputs["lengths"]
+        with torch.no_grad():
+            logits = self.model(x, lengths)
+        return logits
+    def postprocess(self, logits):
+        """Map logits to independent per-class probabilities with a sigmoid."""
+        probs = F.sigmoid(logits)
+        return probs
+    def _sanitize_parameters(self, **kwargs):
+        # No call-time options: nothing is routed to preprocess/_forward/postprocess.
+        return {}, {}, {}

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gradio
+scikit-learn
+nltk
+textblob
+fasttext
+transformers>=4.46.0
+huggingface-hub<1.0.0
+torch>=2.1