Lxz20071231 committed on
Commit 1edca62 · 1 Parent(s): a2aec3f

Restructured codebase

.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
app.py CHANGED
@@ -1,73 +1,13 @@
  import gradio as gr
- from transformers import pipeline
- from collections import defaultdict
- import re
- import string
- from textblob import TextBlob
- from bayes import NaiveBayesMultiClass
  import fasttext
  from huggingface_hub import hf_hub_download
- from lstm import LSTMPipeline
+ from collections import defaultdict
 
- topics = [
-     '1.1',
-     '1.2',
-     '1.3',
-     '1.4',
-     '1.5',
-     '1.6',
-     '1.7',
-     '1.8',
-     '2.1',
-     '2.2',
-     '2.3',
-     '3.1',
-     '3.2',
-     '3.3',
-     '3.4',
-     '4.1',
-     '4.2',
-     '4.3',
-     '4.4',
-     '4.5',
-     '5.1',
-     '5.2',
-     '6.1',
-     '6.2',
- ]
+ from utils.consts import topics, topics_full
 
- topics_full = {
-     '1': 'Motion, forces and energy',
-     '1.1': 'Physical quantities and measurement techniques',
-     '1.2': 'Motion',
-     '1.3': 'Mass and weight',
-     '1.4': 'Density',
-     '1.5': 'Forces',
-     '1.6': 'Momentum',
-     '1.7': 'Energy, work and power',
-     '1.8': 'Pressure',
-     '2': 'Thermal physics',
-     '2.1': 'Kinetic particle model of matter',
-     '2.2': 'Thermal properties and temperature',
-     '2.3': 'Transfer of thermal energy',
-     '3': 'Waves',
-     '3.1': 'General properties of waves',
-     '3.2': 'Light',
-     '3.3': 'Electromagnetic spectrum',
-     '3.4': 'Sound',
-     '4': 'Electricity and magnetism',
-     '4.1': 'Simple phenomena of magnetism',
-     '4.2': 'Electrical quantities',
-     '4.3': 'Electric circuits',
-     '4.4': 'Electrical safety',
-     '4.5': 'Electromagnetic effects',
-     '5': 'Nuclear physics',
-     '5.1': 'The nuclear model of the atom',
-     '5.2': 'Radioactivity',
-     '6': 'Space physics',
-     '6.1': 'Earth and the Solar System',
-     '6.2': 'Stars and the Universe',
- }
+ from models.bayes import NaiveBayesMultiClass, get_tags_bayes
+ from models.lstm import LSTMPipeline, get_tags_lstm
+ from models.bert import get_bert_pipeline, get_tags_bert
 
  embedding_model_path = hf_hub_download(
      repo_id="facebook/fasttext-en-vectors",
@@ -75,54 +15,7 @@ embedding_model_path = hf_hub_download(
  )
  embedder = fasttext.load_model(embedding_model_path)
 
- stopword = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
-             "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they",
-             "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those",
-             "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
-             "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at",
-             "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above",
-             "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then",
-             "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most",
-             "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t",
-             "can", "will", "just", "don", "should", "now"]
- punctuations = string.punctuation
-
-
- def to_lower(text: str) -> str:
-     return text.lower()
-
-
- def remove_html_tags(text: str) -> str:
-     pattern = re.compile('<.*?>')
-     return pattern.sub(r'', text)
-
-
- def remove_punctuations(text: str) -> str:
-     return text.translate(str.maketrans('', '', punctuations))
-
-
- def correct_spellings(text: str) -> str:
-     return TextBlob(text).correct().string
-
-
- def remove_stopwords(text: str) -> str:
-     return " ".join([word for word in text.split() if word not in stopword])
-
-
- def clean(text: str) -> str:
-     return remove_stopwords(
-         correct_spellings(remove_punctuations(remove_html_tags(to_lower(text))))
-     )
-
-
- bert = pipeline(
-     "text-classification",
-     model="Lxz20071231/igcse-physics-bert",
-     tokenizer="distilbert-base-uncased",
-     return_all_scores=True,
-     function_to_apply="sigmoid",
-     truncation=True
- )
+ bert = get_bert_pipeline()
 
  id2label = {i: topics[i] for i in range(24)}
 
@@ -131,42 +24,8 @@ lstm = LSTMPipeline(embedder=embedder, id2label=id2label, device=-1)
  n_topics = len(topics)
 
  bayes = NaiveBayesMultiClass(topics)
- bayes.load('bayes/')
+ bayes.load('weights/bayes/')
-
-
-
- def get_tags_multiple_bert(texts, threshold=0.5):
-     probs = bert(texts)
-     tags = []
-     for line in probs:
-         found = []
-         for p, label in zip(line, topics):
-             if p['score'] >= threshold:
-                 found.append(label)
-         tags.append(found)
-     return tags
-
-
- def get_tags_bayes(text):
-     return bayes.predict(clean(text), True)
-
-
- def get_tags_cnn(text, threshold=0.5):
-     return []
-
-
- def get_tags_lstm(text, threshold=0.5):
-     probs = lstm(text)[0]
-     tags = []
-     for p, label in zip(probs, topics):
-         if p >= threshold:
-             tags.append(label)
-     return tags
-
 
- def get_tags_bert(text, threshold=0.5):
-     tags = get_tags_multiple_bert([text], threshold)[0]
-     return tags
 
  def expand(tags):
      with_primary = set()
@@ -201,13 +60,13 @@ def format_as_markdown(predictions: dict) -> str:
 
  def classify_text(classifier, text, threshold, output_format):
      if classifier == 'Transformer':
-         tags = get_tags_bert(text, threshold)
+         tags = get_tags_bert(bert, text, threshold)
      elif classifier == 'CNN':
-         tags = get_tags_cnn(text, threshold)
+         tags = []
      elif classifier == 'LSTM':
-         tags = get_tags_lstm(text, threshold)
+         tags = get_tags_lstm(lstm, text, threshold)
      else:
-         tags = get_tags_bayes(text)
+         tags = get_tags_bayes(bayes, text)
 
      tags = expand(tags)
      predictions = {tag: topics_full[tag] for tag in tags if tag in topics_full}
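
After the restructuring, app.py only wires the models together; every helper now takes its model as an explicit argument. A minimal smoke test sketch, assuming it runs from the repo root; the question string, the 'Naive Bayes' label for the fallback branch, and the 'Markdown' output format are illustrative, since classify_text's return/formatting tail is outside these hunks:

    from app import classify_text  # note: importing app triggers the model downloads

    question = "A wave travels 30 m in 2 s. Calculate its speed."
    # The four branches map to models/bert.py, the cnn stub,
    # models/lstm.py, and models/bayes.py (the else fallback).
    for clf in ['Transformer', 'CNN', 'LSTM', 'Naive Bayes']:
        print(clf, classify_text(clf, question, 0.5, 'Markdown'))
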
bayes.py → models/bayes.py RENAMED
@@ -1,44 +1,46 @@
  from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.naive_bayes import MultinomialNB
  from sklearn.preprocessing import MultiLabelBinarizer
  import typing
  import joblib
+ from utils.preprocessing import clean
 
 
  class NaiveBayesMultiClass(object):
      def __init__(self, classes: typing.Iterable[str]):
          self.classes = list(classes)
          self.n_classes = len(self.classes)
          self.enc = MultiLabelBinarizer()
          self.enc.fit([classes])
          self.vectorizer = CountVectorizer()
          self.classifiers = []
 
      def load(self, path: str):
          self.vectorizer = joblib.load(f'{path}/vectorizer.joblib')
          self.classifiers = [
              joblib.load(f'{path}/class_{i}.joblib') for i in range(self.n_classes)
          ]
 
      def predict(self, X: typing.Iterable[str] | str, get_tags=False):
          if type(X) == str:
              return self.predict([X], get_tags)[0]
          x = self.vectorizer.transform(X)
          by_class = [self.classifiers[i].predict(x) for i in range(self.n_classes)]
          ans = []
 
          for i in range(len(X)):
              y = []
              for j, cls in enumerate(self.classes):
                  if get_tags:
                      if by_class[j][i]:
                          y.append(cls)
                  else:
                      y.append(by_class[j][i])
              ans.append(y)
          return ans
 
      def __call__(self, *args, **kwargs):
          return self.predict(*args, **kwargs)
 
 
+ def get_tags_bayes(model, text):
+     return model.predict(clean(text), True)
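
The Bayes model is now passed in explicitly instead of living as a module-level global in app.py. A usage sketch mirroring the wiring in app.py; the example question and printed tag are illustrative:

    from utils.consts import topics
    from models.bayes import NaiveBayesMultiClass, get_tags_bayes

    bayes = NaiveBayesMultiClass(topics)   # one binary classifier per subtopic
    bayes.load('weights/bayes/')           # reads vectorizer.joblib + class_{i}.joblib
    # get_tags_bayes cleans the text first, then returns the labels whose
    # per-class classifier predicted 1 (the get_tags=True path in predict()).
    print(get_tags_bayes(bayes, "Calculate the current through the resistor."))
    # e.g. ['4.3'] if only the electric-circuits classifier fires
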
models/bert.py ADDED
@@ -0,0 +1,27 @@
+ from transformers import pipeline
+ from utils.consts import topics
+ 
+ def get_bert_pipeline():
+     return pipeline(
+         "text-classification",
+         model="Lxz20071231/igcse-physics-bert",
+         tokenizer="distilbert-base-uncased",
+         return_all_scores=True,
+         function_to_apply="sigmoid",
+         truncation=True
+     )
+ 
+ def get_tags_multiple_bert(model, texts, threshold=0.5):
+     probs = model(texts)
+     tags = []
+     for line in probs:
+         found = []
+         for p, label in zip(line, topics):
+             if p['score'] >= threshold:
+                 found.append(label)
+         tags.append(found)
+     return tags
+ 
+ def get_tags_bert(model, text, threshold=0.5):
+     tags = get_tags_multiple_bert(model, [text], threshold)[0]
+     return tags
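
models/bert.py wraps the Hugging Face pipeline behind a factory so app.py no longer constructs it inline. A sketch of the intended call pattern; the example text and output are illustrative:

    from models.bert import get_bert_pipeline, get_tags_bert

    bert = get_bert_pipeline()  # fetches Lxz20071231/igcse-physics-bert on first use
    # function_to_apply="sigmoid" makes the 24 scores independent, so this is
    # multi-label: several topics can clear the threshold for one question.
    print(get_tags_bert(bert, "State the unit of momentum.", threshold=0.5))
    # e.g. ['1.6']
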
models/cnn.py ADDED
@@ -0,0 +1,2 @@
+ def get_tags_cnn(model, text, threshold=0.5):
+     return []
lstm.py → models/lstm.py RENAMED
@@ -1,174 +1,182 @@
  from torch.nn.utils.rnn import pad_sequence
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
  from torch.nn.utils.rnn import pack_padded_sequence
  from types import SimpleNamespace
  from huggingface_hub import PyTorchModelHubMixin
  from transformers import Pipeline
-
+ from utils.consts import topics
 
  def get_words(model, text: str):
      """
      Break text into tokens using FastText's internal tokenizer.
      """
      lines = [model.get_line(line)[0] for line in text.split("\n")]
      words = []
      for line in lines:
          for w in line[:-1]:
              words.append(w)
      return words
 
 
  def get_vectors(model, text: str):
      """
      Convert text → list of embedding vectors.
      """
      words = get_words(model, text)
      vectors = [model[w] for w in words]
      return vectors
 
 
  def get_tensor(model, text: str):
      """
      Convert text → (seq_len, embedding_dim) tensor
      """
      vectors = get_vectors(model, text)
      if len(vectors) == 0:
          # fallback for empty text
          return torch.zeros(1, model.get_dimension())
      return torch.tensor(vectors, dtype=torch.float)
 
 
  def preprocess_batch(embedder, texts):
      """
      Convert a list of text strings into:
          x_padded: (batch, seq_len, emb_dim)
          lengths: (batch,)
      Both sorted by sequence length (DESC) for pack_padded_sequence.
      """
 
      # Convert each text → tensor
      seq_tensors = [get_tensor(embedder, t) for t in texts]
 
      # Compute lengths BEFORE padding
      lengths = torch.tensor([seq.size(0) for seq in seq_tensors], dtype=torch.long)
 
      # Sort by length (DESC)
      lengths_sorted, sort_idx = torch.sort(lengths, descending=True)
      seq_tensors = [seq_tensors[i] for i in sort_idx]
 
      # Pad to create (batch, max_seq_len, emb_dim)
      x_padded = pad_sequence(seq_tensors, batch_first=True)
 
      return x_padded, lengths_sorted
 
 
  class LSTMMultiClassClassifier(nn.Module, PyTorchModelHubMixin):
      def __init__(self, embedding_dim, hidden_dim, num_classes,
                   num_layers=1, bidirectional=True, dropout=0.5, **kwargs):
          super().__init__()
 
          # REQUIRED for HuggingFace Pipeline
          self.device = torch.device("cpu")
 
          # Save config
          self.config = SimpleNamespace(
              embedding_dim=embedding_dim,
              hidden_dim=hidden_dim,
              num_classes=num_classes,
              num_layers=num_layers,
              bidirectional=bidirectional,
              dropout=dropout
          )
 
          self.embedding_dim = embedding_dim
          self.hidden_dim = hidden_dim
          self.num_layers = num_layers
          self.bidirectional = bidirectional
          self.dropout = dropout
          self.num_classes = num_classes
 
          self.lstm = nn.LSTM(
              input_size=embedding_dim,
              hidden_size=hidden_dim,
              num_layers=num_layers,
              batch_first=True,
              dropout=dropout if num_layers > 1 else 0,
              bidirectional=bidirectional
          )
 
          direction = 2 if bidirectional else 1
          self.fc = nn.Sequential(
              nn.Linear(hidden_dim * direction, 128),
              nn.ReLU(),
              nn.Linear(128, 128),
              nn.ReLU(),
              nn.Linear(128, num_classes)
          )
 
      @classmethod
      def from_config(cls, config):
          return cls(
              embedding_dim=config.embedding_dim,
              hidden_dim=config.hidden_dim,
              num_classes=config.num_classes,
              num_layers=config.num_layers,
              bidirectional=config.bidirectional,
              dropout=config.dropout
          )
 
      # REQUIRED for Transformers Pipeline (updates internal device)
      def to(self, device):
          super().to(device)
          self.device = device
          return self
 
      def forward(self, x, lengths):
          x = x.to(self.device)
          lengths = lengths.to(self.device)
 
          packed = pack_padded_sequence(
              x, lengths.cpu(), batch_first=True, enforce_sorted=True
          )
          _, (h_n, _) = self.lstm(packed)
 
          if self.bidirectional:
              h = torch.cat((h_n[-2], h_n[-1]), dim=1)
          else:
              h = h_n[-1]
 
          return self.fc(h)
 
 
  class LSTMPipeline(Pipeline):
      def __init__(self, id2label, embedder, **kwargs):
          model = LSTMMultiClassClassifier.from_pretrained(
              "Lxz20071231/igcse-physics-lstm"
          )
          super().__init__(model=model, tokenizer=None, **kwargs)
          self.id2label = id2label
          self.embedder = embedder
 
      def preprocess(self, inputs):
          if isinstance(inputs, str):
              texts = [inputs]
          else:
              texts = list(inputs)
          x, lengths = preprocess_batch(self.embedder, texts)
          return {"x": x, "lengths": lengths}
 
      def _forward(self, model_inputs):
          x = model_inputs["x"]
          lengths = model_inputs["lengths"]
          with torch.no_grad():
              logits = self.model(x, lengths)
          return logits
 
      def postprocess(self, logits):
          probs = F.sigmoid(logits)
 
          return probs
 
      def _sanitize_parameters(self, **kwargs):
          return {}, {}, {}
+
+ def get_tags_lstm(model, text, threshold=0.5):
+     probs = model(text)[0]
+     tags = []
+     for p, label in zip(probs, topics):
+         if p >= threshold:
+             tags.append(label)
+     return tags
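
The pipeline itself is unchanged by the move; the diff only adds the topics import and the get_tags_lstm helper. A wiring sketch mirroring app.py, assuming `embedder` is the fastText model app.py loads via hf_hub_download + fasttext.load_model (the example question is illustrative):

    from utils.consts import topics
    from models.lstm import LSTMPipeline, get_tags_lstm

    id2label = {i: topics[i] for i in range(len(topics))}
    lstm = LSTMPipeline(embedder=embedder, id2label=id2label, device=-1)  # -1 = CPU
    # postprocess() returns per-topic sigmoid probabilities; the helper keeps
    # every topic whose probability clears the threshold.
    print(get_tags_lstm(lstm, "Describe the energy transfers in a pendulum.", threshold=0.5))
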
utils/consts.py ADDED
@@ -0,0 +1,59 @@
+ topics = [
+     '1.1',
+     '1.2',
+     '1.3',
+     '1.4',
+     '1.5',
+     '1.6',
+     '1.7',
+     '1.8',
+     '2.1',
+     '2.2',
+     '2.3',
+     '3.1',
+     '3.2',
+     '3.3',
+     '3.4',
+     '4.1',
+     '4.2',
+     '4.3',
+     '4.4',
+     '4.5',
+     '5.1',
+     '5.2',
+     '6.1',
+     '6.2',
+ ]
+ 
+ topics_full = {
+     '1': 'Motion, forces and energy',
+     '1.1': 'Physical quantities and measurement techniques',
+     '1.2': 'Motion',
+     '1.3': 'Mass and weight',
+     '1.4': 'Density',
+     '1.5': 'Forces',
+     '1.6': 'Momentum',
+     '1.7': 'Energy, work and power',
+     '1.8': 'Pressure',
+     '2': 'Thermal physics',
+     '2.1': 'Kinetic particle model of matter',
+     '2.2': 'Thermal properties and temperature',
+     '2.3': 'Transfer of thermal energy',
+     '3': 'Waves',
+     '3.1': 'General properties of waves',
+     '3.2': 'Light',
+     '3.3': 'Electromagnetic spectrum',
+     '3.4': 'Sound',
+     '4': 'Electricity and magnetism',
+     '4.1': 'Simple phenomena of magnetism',
+     '4.2': 'Electrical quantities',
+     '4.3': 'Electric circuits',
+     '4.4': 'Electrical safety',
+     '4.5': 'Electromagnetic effects',
+     '5': 'Nuclear physics',
+     '5.1': 'The nuclear model of the atom',
+     '5.2': 'Radioactivity',
+     '6': 'Space physics',
+     '6.1': 'Earth and the Solar System',
+     '6.2': 'Stars and the Universe',
+ }
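
Both tables are now shared by every model module. The primary keys ('1' through '6') are never predicted directly; judging by the `with_primary` accumulator in app.py's expand(), they let a predicted subtopic carry its parent syllabus section along. An illustrative lookup; the prefix-before-the-dot trick is an assumption about how expand() derives the parent:

    from utils.consts import topics, topics_full

    id2label = {i: topics[i] for i in range(len(topics))}  # as built in app.py
    print(id2label[0], '->', topics_full[id2label[0]])
    # 1.1 -> Physical quantities and measurement techniques

    # A subtopic's parent section is its prefix before the dot:
    print(topics_full['3.2'.split('.')[0]])
    # Waves
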
utils/preprocessing.py ADDED
@@ -0,0 +1,41 @@
+ import re
+ import string
+ from textblob import TextBlob
+ 
+ stopword = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
+             "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they",
+             "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those",
+             "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
+             "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at",
+             "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above",
+             "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then",
+             "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most",
+             "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t",
+             "can", "will", "just", "don", "should", "now"]
+ punctuations = string.punctuation
+ 
+ def to_lower(text: str) -> str:
+     return text.lower()
+ 
+ 
+ def remove_html_tags(text: str) -> str:
+     pattern = re.compile('<.*?>')
+     return pattern.sub(r'', text)
+ 
+ 
+ def remove_punctuations(text: str) -> str:
+     return text.translate(str.maketrans('', '', punctuations))
+ 
+ 
+ def correct_spellings(text: str) -> str:
+     return TextBlob(text).correct().string
+ 
+ 
+ def remove_stopwords(text: str) -> str:
+     return " ".join([word for word in text.split() if word not in stopword])
+ 
+ 
+ def clean(text: str) -> str:
+     return remove_stopwords(
+         correct_spellings(remove_punctuations(remove_html_tags(to_lower(text))))
+     )
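
clean() chains the stages lowercase → strip HTML → strip punctuation → TextBlob spell-correction → stopword removal. A worked example; the final string assumes TextBlob leaves already-correct words untouched:

    from utils.preprocessing import clean

    # "The <b>Forces</b> are acting!"
    #   to_lower            -> "the <b>forces</b> are acting!"
    #   remove_html_tags    -> "the forces are acting!"
    #   remove_punctuations -> "the forces are acting"
    #   correct_spellings   -> "the forces are acting"   (assumed no-op here)
    #   remove_stopwords    -> "forces acting"
    print(clean("The <b>Forces</b> are acting!"))
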
{bayes → weights/bayes}/class_0.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_1.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_10.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_11.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_12.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_13.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_14.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_15.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_16.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_17.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_18.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_19.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_2.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_20.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_21.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_22.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_23.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_3.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_4.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_5.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_6.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_7.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_8.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_9.joblib RENAMED
File without changes
{bayes → weights/bayes}/vectorizer.joblib RENAMED
File without changes