Lxz20071231 committed on
Commit 1edca62 · 1 Parent(s): a2aec3f

Restructured codebase

.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
app.py CHANGED
@@ -1,73 +1,13 @@
  import gradio as gr
- from transformers import pipeline
- from collections import defaultdict
- import re
- import string
- from textblob import TextBlob
- from bayes import NaiveBayesMultiClass
  import fasttext
  from huggingface_hub import hf_hub_download
- from lstm import LSTMPipeline
+ from collections import defaultdict
 
- topics = [
-     '1.1',
-     '1.2',
-     '1.3',
-     '1.4',
-     '1.5',
-     '1.6',
-     '1.7',
-     '1.8',
-     '2.1',
-     '2.2',
-     '2.3',
-     '3.1',
-     '3.2',
-     '3.3',
-     '3.4',
-     '4.1',
-     '4.2',
-     '4.3',
-     '4.4',
-     '4.5',
-     '5.1',
-     '5.2',
-     '6.1',
-     '6.2',
- ]
+ from utils.consts import topics, topics_full
 
- topics_full = {
-     '1': 'Motion, forces and energy',
-     '1.1': 'Physical quantities and measurement techniques',
-     '1.2': 'Motion',
-     '1.3': 'Mass and weight',
-     '1.4': 'Density',
-     '1.5': 'Forces',
-     '1.6': 'Momentum',
-     '1.7': 'Energy, work and power',
-     '1.8': 'Pressure',
-     '2': 'Thermal physics',
-     '2.1': 'Kinetic particle model of matter',
-     '2.2': 'Thermal properties and temperature',
-     '2.3': 'Transfer of thermal energy',
-     '3': 'Waves',
-     '3.1': 'General properties of waves',
-     '3.2': 'Light',
-     '3.3': 'Electromagnetic spectrum',
-     '3.4': 'Sound',
-     '4': 'Electricity and magnetism',
-     '4.1': 'Simple phenomena of magnetism',
-     '4.2': 'Electrical quantities',
-     '4.3': 'Electric circuits',
-     '4.4': 'Electrical safety',
-     '4.5': 'Electromagnetic effects',
-     '5': 'Nuclear physics',
-     '5.1': 'The nuclear model of the atom',
-     '5.2': 'Radioactivity',
-     '6': 'Space physics',
-     '6.1': 'Earth and the Solar System',
-     '6.2': 'Stars and the Universe',
- }
+ from models.bayes import NaiveBayesMultiClass, get_tags_bayes
+ from models.lstm import LSTMPipeline, get_tags_lstm
+ from models.bert import get_bert_pipeline, get_tags_bert
 
  embedding_model_path = hf_hub_download(
      repo_id="facebook/fasttext-en-vectors",
@@ -75,54 +15,7 @@ embedding_model_path = hf_hub_download(
  )
  embedder = fasttext.load_model(embedding_model_path)
 
- stopword = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
-             "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they",
-             "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those",
-             "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
-             "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at",
-             "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above",
-             "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then",
-             "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most",
-             "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t",
-             "can", "will", "just", "don", "should", "now"]
- punctuations = string.punctuation
-
-
- def to_lower(text: str) -> str:
-     return text.lower()
-
-
- def remove_html_tags(text: str) -> str:
-     pattern = re.compile('<.*?>')
-     return pattern.sub(r'', text)
-
-
- def remove_punctuations(text: str) -> str:
-     return text.translate(str.maketrans('', '', punctuations))
-
-
- def correct_spellings(text: str) -> str:
-     return TextBlob(text).correct().string
-
-
- def remove_stopwords(text: str) -> str:
-     return " ".join([word for word in text.split() if word not in stopword])
-
-
- def clean(text: str) -> str:
-     return remove_stopwords(
-         correct_spellings(remove_punctuations(remove_html_tags(to_lower(text))))
-     )
-
-
- bert = pipeline(
-     "text-classification",
-     model="Lxz20071231/igcse-physics-bert",
-     tokenizer="distilbert-base-uncased",
-     return_all_scores=True,
-     function_to_apply="sigmoid",
-     truncation=True
- )
+ bert = get_bert_pipeline()
 
  id2label = {i: topics[i] for i in range(24)}
 
@@ -131,42 +24,8 @@ lstm = LSTMPipeline(embedder=embedder, id2label=id2label, device=-1)
  n_topics = len(topics)
 
  bayes = NaiveBayesMultiClass(topics)
- bayes.load('bayes/')
+ bayes.load('weights/bayes/')
-
-
-
- def get_tags_multiple_bert(texts, threshold=0.5):
-     probs = bert(texts)
-     tags = []
-     for line in probs:
-         found = []
-         for p, label in zip(line, topics):
-             if p['score'] >= threshold:
-                 found.append(label)
-         tags.append(found)
-     return tags
-
-
- def get_tags_bayes(text):
-     return bayes.predict(clean(text), True)
-
-
- def get_tags_cnn(text, threshold=0.5):
-     return []
-
-
- def get_tags_lstm(text, threshold=0.5):
-     probs = lstm(text)[0]
-     tags = []
-     for p, label in zip(probs, topics):
-         if p >= threshold:
-             tags.append(label)
-     return tags
-
 
- def get_tags_bert(text, threshold=0.5):
-     tags = get_tags_multiple_bert([text], threshold)[0]
-     return tags
 
  def expand(tags):
      with_primary = set()
@@ -201,13 +60,13 @@ def format_as_markdown(predictions: dict) -> str:
 
  def classify_text(classifier, text, threshold, output_format):
      if classifier == 'Transformer':
-         tags = get_tags_bert(text, threshold)
+         tags = get_tags_bert(bert, text, threshold)
      elif classifier == 'CNN':
-         tags = get_tags_cnn(text, threshold)
+         tags = []
      elif classifier == 'LSTM':
-         tags = get_tags_lstm(text, threshold)
+         tags = get_tags_lstm(lstm, text, threshold)
      else:
-         tags = get_tags_bayes(text)
+         tags = get_tags_bayes(bayes, text)
 
      tags = expand(tags)
      predictions = {tag: topics_full[tag] for tag in tags if tag in topics_full}
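
After the restructuring, app.py only wires the models together; every helper now takes its model as an explicit argument. A minimal smoke test sketch, assuming it runs from the repo root; the question string, the 'Naive Bayes' label for the fallback branch, and the 'Markdown' output format are illustrative, since classify_text's return/formatting tail is outside these hunks:

    from app import classify_text  # note: importing app triggers the model downloads

    question = "A wave travels 30 m in 2 s. Calculate its speed."
    # The four branches map to models/bert.py, the cnn stub,
    # models/lstm.py, and models/bayes.py (the else fallback).
    for clf in ['Transformer', 'CNN', 'LSTM', 'Naive Bayes']:
        print(clf, classify_text(clf, question, 0.5, 'Markdown'))
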
bayes.py → models/bayes.py RENAMED
@@ -1,44 +1,46 @@
  from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.naive_bayes import MultinomialNB
  from sklearn.preprocessing import MultiLabelBinarizer
  import typing
  import joblib
+ from utils.preprocessing import clean
 
 
  class NaiveBayesMultiClass(object):
      def __init__(self, classes: typing.Iterable[str]):
          self.classes = list(classes)
          self.n_classes = len(self.classes)
          self.enc = MultiLabelBinarizer()
          self.enc.fit([classes])
          self.vectorizer = CountVectorizer()
          self.classifiers = []
 
      def load(self, path: str):
          self.vectorizer = joblib.load(f'{path}/vectorizer.joblib')
          self.classifiers = [
              joblib.load(f'{path}/class_{i}.joblib') for i in range(self.n_classes)
          ]
 
      def predict(self, X: typing.Iterable[str] | str, get_tags=False):
          if type(X) == str:
              return self.predict([X], get_tags)[0]
          x = self.vectorizer.transform(X)
          by_class = [self.classifiers[i].predict(x) for i in range(self.n_classes)]
          ans = []
 
          for i in range(len(X)):
              y = []
              for j, cls in enumerate(self.classes):
                  if get_tags:
                      if by_class[j][i]:
                          y.append(cls)
                  else:
                      y.append(by_class[j][i])
              ans.append(y)
          return ans
 
      def __call__(self, *args, **kwargs):
          return self.predict(*args, **kwargs)
 
 
+ def get_tags_bayes(model, text):
+     return model.predict(clean(text), True)
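
The Bayes model is now passed in explicitly instead of living as a module-level global in app.py. A usage sketch mirroring the wiring in app.py; the example question and printed tag are illustrative:

    from utils.consts import topics
    from models.bayes import NaiveBayesMultiClass, get_tags_bayes

    bayes = NaiveBayesMultiClass(topics)   # one binary classifier per subtopic
    bayes.load('weights/bayes/')           # reads vectorizer.joblib + class_{i}.joblib
    # get_tags_bayes cleans the text first, then returns the labels whose
    # per-class classifier predicted 1 (the get_tags=True path in predict()).
    print(get_tags_bayes(bayes, "Calculate the current through the resistor."))
    # e.g. ['4.3'] if only the electric-circuits classifier fires
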
models/bert.py ADDED
@@ -0,0 +1,27 @@
+ from transformers import pipeline
+ from utils.consts import topics
+ 
+ def get_bert_pipeline():
+     return pipeline(
+         "text-classification",
+         model="Lxz20071231/igcse-physics-bert",
+         tokenizer="distilbert-base-uncased",
+         return_all_scores=True,
+         function_to_apply="sigmoid",
+         truncation=True
+     )
+ 
+ def get_tags_multiple_bert(model, texts, threshold=0.5):
+     probs = model(texts)
+     tags = []
+     for line in probs:
+         found = []
+         for p, label in zip(line, topics):
+             if p['score'] >= threshold:
+                 found.append(label)
+         tags.append(found)
+     return tags
+ 
+ def get_tags_bert(model, text, threshold=0.5):
+     tags = get_tags_multiple_bert(model, [text], threshold)[0]
+     return tags
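
models/bert.py wraps the Hugging Face pipeline behind a factory so app.py no longer constructs it inline. A sketch of the intended call pattern; the example text and output are illustrative:

    from models.bert import get_bert_pipeline, get_tags_bert

    bert = get_bert_pipeline()  # fetches Lxz20071231/igcse-physics-bert on first use
    # function_to_apply="sigmoid" makes the 24 scores independent, so this is
    # multi-label: several topics can clear the threshold for one question.
    print(get_tags_bert(bert, "State the unit of momentum.", threshold=0.5))
    # e.g. ['1.6']
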
models/cnn.py ADDED
@@ -0,0 +1,2 @@
+ def get_tags_cnn(model, text, threshold=0.5):
+     return []
lstm.py → models/lstm.py RENAMED
@@ -1,174 +1,182 @@
  from torch.nn.utils.rnn import pad_sequence
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
  from torch.nn.utils.rnn import pack_padded_sequence
  from types import SimpleNamespace
  from huggingface_hub import PyTorchModelHubMixin
  from transformers import Pipeline
-
+ from utils.consts import topics
 
  def get_words(model, text: str):
      """
      Break text into tokens using FastText's internal tokenizer.
      """
      lines = [model.get_line(line)[0] for line in text.split("\n")]
      words = []
      for line in lines:
          for w in line[:-1]:
              words.append(w)
      return words
 
 
  def get_vectors(model, text: str):
      """
      Convert text → list of embedding vectors.
      """
      words = get_words(model, text)
      vectors = [model[w] for w in words]
      return vectors
 
 
  def get_tensor(model, text: str):
      """
      Convert text → (seq_len, embedding_dim) tensor
      """
      vectors = get_vectors(model, text)
      if len(vectors) == 0:
          # fallback for empty text
          return torch.zeros(1, model.get_dimension())
      return torch.tensor(vectors, dtype=torch.float)
 
 
  def preprocess_batch(embedder, texts):
      """
      Convert a list of text strings into:
          x_padded: (batch, seq_len, emb_dim)
          lengths: (batch,)
      Both sorted by sequence length (DESC) for pack_padded_sequence.
      """
 
      # Convert each text → tensor
      seq_tensors = [get_tensor(embedder, t) for t in texts]
 
      # Compute lengths BEFORE padding
      lengths = torch.tensor([seq.size(0) for seq in seq_tensors], dtype=torch.long)
 
      # Sort by length (DESC)
      lengths_sorted, sort_idx = torch.sort(lengths, descending=True)
      seq_tensors = [seq_tensors[i] for i in sort_idx]
 
      # Pad to create (batch, max_seq_len, emb_dim)
      x_padded = pad_sequence(seq_tensors, batch_first=True)
 
      return x_padded, lengths_sorted
 
 
  class LSTMMultiClassClassifier(nn.Module, PyTorchModelHubMixin):
      def __init__(self, embedding_dim, hidden_dim, num_classes,
                   num_layers=1, bidirectional=True, dropout=0.5, **kwargs):
          super().__init__()
 
          # REQUIRED for HuggingFace Pipeline
          self.device = torch.device("cpu")
 
          # Save config
          self.config = SimpleNamespace(
              embedding_dim=embedding_dim,
              hidden_dim=hidden_dim,
              num_classes=num_classes,
              num_layers=num_layers,
              bidirectional=bidirectional,
              dropout=dropout
          )
 
          self.embedding_dim = embedding_dim
          self.hidden_dim = hidden_dim
          self.num_layers = num_layers
          self.bidirectional = bidirectional
          self.dropout = dropout
          self.num_classes = num_classes
 
          self.lstm = nn.LSTM(
              input_size=embedding_dim,
              hidden_size=hidden_dim,
              num_layers=num_layers,
              batch_first=True,
              dropout=dropout if num_layers > 1 else 0,
              bidirectional=bidirectional
          )
 
          direction = 2 if bidirectional else 1
          self.fc = nn.Sequential(
              nn.Linear(hidden_dim * direction, 128),
              nn.ReLU(),
              nn.Linear(128, 128),
              nn.ReLU(),
              nn.Linear(128, num_classes)
          )
 
      @classmethod
      def from_config(cls, config):
          return cls(
              embedding_dim=config.embedding_dim,
              hidden_dim=config.hidden_dim,
              num_classes=config.num_classes,
              num_layers=config.num_layers,
              bidirectional=config.bidirectional,
              dropout=config.dropout
          )
 
      # REQUIRED for Transformers Pipeline (updates internal device)
      def to(self, device):
          super().to(device)
          self.device = device
          return self
 
      def forward(self, x, lengths):
          x = x.to(self.device)
          lengths = lengths.to(self.device)
 
          packed = pack_padded_sequence(
              x, lengths.cpu(), batch_first=True, enforce_sorted=True
          )
          _, (h_n, _) = self.lstm(packed)
 
          if self.bidirectional:
              h = torch.cat((h_n[-2], h_n[-1]), dim=1)
          else:
              h = h_n[-1]
 
          return self.fc(h)
 
 
  class LSTMPipeline(Pipeline):
      def __init__(self, id2label, embedder, **kwargs):
          model = LSTMMultiClassClassifier.from_pretrained(
              "Lxz20071231/igcse-physics-lstm"
          )
          super().__init__(model=model, tokenizer=None, **kwargs)
          self.id2label = id2label
          self.embedder = embedder
 
      def preprocess(self, inputs):
          if isinstance(inputs, str):
              texts = [inputs]
          else:
              texts = list(inputs)
          x, lengths = preprocess_batch(self.embedder, texts)
          return {"x": x, "lengths": lengths}
 
      def _forward(self, model_inputs):
          x = model_inputs["x"]
          lengths = model_inputs["lengths"]
          with torch.no_grad():
              logits = self.model(x, lengths)
          return logits
 
      def postprocess(self, logits):
          probs = F.sigmoid(logits)
 
          return probs
 
      def _sanitize_parameters(self, **kwargs):
          return {}, {}, {}
+
+ def get_tags_lstm(model, text, threshold=0.5):
+     probs = model(text)[0]
+     tags = []
+     for p, label in zip(probs, topics):
+         if p >= threshold:
+             tags.append(label)
+     return tags
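
The pipeline itself is unchanged by the move; the diff only adds the topics import and the get_tags_lstm helper. A wiring sketch mirroring app.py, assuming `embedder` is the fastText model app.py loads via hf_hub_download + fasttext.load_model (the example question is illustrative):

    from utils.consts import topics
    from models.lstm import LSTMPipeline, get_tags_lstm

    id2label = {i: topics[i] for i in range(len(topics))}
    lstm = LSTMPipeline(embedder=embedder, id2label=id2label, device=-1)  # -1 = CPU
    # postprocess() returns per-topic sigmoid probabilities; the helper keeps
    # every topic whose probability clears the threshold.
    print(get_tags_lstm(lstm, "Describe the energy transfers in a pendulum.", threshold=0.5))
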
utils/consts.py ADDED
@@ -0,0 +1,59 @@
+ topics = [
+     '1.1',
+     '1.2',
+     '1.3',
+     '1.4',
+     '1.5',
+     '1.6',
+     '1.7',
+     '1.8',
+     '2.1',
+     '2.2',
+     '2.3',
+     '3.1',
+     '3.2',
+     '3.3',
+     '3.4',
+     '4.1',
+     '4.2',
+     '4.3',
+     '4.4',
+     '4.5',
+     '5.1',
+     '5.2',
+     '6.1',
+     '6.2',
+ ]
+ 
+ topics_full = {
+     '1': 'Motion, forces and energy',
+     '1.1': 'Physical quantities and measurement techniques',
+     '1.2': 'Motion',
+     '1.3': 'Mass and weight',
+     '1.4': 'Density',
+     '1.5': 'Forces',
+     '1.6': 'Momentum',
+     '1.7': 'Energy, work and power',
+     '1.8': 'Pressure',
+     '2': 'Thermal physics',
+     '2.1': 'Kinetic particle model of matter',
+     '2.2': 'Thermal properties and temperature',
+     '2.3': 'Transfer of thermal energy',
+     '3': 'Waves',
+     '3.1': 'General properties of waves',
+     '3.2': 'Light',
+     '3.3': 'Electromagnetic spectrum',
+     '3.4': 'Sound',
+     '4': 'Electricity and magnetism',
+     '4.1': 'Simple phenomena of magnetism',
+     '4.2': 'Electrical quantities',
+     '4.3': 'Electric circuits',
+     '4.4': 'Electrical safety',
+     '4.5': 'Electromagnetic effects',
+     '5': 'Nuclear physics',
+     '5.1': 'The nuclear model of the atom',
+     '5.2': 'Radioactivity',
+     '6': 'Space physics',
+     '6.1': 'Earth and the Solar System',
+     '6.2': 'Stars and the Universe',
+ }
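
Both tables are now shared by every model module. The primary keys ('1' through '6') are never predicted directly; judging by the `with_primary` accumulator in app.py's expand(), they let a predicted subtopic carry its parent syllabus section along. An illustrative lookup; the prefix-before-the-dot trick is an assumption about how expand() derives the parent:

    from utils.consts import topics, topics_full

    id2label = {i: topics[i] for i in range(len(topics))}  # as built in app.py
    print(id2label[0], '->', topics_full[id2label[0]])
    # 1.1 -> Physical quantities and measurement techniques

    # A subtopic's parent section is its prefix before the dot:
    print(topics_full['3.2'.split('.')[0]])
    # Waves
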
utils/preprocessing.py ADDED
@@ -0,0 +1,41 @@
+ import re
+ import string
+ from textblob import TextBlob
+ 
+ stopword = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
+             "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they",
+             "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those",
+             "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
+             "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at",
+             "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above",
+             "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then",
+             "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most",
+             "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t",
+             "can", "will", "just", "don", "should", "now"]
+ punctuations = string.punctuation
+ 
+ def to_lower(text: str) -> str:
+     return text.lower()
+ 
+ 
+ def remove_html_tags(text: str) -> str:
+     pattern = re.compile('<.*?>')
+     return pattern.sub(r'', text)
+ 
+ 
+ def remove_punctuations(text: str) -> str:
+     return text.translate(str.maketrans('', '', punctuations))
+ 
+ 
+ def correct_spellings(text: str) -> str:
+     return TextBlob(text).correct().string
+ 
+ 
+ def remove_stopwords(text: str) -> str:
+     return " ".join([word for word in text.split() if word not in stopword])
+ 
+ 
+ def clean(text: str) -> str:
+     return remove_stopwords(
+         correct_spellings(remove_punctuations(remove_html_tags(to_lower(text))))
+     )
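
clean() chains the stages lowercase → strip HTML → strip punctuation → TextBlob spell-correction → stopword removal. A worked example; the final string assumes TextBlob leaves already-correct words untouched:

    from utils.preprocessing import clean

    # "The <b>Forces</b> are acting!"
    #   to_lower            -> "the <b>forces</b> are acting!"
    #   remove_html_tags    -> "the forces are acting!"
    #   remove_punctuations -> "the forces are acting"
    #   correct_spellings   -> "the forces are acting"   (assumed no-op here)
    #   remove_stopwords    -> "forces acting"
    print(clean("The <b>Forces</b> are acting!"))
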
{bayes → weights/bayes}/class_0.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_1.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_10.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_11.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_12.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_13.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_14.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_15.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_16.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_17.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_18.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_19.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_2.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_20.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_21.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_22.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_23.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_3.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_4.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_5.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_6.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_7.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_8.joblib RENAMED
File without changes
{bayes → weights/bayes}/class_9.joblib RENAMED
File without changes
{bayes → weights/bayes}/vectorizer.joblib RENAMED
File without changes