hugpv committed
Commit 8789d72 · 1 Parent(s): 7af3e73

sync with dev version

Files changed (6):
  1. analysis_funcs.py +44 -25
  2. app.py +0 -0
  3. classic_correction_algos.py +1 -1
  4. multi_proc_funcs.py +121 -45
  5. popEye_funcs.py +58 -12
  6. utils.py +51 -42
analysis_funcs.py CHANGED
@@ -29,11 +29,19 @@ def fix_in_ia_default(fixation, ia_row, prefix):
     )
 
 
-def number_of_fixations_own(trial, dffix, prefix, correction_algo):
+def _get_interest_area_df(trial, prefix, ia_df=None):
+    """Return the interest-area dataframe, reusing a provided instance when available."""
+
+    if ia_df is not None:
+        return ia_df
+    return pd.DataFrame(trial[f"{prefix}s_list"])
+
+
+def number_of_fixations_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Return the number of fixations on that interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     counts = []
     for cidx, ia_row in ia_df.iterrows():
         count = 0
@@ -57,11 +65,11 @@ def number_of_fixations_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(counts)
 
 
-def initial_fixation_duration_own(trial, dffix, prefix, correction_algo):
+def initial_fixation_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     The duration of the initial fixation on that interest area for each word.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []
 
     for cidx, ia_row in ia_df.iterrows():
@@ -81,8 +89,8 @@ def initial_fixation_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)
 
 
-def first_of_many_duration_own(trial, dffix, prefix, correction_algo):
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+def first_of_many_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []
     for cidx, ia_row in ia_df.iterrows():
         fixation_durations = []
@@ -111,11 +119,11 @@ def first_of_many_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame()
 
 
-def total_fixation_duration_own(trial, dffix, prefix, correction_algo):
+def total_fixation_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     sum duration of all fixations on that interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
    durations = []
     for cidx, ia_row in ia_df.iterrows():
         total_duration = 0
@@ -132,12 +140,12 @@ def total_fixation_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)
 
 
-def gaze_duration_own(trial, dffix, prefix, correction_algo):
+def gaze_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Gaze duration is the sum duration of all fixations
     inside an interest area until the area is exited for the first time.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []
     for cidx, ia_row in ia_df.iterrows():
         duration = 0
@@ -158,7 +166,7 @@ def gaze_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)
 
 
-def go_past_duration_own(trial, dffix, prefix, correction_algo):
+def go_past_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return the go-past time on
     that interest area. Go-past time is the sum duration of all fixations from
@@ -166,7 +174,7 @@ def go_past_duration_own(trial, dffix, prefix, correction_algo):
     the right, including any regressions to the left that occur during that
     time period (and vice versa in the case of right-to-left text).
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     results = []
 
     for cidx, ia_row in ia_df.iterrows():
@@ -190,12 +198,12 @@ def go_past_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(results)
 
 
-def second_pass_duration_own(trial, dffix, prefix, correction_algo):
+def second_pass_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return the second pass
     duration on that interest area for each word.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []
 
     for cidx, ia_row in ia_df.iterrows():
@@ -224,13 +232,13 @@ def second_pass_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)
 
 
-def initial_landing_position_own(trial, dffix, prefix, correction_algo):
+def initial_landing_position_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     initial landing position (expressed in character positions) on that interest area.
     Counting is from 1. Returns `None` if no fixation
     landed on the interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     if prefix == "word":
         chars_df = pd.DataFrame(trial[f"chars_list"])
     else:
@@ -266,7 +274,7 @@ def initial_landing_position_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(results)
 
 
-def initial_landing_distance_own(trial, dffix, prefix, correction_algo):
+def initial_landing_distance_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return the initial landing
     distance on that interest area. The initial landing distance is the pixel
@@ -276,7 +284,7 @@ def initial_landing_distance_own(trial, dffix, prefix, correction_algo):
     without including any padding. Returns `None` if no fixation landed on the
     interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     distances = []
     for cidx, ia_row in ia_df.iterrows():
         initial_distance = None
@@ -296,12 +304,12 @@ def initial_landing_distance_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(distances)
 
 
-def landing_distances_own(trial, dffix, prefix, correction_algo):
+def landing_distances_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return a dataframe with
     landing distances for each word in the interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     distances = []
     for cidx, ia_row in ia_df.iterrows():
         landing_distances = []
@@ -319,11 +327,15 @@ def landing_distances_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(distances)
 
 
-def number_of_regressions_in_own(trial, dffix, prefix, correction_algo):
+def number_of_regressions_in_own(trial, dffix, prefix, correction_algo, ia_df=None):
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
+
+    if f"{prefix}_number" not in ia_df.columns:
+        ia_df = ia_df.reset_index().rename(columns={"index": f"{prefix}_number"})
+
     word_reg_in_count = (
-        dffix.groupby([f"on_{prefix}_number_{correction_algo}", f"on_{prefix}_{correction_algo}"])[
-            f"{prefix}_reg_in_{correction_algo}"
-        ]
+        dffix.groupby([f"on_{prefix}_number_{correction_algo}", f"on_{prefix}_{correction_algo}"])
+        [f"{prefix}_reg_in_{correction_algo}"]
         .sum()
         .reset_index()
         .rename(
@@ -335,4 +347,11 @@ def number_of_regressions_in_own(trial, dffix, prefix, correction_algo):
         )
     )
 
-    return word_reg_in_count
+    merged = ia_df[[f"{prefix}_number", prefix]].merge(
+        word_reg_in_count, how="left", on=[f"{prefix}_number", prefix]
+    )
+    merged[f"number_of_regressions_in_{correction_algo}"] = (
+        merged[f"number_of_regressions_in_{correction_algo}"].fillna(0).astype(int).tolist()
+    )
+
+    return merged
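Note: the `*_own` measure functions now accept an optional precomputed `ia_df`, so a caller can build the interest-area dataframe once per trial instead of once per measure. A minimal caller sketch (hypothetical function and algorithm names; assumes a `trial` dict with a `words_list` entry as used above):

    import pandas as pd

    import analysis_funcs as anf

    def compute_measures_for_trial(trial, dffix, prefix="word", correction_algo="warp"):
        # Build the interest-area dataframe once; every *_own call below reuses it.
        ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
        results = []
        for func in (anf.number_of_fixations_own, anf.total_fixation_duration_own):
            results.append(func(trial, dffix, prefix, correction_algo, ia_df=ia_df))
        return results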
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
classic_correction_algos.py CHANGED
@@ -58,7 +58,7 @@ def apply_classic_algo(
         corrected_fix_y_vals = compare(fixation_array, np.array(word_center_list), **algo_params)
     else:
         raise NotImplementedError(f"{algo} not implemented")
-    corrected_fix_y_vals = np.round(corrected_fix_y_vals, decimals=2)
+    corrected_fix_y_vals = np.round(corrected_fix_y_vals, decimals=0).astype(int).tolist()
     corrected_line_nums = [trial["y_char_unique"].index(y) for y in corrected_fix_y_vals]
    dffix[f"y_{algo}"] = corrected_fix_y_vals
    dffix[f"line_num_{algo}"] = corrected_line_nums
multi_proc_funcs.py CHANGED
@@ -16,6 +16,7 @@ from matplotlib.font_manager import FontProperties
 from matplotlib.patches import Rectangle
 from tqdm.auto import tqdm
 import torch as t
+import logging
 t.classes.__path__ = [] # https://discuss.streamlit.io/t/error-in-torch-with-streamlit/90908/3
 import plotly.express as px
 import copy
@@ -31,6 +32,29 @@ torch.multiprocessing.set_sharing_strategy('file_system') # Needed to make multi
 
 ic.configureOutput(includeContext=True)
 
+LOGGER = logging.getLogger(__name__)
+
+def ensure_dataframe(obj, *, label="data", context: str | None = None):
+    if isinstance(obj, pd.DataFrame):
+        return obj
+    if obj is None:
+        parts = [f"{label} is missing"]
+        if context:
+            parts.append(f"({context})")
+        message = " ".join(parts) + "."
+        LOGGER.error(message)
+        raise ValueError(message)
+    return pd.DataFrame(obj)
+
+
+def _make_hashable(value):
+    if isinstance(value, list):
+        return tuple(_make_hashable(v) for v in value)
+    if isinstance(value, dict):
+        return tuple(sorted((k, _make_hashable(v)) for k, v in value.items()))
+    return value
+
+
 PLOTS_FOLDER = pl.Path("plots")
 event_strs = [
     "EFIX",
@@ -970,7 +994,7 @@ def get_raw_events_df_and_trial(trial, discard_fixations_without_sfix):
 
     if "index" not in chars_df.columns:
         chars_df.reset_index(inplace=True)
-    trial["chars_df"] = chars_df.to_dict()
+    trial["chars_df"] = chars_df
     trial["y_char_unique"] = list(chars_df.char_y_center.sort_values().unique())
     return reorder_columns(events_df), trial
 
@@ -1429,7 +1453,7 @@ def get_all_measures(
     for measure in measures_to_calculate:
         if hasattr(anf, f"{measure}_own"):
             function = getattr(anf, f"{measure}_own")
-            result = function(trial, dffix_copy, prefix, correction_algo)
+            result = function(trial, dffix_copy, prefix, correction_algo, stim_df)
             res_dfs.append(result)
     dfs_list = [df for df in [stim_df] + res_dfs if not df.empty]
     own_measure_df = stim_df
@@ -1478,6 +1502,12 @@
     own_measure_df.to_csv(
         RESULTS_FOLDER / f"{trial['subject']}_{trial['trial_id']}_{correction_algo}_word_measures.csv"
     )
+
+    object_cols = own_measure_df.select_dtypes(include="object").columns
+    if len(object_cols) > 0:
+        for col in object_cols:
+            own_measure_df[col] = own_measure_df[col].map(_make_hashable)
+
     return own_measure_df
 
 
@@ -1838,7 +1868,7 @@ def get_DIST_preds(dffix, trial, models_dict):
         y_pred_DIST = [y_char_unique[idx] for idx in preds]
 
         dffix[f"line_num_{algo_choice}"] = preds
-        dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=2)
+        dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=0).astype(int).tolist()
         dffix[f"y_{algo_choice}_correction"] = (dffix.loc[:, f"y_{algo_choice}"] - dffix.loc[:, "y"]).round(2)
     except Exception as e:
         ic(f"Exception on model(batch) for DIST \n{e}")
@@ -1868,7 +1898,7 @@ def get_DIST_ensemble_preds(
     y_pred_DIST = [y_char_unique[idx] for idx in preds]
 
     dffix[f"line_num_{algo_choice}"] = preds
-    dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=1)
+    dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=0).astype(int).tolist()
     dffix[f"y_{algo_choice}_correction"] = (dffix.loc[:, f"y_{algo_choice}"] - dffix.loc[:, "y"]).round(1)
     return dffix
 
@@ -1989,7 +2019,7 @@ def add_popEye_cols_to_dffix(dffix, algo_choice, chars_df, trial, xcol, cols_to_
     dffix["angle_outgoing"] = angle_outgoing
     dffix[f"line_change_{algo_choice}"] = np.concatenate(
         ([0], np.diff(dffix[f"line_num_{algo_choice}"])), axis=0
-    ).astype(int)
+    ).astype(int).tolist()
 
     for i in list(dffix.index):
         if dffix.loc[i, f"line_num_{algo_choice}"] > -1 and not pd.isna(dffix.loc[i, f"line_num_{algo_choice}"]):
@@ -2161,10 +2191,20 @@ def correct_df(
         algo_choices = [algo_choice]
         repeats = range(1)
 
-    chars_df = pd.DataFrame(trial["chars_df"]) if "chars_df" in trial else pd.DataFrame(trial["chars_list"])
+    chars_df_source = trial.get("chars_df")
+    if chars_df_source is None:
+        chars_df_source = trial.get("chars_list")
+    trial_context = f"trial {trial.get('trial_id', 'unknown')}"
+    if trial.get("subject"):
+        trial_context = f"{trial.get('subject')} / {trial_context}"
+    chars_df = ensure_dataframe(
+        chars_df_source,
+        label="Character stimulus data",
+        context=trial_context,
+    )
     if for_multi:
         own_word_measures_dfs_for_algo = []
-        own_sentence_measures_dfs_for_algo = []
+        own_sentence_measures_dfs_for_algo = []
         trial["average_y_corrections"] = []
     for algoIdx in stqdm(repeats, desc="Applying line-assignment algorithms"):
         algo_choice = algo_choices[algoIdx]
@@ -2180,7 +2220,11 @@
             fix_to_plot=["Uncorrected Fixations", "Corrected Fixations"],
             stim_info_to_plot=["Characters", "Word boxes"],
         )
-        savename = f"{trial['subject']}_{trial['trial_id']}_corr_{algo_choice}_fix.png"
+        subject_for_name = trial.get("subject") or (
+            pl.Path(trial["filename"]).stem if trial.get("filename") else "unknown_subject"
+        )
+        trial_id_for_name = trial.get("trial_id") or "unknown_trial"
+        savename = f"{subject_for_name}_{trial_id_for_name}_corr_{algo_choice}_fix.png"
         fig.savefig(RESULTS_FOLDER.joinpath(savename), dpi=300)
         plt.close(fig)
         dffix = add_popEye_cols_to_dffix(dffix, algo_choice, chars_df, trial, "x", cols_to_add=fix_cols_to_add)
@@ -2195,15 +2239,16 @@
                 measures_to_calculate=measures_to_calculate_multi_asc,
                 include_coords=include_coords_multi_asc,
             )
+            if 'item' not in own_word_measures.columns:
+                add_cols_from_trial(trial, own_word_measures, cols=["item", "condition", "trial_id", "subject"])
             own_word_measures_dfs_for_algo.append(own_word_measures)
-            sent_measures_multi = pf.compute_sentence_measures(
-                dffix, pd.DataFrame(trial["chars_df"]), algo_choice, sent_measures_to_calc_multi
-            )
-            own_sentence_measures_dfs_for_algo.append(sent_measures_multi)
+            if sent_measures_to_calc_multi:
+                sent_measures_multi = pf.compute_sentence_measures(dffix, chars_df, algo_choice, sent_measures_to_calc_multi)
+                own_sentence_measures_dfs_for_algo.append(sent_measures_multi)
 
     if for_multi and len(own_word_measures_dfs_for_algo) > 0:
         words_df = (
-            pd.DataFrame(trial["chars_df"])
+            chars_df
             .drop_duplicates(subset="in_word_number", keep="first")
             .loc[:, ["in_word_number", "in_word"]]
             .rename({"in_word_number": "word_number", "in_word": "word"}, axis=1)
@@ -2222,41 +2267,53 @@ def correct_df(
         )
         words_df = reorder_columns(words_df, ["subject", "trial_id", "item", "condition", "word_number", "word"])
 
-        sentence_df = (
-            pd.DataFrame(trial["chars_df"])
-            .drop_duplicates(subset="in_sentence_number", keep="first")
-            .loc[
-                :,
-                [
-                    "in_sentence_number",
-                    "in_sentence",
-                ],
-            ]
-            .rename({"in_sentence_number": "sentence_number", "in_sentence": "sentence"}, axis=1)
-            .reset_index(drop=True)
-        )
-        add_cols_from_trial(trial, sentence_df, cols=["item", "condition", "trial_id", "subject"])
-        sentence_df["subject_trialID"] = [
-            f"{id}_{num}" for id, num in zip(sentence_df["subject"], sentence_df["trial_id"])
-        ]
-        sentence_df = sentence_df.merge(
-            own_sentence_measures_dfs_for_algo[0],
-            how="left",
-            on=["item", "condition", "trial_id", "subject", "sentence_number", "sentence"],
-        )
-        for sent_measure_df in own_sentence_measures_dfs_for_algo[1:]:
-            sentence_df = sentence_df.merge(
-                sent_measure_df,
-                how="left",
-                on=["subject", "trial_id", "item", "condition", "sentence_number", "sentence", "number_of_words"],
-            )
-        sentence_df = reorder_columns(
-            sentence_df, ["subject", "trial_id", "item", "condition", "sentence_number", "sentence", "number_of_words"]
-        )
+        sentence_df = None
+        if own_sentence_measures_dfs_for_algo:
+            sentence_df = (
+                chars_df
+                .drop_duplicates(subset="in_sentence_number", keep="first")
+                .loc[
+                    :,
+                    [
+                        "in_sentence_number",
+                        "in_sentence",
+                    ],
+                ]
+                .rename({"in_sentence_number": "sentence_number", "in_sentence": "sentence"}, axis=1)
+                .reset_index(drop=True)
+            )
+            add_cols_from_trial(trial, sentence_df, cols=["item", "condition", "trial_id", "subject"])
+            sentence_df["subject_trialID"] = [
+                f"{id}_{num}" for id, num in zip(sentence_df["subject"], sentence_df["trial_id"])
+            ]
+            sentence_df = sentence_df.merge(
+                own_sentence_measures_dfs_for_algo[0],
+                how="left",
+                on=["item", "condition", "trial_id", "subject", "sentence_number", "sentence"],
+            )
+            for sent_measure_df in own_sentence_measures_dfs_for_algo[1:]:
+                sentence_df = sentence_df.merge(
+                    sent_measure_df,
+                    how="left",
+                    on=[
+                        "subject",
+                        "trial_id",
+                        "item",
+                        "condition",
+                        "sentence_number",
+                        "sentence",
+                        "number_of_words",
+                    ],
+                )
+            sentence_df = reorder_columns(
+                sentence_df,
+                ["subject", "trial_id", "item", "condition", "sentence_number", "sentence", "number_of_words"],
+            )
 
         trial["own_word_measures_dfs_for_algo"] = words_df
 
-        trial["own_sentence_measures_dfs_for_algo"] = sentence_df
+        if sentence_df is not None:
+            trial["own_sentence_measures_dfs_for_algo"] = sentence_df
     dffix = reorder_columns(dffix)
     if for_multi:
         return dffix
@@ -2313,10 +2370,25 @@ def process_trial_choice(
         merge_distance_threshold=merge_distance_threshold,
         discard_blinks=discard_blinks,
     )
+    if not trial.get("subject"):
+        filename = trial.get("filename")
+        if filename:
+            trial["subject"] = pl.Path(filename).stem
+
     if "chars_list" in trial:
-        chars_df = pd.DataFrame(trial["chars_df"])
+        chars_df_source = trial.get("chars_df")
+        if chars_df_source is None:
+            chars_df_source = trial.get("chars_list")
+        trial_context = f"trial {trial.get('trial_id', 'unknown')}"
+        if trial.get("subject"):
+            trial_context = f"{trial.get('subject')} / {trial_context}"
+        chars_df = ensure_dataframe(
+            chars_df_source,
+            label="Character stimulus data",
+            context=trial_context,
+        )
 
-        trial["chars_df"] = chars_df.to_dict()
+        trial["chars_df"] = chars_df
         trial["y_char_unique"] = list(chars_df.char_y_center.sort_values().unique())
     if algo_choice is not None and ("chars_list" in trial or "words_list" in trial):
         if dffix.shape[0] > 1:
@@ -2335,10 +2407,14 @@ def process_trial_choice(
             )
 
         saccade_df = get_saccade_df(dffix, trial, algo_choice, trial.pop("events_df"))
-        trial["saccade_df"] = saccade_df.to_dict()
+        trial["saccade_df"] = saccade_df
 
         fig = plot_saccade_df(dffix, saccade_df, trial, True, False)
-        fig.savefig(RESULTS_FOLDER / f"{trial['subject']}_{trial['trial_id']}_saccades.png")
+        subject_for_name = trial.get("subject") or (
+            pl.Path(trial["filename"]).stem if trial.get("filename") else "unknown_subject"
+        )
+        trial_id_for_name = trial.get("trial_id") or "unknown_trial"
+        fig.savefig(RESULTS_FOLDER / f"{subject_for_name}_{trial_id_for_name}_saccades.png")
         plt.close(fig)
     else:
        ic(
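Note: two helpers carry most of this diff. `ensure_dataframe` normalizes the `trial["chars_df"]` payload (now stored as a DataFrame rather than a dict) and fails loudly with subject/trial context when it is missing; `_make_hashable` converts nested lists/dicts in object columns to tuples so the measures dataframe survives hash-based operations such as `drop_duplicates`. A behavior sketch with toy values (assumes the module imports cleanly in your environment):

    import pandas as pd

    from multi_proc_funcs import _make_hashable, ensure_dataframe

    # Dict payloads are wrapped; DataFrames pass through; None raises ValueError.
    df = ensure_dataframe({"char": ["a", "b"], "char_y_center": [126, 126]})
    assert isinstance(df, pd.DataFrame)

    # Nested containers become nested tuples, which are hashable.
    cell = _make_hashable({"spans": [[0, 3], [4, 7]]})
    assert cell == (("spans", ((0, 3), (4, 7))),)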
popEye_funcs.py CHANGED
@@ -1258,16 +1258,26 @@ def aggregate_trials(dffix_combined, wordcomb, all_trials_by_subj, algo_choices)
     trial = trial[names].copy()
 
     for index, row in trial.iterrows():
+        # Check if subject and trial exist in all_trials_by_subj
+        if row["subject"] not in all_trials_by_subj:
+            print(f"Warning: Subject '{row['subject']}' not found in all_trials_by_subj. Skipping trial {row['trial_id']}.")
+            continue
+        if row["trial_id"] not in all_trials_by_subj[row["subject"]]:
+            print(f"Warning: Trial '{row['trial_id']}' not found for subject '{row['subject']}'. Skipping.")
+            continue
+
         selected_trial = all_trials_by_subj[row["subject"]][row["trial_id"]]
         info_keys = [
             k for k in selected_trial.keys() if k in ["trial_start_time", "trial_end_time", "question_correct"]
         ]
-        if row["subject"] in all_trials_by_subj and row["trial_id"] in all_trials_by_subj[row["subject"]]:
-            if selected_trial["Fixation Cleaning Stats"]["Discard fixation before or after blinks"]:
-                trial.at[index, "blink"] = selected_trial["Fixation Cleaning Stats"][
-                    "Number of discarded fixations due to blinks"
-                ]
-            for key, value in selected_trial.items():
+
+        # Check if Fixation Cleaning Stats exists
+        if "Fixation Cleaning Stats" in selected_trial:
+            if selected_trial["Fixation Cleaning Stats"].get("Discard fixation before or after blinks", False):
+                trial.at[index, "blink"] = selected_trial["Fixation Cleaning Stats"].get(
+                    "Number of discarded fixations due to blinks", 0
+                )
+        for key, value in selected_trial.items():
             if key in info_keys:
                 trial.at[index, key] = value
 
@@ -1349,10 +1359,45 @@ def aggregate_trials(dffix_combined, wordcomb, all_trials_by_subj, algo_choices)
 
 
 def aggregate_subjects(trials, algo_choices):
-    trial_aggregates = trials.groupby("subject")[["nfix", "blink"]].mean().round(3).reset_index()
-    trial_aggregates = trial_aggregates.merge(
-        trials.groupby("subject")["question_correct"].sum().reset_index(name="n_question_correct"), on="subject"
-    )
+    base_cols = [col for col in ["nfix", "blink"] if col in trials.columns]
+    if base_cols:
+        trial_aggregates = trials.groupby("subject")[base_cols].mean().round(3).reset_index()
+    else:
+        trial_aggregates = trials[["subject"]].drop_duplicates().reset_index(drop=True)
+
+    if "question_correct" in trials.columns:
+        qc_series = trials["question_correct"].copy()
+        if qc_series.dtype == "object":
+            qc_series = qc_series.replace(
+                {
+                    "True": True,
+                    "true": True,
+                    "FALSE": False,
+                    "False": False,
+                    "false": False,
+                    "TRUE": True,
+                    "": pd.NA,
+                    None: pd.NA,
+                }
+            )
+        try:
+            qc_boolean = qc_series.astype("boolean")
+        except (TypeError, ValueError):
+            qc_boolean = qc_series.apply(lambda x: bool(x) if pd.notna(x) else pd.NA).astype("boolean")
+        qc_int = qc_boolean.astype("Int64")
+        question_counts = (
+            qc_int.groupby(trials["subject"])
+            .sum(min_count=0)
+            .fillna(0)
+            .astype("Int64")
+            .rename("n_question_correct")
+            .reset_index()
+        )
+        trial_aggregates = trial_aggregates.merge(question_counts, on="subject", how="left")
+        trial_aggregates["n_question_correct"] = trial_aggregates["n_question_correct"].astype("Int64")
+    else:
+        trial_aggregates["n_question_correct"] = pd.Series(0, index=trial_aggregates.index, dtype="Int64")
+
     trial_aggregates = trial_aggregates.merge(
         trials.groupby("subject")["trial_id"].count().reset_index(name="ntrial"), on="subject"
     )
@@ -1371,7 +1416,8 @@ def aggregate_subjects(trials, algo_choices):
         ]
         if c in trials.columns
     ]
-    trial_aggregates_temp = trials.groupby("subject")[cols_to_do].mean().round(3).reset_index()
-    trial_aggregates = pd.merge(trial_aggregates, trial_aggregates_temp, how="left", on="subject")
+    if cols_to_do:
+        trial_aggregates_temp = trials.groupby("subject")[cols_to_do].mean().round(3).reset_index()
+        trial_aggregates = pd.merge(trial_aggregates, trial_aggregates_temp, how="left", on="subject")
 
     return trial_aggregates
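Note: the new `question_correct` handling defends against mixed string/boolean/missing values coming out of combined ASC files. A toy demonstration of the same coercion path (made-up inputs):

    import pandas as pd

    qc = pd.Series(["True", "false", "", None, "TRUE"]).replace(
        {"True": True, "TRUE": True, "false": False, "": pd.NA, None: pd.NA}
    )
    qc_int = qc.astype("boolean").astype("Int64")
    print(qc_int.sum(min_count=0))  # 2: missing answers contribute nothing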
utils.py CHANGED
@@ -1,6 +1,7 @@
 import pickle
 from io import StringIO
 import re
+import traceback
 from typing import Dict, List
 import zipfile
 import os
@@ -199,6 +200,9 @@ def calc_xdiff_ydiff(line_xcoords_no_pad, line_ycoords_no_pad, line_heights, all
 
 def add_words(chars_list):
 
+    if not chars_list:
+        return [], []
+
     chars_list_reconstructed = []
     words_list = []
     sentence_list = []
@@ -222,8 +226,8 @@ def add_words(chars_list):
         next_c = chars_list[idx+1]["char"] if idx+1 < len(chars_list) else None
         if prev_c and prev_c.isdigit() and next_c and next_c.isdigit():
             return False
-        # Optionally: only treat as boundary if followed by space or end
-        if next_c and next_c not in [None, " ", "\n"]:
+        # Treat as boundary if followed by space, end, or an uppercase letter (for no-space cases like "one.Two")
+        if next_c and next_c not in [None, " ", "\n"] and not (next_c.isupper() and next_c.isalpha()):
             return False
         return True
     for idx, char_dict in enumerate(chars_list):
@@ -253,8 +257,8 @@ def add_words(chars_list):
         word_xmax = chars_list_reconstructed[-1]["char_xmax"]
         word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
         word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
-        word_x_center = round((word_xmax - word_xmin) / 2 + word_xmin, ndigits=2)
-        word_y_center = chars_list_reconstructed[word_start_idx]["char_y_center"]
+        word_x_center = int(round((word_xmax - word_xmin) / 2 + word_xmin))
+        word_y_center = int(round((word_ymax - word_ymin) / 2 + word_ymin))
         word_length = len(word)
         assigned_line = int(chars_list_reconstructed[word_start_idx]["assigned_line"])
         word_dict = dict(
@@ -325,12 +329,12 @@ def add_words(chars_list):
             word_ymax=words_list[-1]["word_ymax"],
             assigned_line=assigned_line,
         )
-        word_x_center = round(
-            (words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"], ndigits=2
-        )
-        word_y_center = round(
-            (words_list[-1]["word_ymax"] - word_dict["word_ymin"]) / 2 + words_list[-1]["word_ymin"], ndigits=2
-        )
+        word_x_center = int(round(
+            (words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"]
+        ))
+        word_y_center = int(round(
+            (words_list[-1]["word_ymax"] - word_dict["word_ymin"]) / 2 + words_list[-1]["word_ymin"]
+        ))
         words_list[-1]["word_x_center"] = word_x_center
         words_list[-1]["word_y_center"] = word_y_center
     else:
@@ -414,14 +418,14 @@ def read_ias_file(ias_file, prefix):
     if words_include_spaces:
         ias_df[f"{prefix}_length"] = ias_df[prefix].map(lambda x: len(x) + 1)
         ias_df[f"{prefix}_width_per_length"] = ias_df[f"{prefix}_width"] / ias_df[f"{prefix}_length"]
-        ias_df[f"{prefix}_xmax"] = (ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_width_per_length"]).round(2)
+        ias_df[f"{prefix}_xmax"] = [int(x) for x in (ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_width_per_length"]).round(0)]
 
-    ias_df[f"{prefix}_x_center"] = (
+    ias_df[f"{prefix}_x_center"] = [int(x) for x in (
         (ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_xmin"]) / 2 + ias_df[f"{prefix}_xmin"]
-    ).round(2)
-    ias_df[f"{prefix}_y_center"] = (
+    ).round(0)]
+    ias_df[f"{prefix}_y_center"] = [int(x) for x in (
         (ias_df[f"{prefix}_ymax"] - ias_df[f"{prefix}_ymin"]) / 2 + ias_df[f"{prefix}_ymin"]
-    ).round(2)
+    ).round(0)]
     unique_midlines = list(np.unique(ias_df[f"{prefix}_y_center"]))
     assigned_lines = [unique_midlines.index(x) for x in ias_df[f"{prefix}_y_center"]]
     ias_df["assigned_line"] = assigned_lines
@@ -440,41 +444,46 @@ def get_chars_list_from_words_list(ias_df, prefix="word"):
             char_dict = dict(
                 in_word_number=idx,
                 in_word=word,
-                char_xmin=round(row[f"{prefix}_xmin"] + i_w * letter_width, 2),
-                char_xmax=round(row[f"{prefix}_xmin"] + (i_w + 1) * letter_width, 2),
+                char_xmin=round(row[f"{prefix}_xmin"] + i_w * letter_width, 0),
+                char_xmax=round(row[f"{prefix}_xmin"] + (i_w + 1) * letter_width, 0),
                 char_ymin=row[f"{prefix}_ymin"],
                 char_ymax=row[f"{prefix}_ymax"],
                 char=letter,
             )
 
-            char_dict["char_x_center"] = round(
-                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"], ndigits=2
-            )
-            char_dict["char_y_center"] = round(
-                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"], ndigits=2
-            )
+            char_dict["char_x_center"] = int(round(
+                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
+            ))
+            char_dict["char_y_center"] = int(round(
+                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"]
+            ))
 
             if i_w >= len(word) + 1:
                 break
-            char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            try:
+                char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            except Exception as e:
+                ic(f"Error assigning line for char: {e}\n{traceback.format_exc()}")
             chars_list.append(char_dict)
         if chars_list[-1]["char"] != " " and row.assigned_line == next_row.assigned_line:
             char_dict = dict(
                 char_xmin=chars_list[-1]["char_xmax"],
-                char_xmax=round(chars_list[-1]["char_xmax"] + letter_width, 2),
+                char_xmax=round(chars_list[-1]["char_xmax"] + letter_width, 0),
                 char_ymin=row[f"{prefix}_ymin"],
                 char_ymax=row[f"{prefix}_ymax"],
                 char=" ",
            )
 
-            char_dict["char_x_center"] = round(
-                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"], ndigits=2
-            )
-            char_dict["char_y_center"] = round(
-                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"], ndigits=2
-            )
-
-            char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            char_dict["char_x_center"] = int(round(
+                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
+            ))
+            char_dict["char_y_center"] = int(round(
+                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"]
+            ))
+            try:
+                char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            except Exception as e:
+                ic(f"Error assigning line for space char: {e}\n{traceback.format_exc()}")
             chars_list.append(char_dict)
     chars_df = pd.DataFrame(chars_list)
     chars_df.loc[:, ["in_word_number", "in_word"]] = chars_df.loc[:, ["in_word_number", "in_word"]].copy().ffill(axis=0)
@@ -736,10 +745,10 @@ def asc_lines_to_trials_by_trail_id(
     if "question_correct" in trials_df.columns:
         paragraph_trials_df = trials_df.loc[trials_df.trial_is == "paragraph", :]
         overall_question_answer_value_counts = (
-            paragraph_trials_df["question_correct"].dropna().astype(int).value_counts().to_dict()
+            paragraph_trials_df["question_correct"].dropna().map(int).value_counts().to_dict()
         )
         overall_question_answer_value_counts_normed = (
-            paragraph_trials_df["question_correct"].dropna().astype(int).value_counts(normalize=True).to_dict()
+            paragraph_trials_df["question_correct"].dropna().map(int).value_counts(normalize=True).to_dict()
        )
     else:
         overall_question_answer_value_counts = None
@@ -841,12 +850,12 @@ def asc_lines_to_trials_by_trail_id(
                     "char_xmax": float(parts[rg_idx + 6 + idx_correction]),
                     "char_ymax": float(parts[rg_idx + 7 + idx_correction]),
                 }
-                char_dict["char_y_center"] = round(
-                    (char_dict["char_ymax"] - char_dict["char_ymin"]) / 2 + char_dict["char_ymin"], ndigits=2
-                )
-                char_dict["char_x_center"] = round(
-                    (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"], ndigits=2
-                )
+                char_dict["char_y_center"] = int(round(
+                    (char_dict["char_ymax"] - char_dict["char_ymin"]) / 2 + char_dict["char_ymin"]
+                ))
+                char_dict["char_x_center"] = int(round(
+                    (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
+                ))
                 chars_list.append(char_dict)
             except Exception as e:
                 ic(f"char_dict creation failed for parts {parts}")
@@ -884,7 +893,7 @@ def asc_lines_to_trials_by_trail_id(
             for idx in range(len(chars_list)):
                 chars_list[idx]["char_y_center"] = round(
                     (chars_list[idx]["char_ymax"] - chars_list[idx]["char_ymin"]) / 2 + chars_list[idx]["char_ymin"],
-                    ndigits=2,
+                    ndigits=0,
                )
                 if chars_list[idx]["char_y_center"] not in line_ycoords:
                     line_ycoords.append(chars_list[idx]["char_y_center"])
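Note: the adjusted boundary rule in add_words now also splits words at a period directly followed by an uppercase letter (for no-space cases like "one.Two") while keeping decimal numbers such as "3.14" intact. A simplified standalone sketch of the rule (hypothetical helper operating on a plain character list instead of the repo's char dicts):

    def is_boundary_like(chars, idx):
        prev_c = chars[idx - 1] if idx > 0 else None
        next_c = chars[idx + 1] if idx + 1 < len(chars) else None
        if prev_c and prev_c.isdigit() and next_c and next_c.isdigit():
            return False  # decimal point inside a number
        if next_c and next_c not in [None, " ", "\n"] and not (next_c.isupper() and next_c.isalpha()):
            return False  # mid-word period continues the word
        return True

    print(is_boundary_like(list("one.Two"), 3))  # True: uppercase follows the period
    print(is_boundary_like(list("3.14"), 1))     # False: digits on both sides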