sync with dev version
Changed files:
- analysis_funcs.py +44 -25
- app.py +0 -0
- classic_correction_algos.py +1 -1
- multi_proc_funcs.py +121 -45
- popEye_funcs.py +58 -12
- utils.py +51 -42
analysis_funcs.py
CHANGED
@@ -29,11 +29,19 @@ def fix_in_ia_default(fixation, ia_row, prefix):
     )


-def number_of_fixations_own(trial, dffix, prefix, correction_algo):
+def _get_interest_area_df(trial, prefix, ia_df=None):
+    """Return the interest-area dataframe, reusing a provided instance when available."""
+
+    if ia_df is not None:
+        return ia_df
+    return pd.DataFrame(trial[f"{prefix}s_list"])
+
+
+def number_of_fixations_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Return the number of fixations on that interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     counts = []
     for cidx, ia_row in ia_df.iterrows():
         count = 0
@@ -57,11 +65,11 @@ def number_of_fixations_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(counts)


-def initial_fixation_duration_own(trial, dffix, prefix, correction_algo):
+def initial_fixation_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     The duration of the initial fixation on that interest area for each word.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []

     for cidx, ia_row in ia_df.iterrows():
@@ -81,8 +89,8 @@ def initial_fixation_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)


-def first_of_many_duration_own(trial, dffix, prefix, correction_algo):
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+def first_of_many_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []
     for cidx, ia_row in ia_df.iterrows():
         fixation_durations = []
@@ -111,11 +119,11 @@ def first_of_many_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame()


-def total_fixation_duration_own(trial, dffix, prefix, correction_algo):
+def total_fixation_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     sum duration of all fixations on that interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []
     for cidx, ia_row in ia_df.iterrows():
         total_duration = 0
@@ -132,12 +140,12 @@ def total_fixation_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)


-def gaze_duration_own(trial, dffix, prefix, correction_algo):
+def gaze_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Gaze duration is the sum duration of all fixations
     inside an interest area until the area is exited for the first time.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []
     for cidx, ia_row in ia_df.iterrows():
         duration = 0
@@ -158,7 +166,7 @@ def gaze_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)


-def go_past_duration_own(trial, dffix, prefix, correction_algo):
+def go_past_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return the go-past time on
     that interest area. Go-past time is the sum duration of all fixations from
@@ -166,7 +174,7 @@ def go_past_duration_own(trial, dffix, prefix, correction_algo):
     the right, including any regressions to the left that occur during that
     time period (and vice versa in the case of right-to-left text).
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     results = []

     for cidx, ia_row in ia_df.iterrows():
@@ -190,12 +198,12 @@ def go_past_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(results)


-def second_pass_duration_own(trial, dffix, prefix, correction_algo):
+def second_pass_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return the second pass
     duration on that interest area for each word.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []

     for cidx, ia_row in ia_df.iterrows():
@@ -224,13 +232,13 @@ def second_pass_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)


-def initial_landing_position_own(trial, dffix, prefix, correction_algo):
+def initial_landing_position_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     initial landing position (expressed in character positions) on that interest area.
     Counting is from 1. Returns `None` if no fixation
     landed on the interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     if prefix == "word":
         chars_df = pd.DataFrame(trial[f"chars_list"])
     else:
@@ -266,7 +274,7 @@ def initial_landing_position_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(results)


-def initial_landing_distance_own(trial, dffix, prefix, correction_algo):
+def initial_landing_distance_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return the initial landing
     distance on that interest area. The initial landing distance is the pixel
@@ -276,7 +284,7 @@ def initial_landing_distance_own(trial, dffix, prefix, correction_algo):
     without including any padding. Returns `None` if no fixation landed on the
     interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     distances = []
     for cidx, ia_row in ia_df.iterrows():
         initial_distance = None
@@ -296,12 +304,12 @@ def initial_landing_distance_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(distances)


-def landing_distances_own(trial, dffix, prefix, correction_algo):
+def landing_distances_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return a dataframe with
     landing distances for each word in the interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     distances = []
     for cidx, ia_row in ia_df.iterrows():
         landing_distances = []
@@ -319,11 +327,15 @@ def landing_distances_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(distances)


-def number_of_regressions_in_own(trial, dffix, prefix, correction_algo):
+def number_of_regressions_in_own(trial, dffix, prefix, correction_algo, ia_df=None):
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
+
+    if f"{prefix}_number" not in ia_df.columns:
+        ia_df = ia_df.reset_index().rename(columns={"index": f"{prefix}_number"})
+
     word_reg_in_count = (
-        dffix.groupby([f"on_{prefix}_number_{correction_algo}", f"on_{prefix}_{correction_algo}"])[
-            f"{prefix}_reg_in_{correction_algo}"
-        ]
+        dffix.groupby([f"on_{prefix}_number_{correction_algo}", f"on_{prefix}_{correction_algo}"])
+        [f"{prefix}_reg_in_{correction_algo}"]
         .sum()
         .reset_index()
         .rename(
@@ -335,4 +347,11 @@ def number_of_regressions_in_own(trial, dffix, prefix, correction_algo):
         )
     )

-    return word_reg_in_count
+    merged = ia_df[[f"{prefix}_number", prefix]].merge(
+        word_reg_in_count, how="left", on=[f"{prefix}_number", prefix]
+    )
+    merged[f"number_of_regressions_in_{correction_algo}"] = (
+        merged[f"number_of_regressions_in_{correction_algo}"].fillna(0).astype(int).tolist()
+    )
+
+    return merged
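Note on the `_get_interest_area_df` change: previously each measure rebuilt the interest-area dataframe from `trial[f"{prefix}s_list"]`, so a trial scored with a dozen measures paid the construction cost a dozen times. The optional `ia_df` parameter lets the caller build it once and share it. A minimal standalone sketch of the pattern (the toy `trial` dict is hypothetical):

    import pandas as pd

    def _get_interest_area_df(trial, prefix, ia_df=None):
        # Reuse the caller-supplied dataframe when available; otherwise build it once.
        if ia_df is not None:
            return ia_df
        return pd.DataFrame(trial[f"{prefix}s_list"])

    # Hypothetical toy trial with two word interest areas.
    trial = {"words_list": [{"word": "The", "word_number": 0}, {"word": "cat", "word_number": 1}]}

    shared_ia_df = _get_interest_area_df(trial, "word")
    for _ in range(3):  # stands in for the per-measure loop in get_all_measures
        ia_df = _get_interest_area_df(trial, "word", shared_ia_df)
        assert ia_df is shared_ia_df  # same object, no re-construction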
app.py
CHANGED
The diff for this file is too large to render. See raw diff.
classic_correction_algos.py
CHANGED
@@ -58,7 +58,7 @@ def apply_classic_algo(
         corrected_fix_y_vals = compare(fixation_array, np.array(word_center_list), **algo_params)
     else:
         raise NotImplementedError(f"{algo} not implemented")
-    corrected_fix_y_vals = np.round(corrected_fix_y_vals, decimals=
+    corrected_fix_y_vals = np.round(corrected_fix_y_vals, decimals=0).astype(int).tolist()
     corrected_line_nums = [trial["y_char_unique"].index(y) for y in corrected_fix_y_vals]
     dffix[f"y_{algo}"] = corrected_fix_y_vals
     dffix[f"line_num_{algo}"] = corrected_line_nums
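The switch to `np.round(..., decimals=0).astype(int).tolist()` matters because the next line looks the corrected y value up with `trial["y_char_unique"].index(y)`, which needs an exact match against the stored line centers. A small sketch with made-up line centers:

    import numpy as np

    y_char_unique = [121, 177, 233]  # hypothetical line centers, plain Python ints
    corrected = np.array([121.2, 176.8, 232.9])

    # Cast to exact ints (and a plain list) so .index() lookups cannot miss and
    # the dffix columns stay integer-typed instead of holding float64 values.
    corrected_ints = np.round(corrected, decimals=0).astype(int).tolist()
    line_nums = [y_char_unique.index(y) for y in corrected_ints]
    print(corrected_ints, line_nums)  # [121, 177, 233] [0, 1, 2]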
multi_proc_funcs.py
CHANGED
@@ -16,6 +16,7 @@ from matplotlib.font_manager import FontProperties
 from matplotlib.patches import Rectangle
 from tqdm.auto import tqdm
 import torch as t
+import logging
 t.classes.__path__ = [] # https://discuss.streamlit.io/t/error-in-torch-with-streamlit/90908/3
 import plotly.express as px
 import copy
@@ -31,6 +32,29 @@ torch.multiprocessing.set_sharing_strategy('file_system') # Needed to make multi

 ic.configureOutput(includeContext=True)

+LOGGER = logging.getLogger(__name__)
+
+def ensure_dataframe(obj, *, label="data", context: str | None = None):
+    if isinstance(obj, pd.DataFrame):
+        return obj
+    if obj is None:
+        parts = [f"{label} is missing"]
+        if context:
+            parts.append(f"({context})")
+        message = " ".join(parts) + "."
+        LOGGER.error(message)
+        raise ValueError(message)
+    return pd.DataFrame(obj)
+
+
+def _make_hashable(value):
+    if isinstance(value, list):
+        return tuple(_make_hashable(v) for v in value)
+    if isinstance(value, dict):
+        return tuple(sorted((k, _make_hashable(v)) for k, v in value.items()))
+    return value
+
+
 PLOTS_FOLDER = pl.Path("plots")
 event_strs = [
     "EFIX",
@@ -970,7 +994,7 @@ def get_raw_events_df_and_trial(trial, discard_fixations_without_sfix):

     if "index" not in chars_df.columns:
         chars_df.reset_index(inplace=True)
-    trial["chars_df"] = chars_df
+    trial["chars_df"] = chars_df
     trial["y_char_unique"] = list(chars_df.char_y_center.sort_values().unique())
     return reorder_columns(events_df), trial
@@ -1429,7 +1453,7 @@ def get_all_measures(
     for measure in measures_to_calculate:
         if hasattr(anf, f"{measure}_own"):
             function = getattr(anf, f"{measure}_own")
-            result = function(trial, dffix_copy, prefix, correction_algo)
+            result = function(trial, dffix_copy, prefix, correction_algo, stim_df)
             res_dfs.append(result)
     dfs_list = [df for df in [stim_df] + res_dfs if not df.empty]
     own_measure_df = stim_df
@@ -1478,6 +1502,12 @@ def get_all_measures(
         own_measure_df.to_csv(
             RESULTS_FOLDER / f"{trial['subject']}_{trial['trial_id']}_{correction_algo}_word_measures.csv"
         )
+
+    object_cols = own_measure_df.select_dtypes(include="object").columns
+    if len(object_cols) > 0:
+        for col in object_cols:
+            own_measure_df[col] = own_measure_df[col].map(_make_hashable)
+
     return own_measure_df
@@ -1838,7 +1868,7 @@ def get_DIST_preds(dffix, trial, models_dict):
         y_pred_DIST = [y_char_unique[idx] for idx in preds]

         dffix[f"line_num_{algo_choice}"] = preds
-        dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=
+        dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=0).astype(int).tolist()
         dffix[f"y_{algo_choice}_correction"] = (dffix.loc[:, f"y_{algo_choice}"] - dffix.loc[:, "y"]).round(2)
     except Exception as e:
         ic(f"Exception on model(batch) for DIST \n{e}")
@@ -1868,7 +1898,7 @@ def get_DIST_ensemble_preds(
         y_pred_DIST = [y_char_unique[idx] for idx in preds]

         dffix[f"line_num_{algo_choice}"] = preds
-        dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=
+        dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=0).astype(int).tolist()
         dffix[f"y_{algo_choice}_correction"] = (dffix.loc[:, f"y_{algo_choice}"] - dffix.loc[:, "y"]).round(1)
     return dffix
@@ -1989,7 +2019,7 @@ def add_popEye_cols_to_dffix(dffix, algo_choice, chars_df, trial, xcol, cols_to_
     dffix["angle_outgoing"] = angle_outgoing
     dffix[f"line_change_{algo_choice}"] = np.concatenate(
         ([0], np.diff(dffix[f"line_num_{algo_choice}"])), axis=0
-    ).astype(int)
+    ).astype(int).tolist()

     for i in list(dffix.index):
         if dffix.loc[i, f"line_num_{algo_choice}"] > -1 and not pd.isna(dffix.loc[i, f"line_num_{algo_choice}"]):
@@ -2161,10 +2191,20 @@ def correct_df(
         algo_choices = [algo_choice]
         repeats = range(1)

-
+    chars_df_source = trial.get("chars_df")
+    if chars_df_source is None:
+        chars_df_source = trial.get("chars_list")
+    trial_context = f"trial {trial.get('trial_id', 'unknown')}"
+    if trial.get("subject"):
+        trial_context = f"{trial.get('subject')} / {trial_context}"
+    chars_df = ensure_dataframe(
+        chars_df_source,
+        label="Character stimulus data",
+        context=trial_context,
+    )
     if for_multi:
         own_word_measures_dfs_for_algo = []
-
+        own_sentence_measures_dfs_for_algo = []
         trial["average_y_corrections"] = []
     for algoIdx in stqdm(repeats, desc="Applying line-assignment algorithms"):
         algo_choice = algo_choices[algoIdx]
@@ -2180,7 +2220,11 @@ def correct_df(
             fix_to_plot=["Uncorrected Fixations", "Corrected Fixations"],
             stim_info_to_plot=["Characters", "Word boxes"],
         )
-        savename = f"{trial['subject']}_{trial['trial_id']}_corr_{algo_choice}_fix.png"
+        subject_for_name = trial.get("subject") or (
+            pl.Path(trial["filename"]).stem if trial.get("filename") else "unknown_subject"
+        )
+        trial_id_for_name = trial.get("trial_id") or "unknown_trial"
+        savename = f"{subject_for_name}_{trial_id_for_name}_corr_{algo_choice}_fix.png"
         fig.savefig(RESULTS_FOLDER.joinpath(savename), dpi=300)
         plt.close(fig)
         dffix = add_popEye_cols_to_dffix(dffix, algo_choice, chars_df, trial, "x", cols_to_add=fix_cols_to_add)
@@ -2195,15 +2239,16 @@ def correct_df(
             measures_to_calculate=measures_to_calculate_multi_asc,
             include_coords=include_coords_multi_asc,
         )
+        if 'item' not in own_word_measures.columns:
+            add_cols_from_trial(trial, own_word_measures, cols=["item", "condition", "trial_id", "subject"])
         own_word_measures_dfs_for_algo.append(own_word_measures)
-        sent_measures_multi = pf.compute_sentence_measures(
-            dffix,
-            chars_df, algo_choice, sent_measures_to_calc_multi)
-        own_sentence_measures_dfs_for_algo.append(sent_measures_multi)
+        if sent_measures_to_calc_multi:
+            sent_measures_multi = pf.compute_sentence_measures(dffix, chars_df, algo_choice, sent_measures_to_calc_multi)
+            own_sentence_measures_dfs_for_algo.append(sent_measures_multi)

     if for_multi and len(own_word_measures_dfs_for_algo) > 0:
         words_df = (
-            trial["chars_df"]
+            chars_df
             .drop_duplicates(subset="in_word_number", keep="first")
             .loc[:, ["in_word_number", "in_word"]]
             .rename({"in_word_number": "word_number", "in_word": "word"}, axis=1)
@@ -2222,41 +2267,53 @@ def correct_df(
         )
         words_df = reorder_columns(words_df, ["subject", "trial_id", "item", "condition", "word_number", "word"])

-        sentence_df = (
-            trial["chars_df"]
-            .drop_duplicates(subset="in_sentence_number", keep="first")
-            .loc[
-                :,
-                [
-                    "in_sentence_number",
-                    "in_sentence",
-                ],
-            ]
-            .rename({"in_sentence_number": "sentence_number", "in_sentence": "sentence"}, axis=1)
-            .reset_index(drop=True)
-        )
-        add_cols_from_trial(trial, sentence_df, cols=["item", "condition", "trial_id", "subject"])
-        sentence_df["subject_trialID"] = [
-            f"{id}_{num}" for id, num in zip(sentence_df["subject"], sentence_df["trial_id"])
-        ]
-        sentence_df = sentence_df.merge(
-            own_sentence_measures_dfs_for_algo[0],
-            how="left",
-            on=["item", "condition", "trial_id", "subject", "sentence_number", "sentence"],
-        )
-        for sent_measure_df in own_sentence_measures_dfs_for_algo[1:]:
-            sentence_df = sentence_df.merge(
-                sent_measure_df,
-                how="left",
-                on=["
-            )
-        sentence_df = reorder_columns(
-            sentence_df, ["subject", "trial_id", "item", "condition", "sentence_number", "sentence", "number_of_words"]
-        )
+        sentence_df = None
+        if own_sentence_measures_dfs_for_algo:
+            sentence_df = (
+                chars_df
+                .drop_duplicates(subset="in_sentence_number", keep="first")
+                .loc[
+                    :,
+                    [
+                        "in_sentence_number",
+                        "in_sentence",
+                    ],
+                ]
+                .rename({"in_sentence_number": "sentence_number", "in_sentence": "sentence"}, axis=1)
+                .reset_index(drop=True)
+            )
+            add_cols_from_trial(trial, sentence_df, cols=["item", "condition", "trial_id", "subject"])
+            sentence_df["subject_trialID"] = [
+                f"{id}_{num}" for id, num in zip(sentence_df["subject"], sentence_df["trial_id"])
+            ]
+            sentence_df = sentence_df.merge(
+                own_sentence_measures_dfs_for_algo[0],
+                how="left",
+                on=["item", "condition", "trial_id", "subject", "sentence_number", "sentence"],
+            )
+            for sent_measure_df in own_sentence_measures_dfs_for_algo[1:]:
+                sentence_df = sentence_df.merge(
+                    sent_measure_df,
+                    how="left",
+                    on=[
+                        "subject",
+                        "trial_id",
+                        "item",
+                        "condition",
+                        "sentence_number",
+                        "sentence",
+                        "number_of_words",
+                    ],
+                )
+            sentence_df = reorder_columns(
+                sentence_df,
+                ["subject", "trial_id", "item", "condition", "sentence_number", "sentence", "number_of_words"],
+            )

         trial["own_word_measures_dfs_for_algo"] = words_df

-        trial["own_sentence_measures_dfs_for_algo"] = sentence_df
+        if sentence_df is not None:
+            trial["own_sentence_measures_dfs_for_algo"] = sentence_df
     dffix = reorder_columns(dffix)
     if for_multi:
         return dffix
@@ -2313,10 +2370,25 @@ def process_trial_choice(
         merge_distance_threshold=merge_distance_threshold,
         discard_blinks=discard_blinks,
     )
+    if not trial.get("subject"):
+        filename = trial.get("filename")
+        if filename:
+            trial["subject"] = pl.Path(filename).stem
+
     if "chars_list" in trial:
-        chars_df = pd.DataFrame(trial["chars_list"])
+        chars_df_source = trial.get("chars_df")
+        if chars_df_source is None:
+            chars_df_source = trial.get("chars_list")
+        trial_context = f"trial {trial.get('trial_id', 'unknown')}"
+        if trial.get("subject"):
+            trial_context = f"{trial.get('subject')} / {trial_context}"
+        chars_df = ensure_dataframe(
+            chars_df_source,
+            label="Character stimulus data",
+            context=trial_context,
+        )

-    trial["chars_df"] = chars_df
+        trial["chars_df"] = chars_df
     trial["y_char_unique"] = list(chars_df.char_y_center.sort_values().unique())
     if algo_choice is not None and ("chars_list" in trial or "words_list" in trial):
         if dffix.shape[0] > 1:
@@ -2335,10 +2407,14 @@ def process_trial_choice(
             )

             saccade_df = get_saccade_df(dffix, trial, algo_choice, trial.pop("events_df"))
-            trial["saccade_df"] = saccade_df
+            trial["saccade_df"] = saccade_df

             fig = plot_saccade_df(dffix, saccade_df, trial, True, False)
-            fig.savefig(RESULTS_FOLDER / f"{trial['subject']}_{trial['trial_id']}_saccades.png")
+            subject_for_name = trial.get("subject") or (
+                pl.Path(trial["filename"]).stem if trial.get("filename") else "unknown_subject"
+            )
+            trial_id_for_name = trial.get("trial_id") or "unknown_trial"
+            fig.savefig(RESULTS_FOLDER / f"{subject_for_name}_{trial_id_for_name}_saccades.png")
             plt.close(fig)
         else:
             ic(
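Two of the new helpers are worth illustrating. `ensure_dataframe` normalizes the shapes a trial can carry (an already-built `chars_df`, a raw `chars_list` of record dicts, or nothing) and fails with a contextual message only in the last case. A usage sketch with hypothetical trial dicts:

    import logging
    import pandas as pd

    LOGGER = logging.getLogger(__name__)

    def ensure_dataframe(obj, *, label="data", context=None):
        # Pass dataframes through, build one from records, or raise with context.
        if isinstance(obj, pd.DataFrame):
            return obj
        if obj is None:
            message = f"{label} is missing" + (f" ({context})" if context else "") + "."
            LOGGER.error(message)
            raise ValueError(message)
        return pd.DataFrame(obj)

    # Hypothetical trials: one carries a raw list, the other an existing dataframe.
    trial_a = {"trial_id": "t1", "chars_list": [{"char": "T", "char_y_center": 121}]}
    trial_b = {"trial_id": "t2", "chars_df": pd.DataFrame({"char": ["x"], "char_y_center": [177]})}

    for trial in (trial_a, trial_b):
        source = trial.get("chars_df")
        if source is None:
            source = trial.get("chars_list")
        chars_df = ensure_dataframe(source, label="Character stimulus data", context=trial["trial_id"])
        print(trial["trial_id"], type(chars_df).__name__, len(chars_df))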
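`_make_hashable`, applied to the object columns of the measure table before it is returned, exists because columns that hold lists (for example per-word landing distances) break pandas operations that need hashable values, such as `drop_duplicates`. A short demo on made-up data:

    import pandas as pd

    def _make_hashable(value):
        # Lists and dicts are unhashable; convert them to nested tuples.
        if isinstance(value, list):
            return tuple(_make_hashable(v) for v in value)
        if isinstance(value, dict):
            return tuple(sorted((k, _make_hashable(v)) for k, v in value.items()))
        return value

    df = pd.DataFrame({"word": ["cat", "cat"], "landing_distances": [[1.5, 2.0], [1.5, 2.0]]})
    # df.drop_duplicates() here would raise TypeError: unhashable type: 'list'
    df["landing_distances"] = df["landing_distances"].map(_make_hashable)
    print(df.drop_duplicates())  # now works: one row remains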
popEye_funcs.py
CHANGED
@@ -1258,16 +1258,26 @@ def aggregate_trials(dffix_combined, wordcomb, all_trials_by_subj, algo_choices)
     trial = trial[names].copy()

     for index, row in trial.iterrows():
+        # Check if subject and trial exist in all_trials_by_subj
+        if row["subject"] not in all_trials_by_subj:
+            print(f"Warning: Subject '{row['subject']}' not found in all_trials_by_subj. Skipping trial {row['trial_id']}.")
+            continue
+        if row["trial_id"] not in all_trials_by_subj[row["subject"]]:
+            print(f"Warning: Trial '{row['trial_id']}' not found for subject '{row['subject']}'. Skipping.")
+            continue
+
         selected_trial = all_trials_by_subj[row["subject"]][row["trial_id"]]
         info_keys = [
             k for k in selected_trial.keys() if k in ["trial_start_time", "trial_end_time", "question_correct"]
         ]
-        if selected_trial["Fixation Cleaning Stats"]["Discard fixation before or after blinks"]:
-            trial.at[index, "blink"] = selected_trial["Fixation Cleaning Stats"][
-                "Number of discarded fixations due to blinks"
-            ]
-
-        for key, value in selected_trial.items():
+
+        # Check if Fixation Cleaning Stats exists
+        if "Fixation Cleaning Stats" in selected_trial:
+            if selected_trial["Fixation Cleaning Stats"].get("Discard fixation before or after blinks", False):
+                trial.at[index, "blink"] = selected_trial["Fixation Cleaning Stats"].get(
+                    "Number of discarded fixations due to blinks", 0
+                )
+        for key, value in selected_trial.items():
             if key in info_keys:
                 trial.at[index, key] = value
@@ -1349,10 +1359,45 @@ def aggregate_trials(dffix_combined, wordcomb, all_trials_by_subj, algo_choices)


 def aggregate_subjects(trials, algo_choices):
-    trial_aggregates = (
-        trials.groupby("subject")[["nfix", "blink", "question_correct"]]
-        .mean().round(3).reset_index()
-    )
+    base_cols = [col for col in ["nfix", "blink"] if col in trials.columns]
+    if base_cols:
+        trial_aggregates = trials.groupby("subject")[base_cols].mean().round(3).reset_index()
+    else:
+        trial_aggregates = trials[["subject"]].drop_duplicates().reset_index(drop=True)
+
+    if "question_correct" in trials.columns:
+        qc_series = trials["question_correct"].copy()
+        if qc_series.dtype == "object":
+            qc_series = qc_series.replace(
+                {
+                    "True": True,
+                    "true": True,
+                    "FALSE": False,
+                    "False": False,
+                    "false": False,
+                    "TRUE": True,
+                    "": pd.NA,
+                    None: pd.NA,
+                }
+            )
+        try:
+            qc_boolean = qc_series.astype("boolean")
+        except (TypeError, ValueError):
+            qc_boolean = qc_series.apply(lambda x: bool(x) if pd.notna(x) else pd.NA).astype("boolean")
+        qc_int = qc_boolean.astype("Int64")
+        question_counts = (
+            qc_int.groupby(trials["subject"])
+            .sum(min_count=0)
+            .fillna(0)
+            .astype("Int64")
+            .rename("n_question_correct")
+            .reset_index()
+        )
+        trial_aggregates = trial_aggregates.merge(question_counts, on="subject", how="left")
+        trial_aggregates["n_question_correct"] = trial_aggregates["n_question_correct"].astype("Int64")
+    else:
+        trial_aggregates["n_question_correct"] = pd.Series(0, index=trial_aggregates.index, dtype="Int64")
+
     trial_aggregates = trial_aggregates.merge(
         trials.groupby("subject")["trial_id"].count().reset_index(name="ntrial"), on="subject"
     )
@@ -1371,7 +1416,8 @@ def aggregate_subjects(trials, algo_choices):
     ]
     if c in trials.columns
     ]
-    trial_aggregates_temp = trials.groupby("subject")[cols_to_do].mean().round(3).reset_index()
-    trial_aggregates = pd.merge(trial_aggregates, trial_aggregates_temp, how="left", on="subject")
+    if cols_to_do:
+        trial_aggregates_temp = trials.groupby("subject")[cols_to_do].mean().round(3).reset_index()
+        trial_aggregates = pd.merge(trial_aggregates, trial_aggregates_temp, how="left", on="subject")

     return trial_aggregates
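The reworked `aggregate_trials` loop no longer assumes that every (subject, trial) pair and every "Fixation Cleaning Stats" entry exists; it skips or falls back to defaults instead of raising `KeyError`. The guarded-lookup pattern in isolation (the trial dicts below are hypothetical):

    trial_with_stats = {
        "Fixation Cleaning Stats": {
            "Discard fixation before or after blinks": True,
            "Number of discarded fixations due to blinks": 4,
        }
    }
    trial_without_stats = {}

    for selected_trial in (trial_with_stats, trial_without_stats):
        blink_count = 0
        # Direct indexing would raise KeyError on the second dict; .get() defaults instead.
        stats = selected_trial.get("Fixation Cleaning Stats", {})
        if stats.get("Discard fixation before or after blinks", False):
            blink_count = stats.get("Number of discarded fixations due to blinks", 0)
        print(blink_count)  # 4, then 0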
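The `aggregate_subjects` rewrite normalizes `question_correct`, which may arrive as Python bools, strings like "True" or "FALSE", or blanks, into pandas' nullable boolean dtype so that missing answers survive as `<NA>` rather than being coerced. A sketch on made-up values:

    import pandas as pd

    # Hypothetical raw column: mixed bools, strings, and a blank.
    qc = pd.Series([True, "False", "TRUE", "", None], dtype="object")

    qc = qc.replace({"True": True, "true": True, "TRUE": True,
                     "False": False, "false": False, "FALSE": False,
                     "": pd.NA, None: pd.NA})
    qc_boolean = qc.astype("boolean")    # nullable BooleanDtype, keeps <NA>
    qc_int = qc_boolean.astype("Int64")  # nullable ints: sum() skips <NA>

    print(qc_boolean.tolist())  # [True, False, True, <NA>, <NA>]
    print(int(qc_int.sum()))    # 2 correct answers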
utils.py
CHANGED
@@ -1,6 +1,7 @@
 import pickle
 from io import StringIO
 import re
+import traceback
 from typing import Dict, List
 import zipfile
 import os
@@ -199,6 +200,9 @@ def calc_xdiff_ydiff(line_xcoords_no_pad, line_ycoords_no_pad, line_heights, all

 def add_words(chars_list):

+    if not chars_list:
+        return [], []
+
     chars_list_reconstructed = []
     words_list = []
     sentence_list = []
@@ -222,8 +226,8 @@ def add_words(chars_list):
         next_c = chars_list[idx+1]["char"] if idx+1 < len(chars_list) else None
         if prev_c and prev_c.isdigit() and next_c and next_c.isdigit():
             return False
-        #
-        if next_c and next_c not in [None, " ", "\n"]:
+        # Treat as boundary if followed by space, end, or an uppercase letter (for no-space cases like "one.Two")
+        if next_c and next_c not in [None, " ", "\n"] and not (next_c.isupper() and next_c.isalpha()):
             return False
         return True
     for idx, char_dict in enumerate(chars_list):
@@ -253,8 +257,8 @@ def add_words(chars_list):
         word_xmax = chars_list_reconstructed[-1]["char_xmax"]
         word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
         word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
-        word_x_center = round((word_xmax - word_xmin) / 2 + word_xmin
-        word_y_center =
+        word_x_center = int(round((word_xmax - word_xmin) / 2 + word_xmin))
+        word_y_center = int(round((word_ymax - word_ymin) / 2 + word_ymin))
         word_length = len(word)
         assigned_line = int(chars_list_reconstructed[word_start_idx]["assigned_line"])
         word_dict = dict(
@@ -325,12 +329,12 @@ def add_words(chars_list):
             word_ymax=words_list[-1]["word_ymax"],
             assigned_line=assigned_line,
         )
-        word_x_center = round(
-            (words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"]
-        )
-        word_y_center = round(
-            (words_list[-1]["word_ymax"] - word_dict["word_ymin"]) / 2 + words_list[-1]["word_ymin"]
-        )
+        word_x_center = int(round(
+            (words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"]
+        ))
+        word_y_center = int(round(
+            (words_list[-1]["word_ymax"] - word_dict["word_ymin"]) / 2 + words_list[-1]["word_ymin"]
+        ))
         words_list[-1]["word_x_center"] = word_x_center
         words_list[-1]["word_y_center"] = word_y_center
     else:
@@ -414,14 +418,14 @@ def read_ias_file(ias_file, prefix):
     if words_include_spaces:
         ias_df[f"{prefix}_length"] = ias_df[prefix].map(lambda x: len(x) + 1)
         ias_df[f"{prefix}_width_per_length"] = ias_df[f"{prefix}_width"] / ias_df[f"{prefix}_length"]
-        ias_df[f"{prefix}_xmax"] = (ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_width_per_length"]).round(
+        ias_df[f"{prefix}_xmax"] = [int(x) for x in (ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_width_per_length"]).round(0)]

-    ias_df[f"{prefix}_x_center"] = (
+    ias_df[f"{prefix}_x_center"] = [int(x) for x in (
         (ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_xmin"]) / 2 + ias_df[f"{prefix}_xmin"]
-    ).round(
-    ias_df[f"{prefix}_y_center"] = (
+    ).round(0)]
+    ias_df[f"{prefix}_y_center"] = [int(x) for x in (
         (ias_df[f"{prefix}_ymax"] - ias_df[f"{prefix}_ymin"]) / 2 + ias_df[f"{prefix}_ymin"]
-    ).round(
+    ).round(0)]
     unique_midlines = list(np.unique(ias_df[f"{prefix}_y_center"]))
     assigned_lines = [unique_midlines.index(x) for x in ias_df[f"{prefix}_y_center"]]
     ias_df["assigned_line"] = assigned_lines
@@ -440,41 +444,46 @@ def get_chars_list_from_words_list(ias_df, prefix="word"):
             char_dict = dict(
                 in_word_number=idx,
                 in_word=word,
-                char_xmin=round(row[f"{prefix}_xmin"] + i_w * letter_width,
-                char_xmax=round(row[f"{prefix}_xmin"] + (i_w + 1) * letter_width,
+                char_xmin=round(row[f"{prefix}_xmin"] + i_w * letter_width, 0),
+                char_xmax=round(row[f"{prefix}_xmin"] + (i_w + 1) * letter_width, 0),
                 char_ymin=row[f"{prefix}_ymin"],
                 char_ymax=row[f"{prefix}_ymax"],
                 char=letter,
             )

-            char_dict["char_x_center"] = round(
-                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
-            )
-            char_dict["char_y_center"] = round(
-                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"]
-            )
+            char_dict["char_x_center"] = int(round(
+                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
+            ))
+            char_dict["char_y_center"] = int(round(
+                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"]
+            ))

             if i_w >= len(word) + 1:
                 break
-            char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            try:
+                char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            except Exception as e:
+                ic(f"Error assigning line for char: {e}\n{traceback.format_exc()}")
             chars_list.append(char_dict)
         if chars_list[-1]["char"] != " " and row.assigned_line == next_row.assigned_line:
             char_dict = dict(
                 char_xmin=chars_list[-1]["char_xmax"],
-                char_xmax=round(chars_list[-1]["char_xmax"] + letter_width,
+                char_xmax=round(chars_list[-1]["char_xmax"] + letter_width, 0),
                 char_ymin=row[f"{prefix}_ymin"],
                 char_ymax=row[f"{prefix}_ymax"],
                 char=" ",
             )

-            char_dict["char_x_center"] = round(
-                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
-            )
-            char_dict["char_y_center"] = round(
-                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"]
-            )
-
-            char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            char_dict["char_x_center"] = int(round(
+                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
+            ))
+            char_dict["char_y_center"] = int(round(
+                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"]
+            ))
+            try:
+                char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            except Exception as e:
+                ic(f"Error assigning line for space char: {e}\n{traceback.format_exc()}")
             chars_list.append(char_dict)
     chars_df = pd.DataFrame(chars_list)
     chars_df.loc[:, ["in_word_number", "in_word"]] = chars_df.loc[:, ["in_word_number", "in_word"]].copy().ffill(axis=0)
@@ -736,10 +745,10 @@ def asc_lines_to_trials_by_trail_id(
     if "question_correct" in trials_df.columns:
         paragraph_trials_df = trials_df.loc[trials_df.trial_is == "paragraph", :]
         overall_question_answer_value_counts = (
-            paragraph_trials_df["question_correct"].dropna().value_counts().to_dict()
+            paragraph_trials_df["question_correct"].dropna().map(int).value_counts().to_dict()
         )
         overall_question_answer_value_counts_normed = (
-            paragraph_trials_df["question_correct"].dropna().value_counts(normalize=True).to_dict()
+            paragraph_trials_df["question_correct"].dropna().map(int).value_counts(normalize=True).to_dict()
         )
     else:
         overall_question_answer_value_counts = None
@@ -841,12 +850,12 @@ def asc_lines_to_trials_by_trail_id(
                     "char_xmax": float(parts[rg_idx + 6 + idx_correction]),
                     "char_ymax": float(parts[rg_idx + 7 + idx_correction]),
                 }
-                char_dict["char_y_center"] = round(
-                    (char_dict["char_ymax"] - char_dict["char_ymin"]) / 2 + char_dict["char_ymin"]
-                )
-                char_dict["char_x_center"] = round(
-                    (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
-                )
+                char_dict["char_y_center"] = int(round(
+                    (char_dict["char_ymax"] - char_dict["char_ymin"]) / 2 + char_dict["char_ymin"]
+                ))
+                char_dict["char_x_center"] = int(round(
+                    (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
+                ))
                 chars_list.append(char_dict)
             except Exception as e:
                 ic(f"char_dict creation failed for parts {parts}")
@@ -884,7 +893,7 @@ def asc_lines_to_trials_by_trail_id(
     for idx in range(len(chars_list)):
         chars_list[idx]["char_y_center"] = round(
             (chars_list[idx]["char_ymax"] - chars_list[idx]["char_ymin"]) / 2 + chars_list[idx]["char_ymin"],
-            ndigits=
+            ndigits=0,
         )
         if chars_list[idx]["char_y_center"] not in line_ycoords:
            line_ycoords.append(chars_list[idx]["char_y_center"])
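The tightened boundary check in `add_words` treats a period as sentence-final also when the next character is an uppercase letter with no intervening space (e.g. "one.Two"), while still refusing boundaries inside numbers like "3.14". A standalone restatement of the heuristic (the `is_sentence_boundary` name is hypothetical):

    def is_sentence_boundary(chars, idx):
        # chars: list of single-character strings; idx points at a ".".
        prev_c = chars[idx - 1] if idx > 0 else None
        next_c = chars[idx + 1] if idx + 1 < len(chars) else None
        # "3.14": a digit on both sides means a decimal point, not a boundary.
        if prev_c and prev_c.isdigit() and next_c and next_c.isdigit():
            return False
        # Boundary if followed by space/newline/end, or by an uppercase letter
        # (covers missing-space typos such as "one.Two").
        if next_c and next_c not in [None, " ", "\n"] and not (next_c.isupper() and next_c.isalpha()):
            return False
        return True

    print(is_sentence_boundary(list("3.14"), 1))     # False: decimal point
    print(is_sentence_boundary(list("one.Two"), 3))  # True: uppercase follows
    print(is_sentence_boundary(list("e.g. x"), 1))   # False: lowercase 'g' follows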
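The recurring `int(round(...))` pattern across this file pins every `char_x_center` and `char_y_center` to a plain Python int, which matters because line assignment relies on exact membership tests (`unique_midlines.index(...)`, `not in line_ycoords`). A compact helper expressing the pattern (the `bbox_center` name is hypothetical):

    def bbox_center(lo: float, hi: float) -> int:
        # Midpoint of a bounding-box edge, pinned to an exact int so that
        # equality tests against collected line coordinates cannot miss.
        return int(round((hi - lo) / 2 + lo))

    # Hypothetical character boxes spanning two text lines.
    boxes = [(100.0, 142.4), (100.2, 142.4), (160.0, 202.6)]
    line_ycoords = []
    for ymin, ymax in boxes:
        c = bbox_center(ymin, ymax)
        if c not in line_ycoords:  # exact int comparison, no float jitter
            line_ycoords.append(c)
    print(line_ycoords)  # [121, 181]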