hugpv committed
Commit 8789d72 · 1 Parent(s): 7af3e73

sync with dev version

Files changed (6):
  1. analysis_funcs.py +44 -25
  2. app.py +0 -0
  3. classic_correction_algos.py +1 -1
  4. multi_proc_funcs.py +121 -45
  5. popEye_funcs.py +58 -12
  6. utils.py +51 -42
analysis_funcs.py CHANGED
@@ -29,11 +29,19 @@ def fix_in_ia_default(fixation, ia_row, prefix):
     )
 
 
-def number_of_fixations_own(trial, dffix, prefix, correction_algo):
+def _get_interest_area_df(trial, prefix, ia_df=None):
+    """Return the interest-area dataframe, reusing a provided instance when available."""
+
+    if ia_df is not None:
+        return ia_df
+    return pd.DataFrame(trial[f"{prefix}s_list"])
+
+
+def number_of_fixations_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Return the number of fixations on that interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     counts = []
     for cidx, ia_row in ia_df.iterrows():
         count = 0
@@ -57,11 +65,11 @@ def number_of_fixations_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(counts)
 
 
-def initial_fixation_duration_own(trial, dffix, prefix, correction_algo):
+def initial_fixation_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     The duration of the initial fixation on that interest area for each word.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []
 
     for cidx, ia_row in ia_df.iterrows():
@@ -81,8 +89,8 @@ def initial_fixation_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)
 
 
-def first_of_many_duration_own(trial, dffix, prefix, correction_algo):
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+def first_of_many_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []
     for cidx, ia_row in ia_df.iterrows():
         fixation_durations = []
@@ -111,11 +119,11 @@ def first_of_many_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame()
 
 
-def total_fixation_duration_own(trial, dffix, prefix, correction_algo):
+def total_fixation_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     sum duration of all fixations on that interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
    durations = []
     for cidx, ia_row in ia_df.iterrows():
         total_duration = 0
@@ -132,12 +140,12 @@ def total_fixation_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)
 
 
-def gaze_duration_own(trial, dffix, prefix, correction_algo):
+def gaze_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Gaze duration is the sum duration of all fixations
     inside an interest area until the area is exited for the first time.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []
     for cidx, ia_row in ia_df.iterrows():
         duration = 0
@@ -158,7 +166,7 @@ def gaze_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)
 
 
-def go_past_duration_own(trial, dffix, prefix, correction_algo):
+def go_past_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return the go-past time on
     that interest area. Go-past time is the sum duration of all fixations from
@@ -166,7 +174,7 @@ def go_past_duration_own(trial, dffix, prefix, correction_algo):
     the right, including any regressions to the left that occur during that
     time period (and vice versa in the case of right-to-left text).
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     results = []
 
     for cidx, ia_row in ia_df.iterrows():
@@ -190,12 +198,12 @@ def go_past_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(results)
 
 
-def second_pass_duration_own(trial, dffix, prefix, correction_algo):
+def second_pass_duration_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return the second pass
     duration on that interest area for each word.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     durations = []
 
     for cidx, ia_row in ia_df.iterrows():
@@ -224,13 +232,13 @@ def second_pass_duration_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(durations)
 
 
-def initial_landing_position_own(trial, dffix, prefix, correction_algo):
+def initial_landing_position_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     initial landing position (expressed in character positions) on that interest area.
     Counting is from 1. Returns `None` if no fixation
     landed on the interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     if prefix == "word":
         chars_df = pd.DataFrame(trial[f"chars_list"])
     else:
@@ -266,7 +274,7 @@ def initial_landing_position_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(results)
 
 
-def initial_landing_distance_own(trial, dffix, prefix, correction_algo):
+def initial_landing_distance_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return the initial landing
     distance on that interest area. The initial landing distance is the pixel
@@ -276,7 +284,7 @@ def initial_landing_distance_own(trial, dffix, prefix, correction_algo):
     without including any padding. Returns `None` if no fixation landed on the
     interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     distances = []
     for cidx, ia_row in ia_df.iterrows():
         initial_distance = None
@@ -296,12 +304,12 @@ def initial_landing_distance_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(distances)
 
 
-def landing_distances_own(trial, dffix, prefix, correction_algo):
+def landing_distances_own(trial, dffix, prefix, correction_algo, ia_df=None):
     """
     Given an interest area and fixation sequence, return a dataframe with
     landing distances for each word in the interest area.
     """
-    ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
     distances = []
     for cidx, ia_row in ia_df.iterrows():
         landing_distances = []
@@ -319,11 +327,15 @@ def landing_distances_own(trial, dffix, prefix, correction_algo):
     return pd.DataFrame(distances)
 
 
-def number_of_regressions_in_own(trial, dffix, prefix, correction_algo):
+def number_of_regressions_in_own(trial, dffix, prefix, correction_algo, ia_df=None):
+    ia_df = _get_interest_area_df(trial, prefix, ia_df)
+
+    if f"{prefix}_number" not in ia_df.columns:
+        ia_df = ia_df.reset_index().rename(columns={"index": f"{prefix}_number"})
+
     word_reg_in_count = (
-        dffix.groupby([f"on_{prefix}_number_{correction_algo}", f"on_{prefix}_{correction_algo}"])[
-            f"{prefix}_reg_in_{correction_algo}"
-        ]
+        dffix.groupby([f"on_{prefix}_number_{correction_algo}", f"on_{prefix}_{correction_algo}"])
+        [f"{prefix}_reg_in_{correction_algo}"]
         .sum()
         .reset_index()
         .rename(
@@ -335,4 +347,11 @@ def number_of_regressions_in_own(trial, dffix, prefix, correction_algo):
         )
     )
 
-    return word_reg_in_count
+    merged = ia_df[[f"{prefix}_number", prefix]].merge(
+        word_reg_in_count, how="left", on=[f"{prefix}_number", prefix]
+    )
+    merged[f"number_of_regressions_in_{correction_algo}"] = (
+        merged[f"number_of_regressions_in_{correction_algo}"].fillna(0).astype(int).tolist()
+    )
+
+    return merged
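Note: the `*_own` measure functions now accept an optional precomputed `ia_df`, so a caller can build the interest-area dataframe once per trial instead of once per measure. A minimal caller sketch (hypothetical function and algorithm names; assumes a `trial` dict with a `words_list` entry as used above):

    import pandas as pd

    import analysis_funcs as anf

    def compute_measures_for_trial(trial, dffix, prefix="word", correction_algo="warp"):
        # Build the interest-area dataframe once; every *_own call below reuses it.
        ia_df = pd.DataFrame(trial[f"{prefix}s_list"])
        results = []
        for func in (anf.number_of_fixations_own, anf.total_fixation_duration_own):
            results.append(func(trial, dffix, prefix, correction_algo, ia_df=ia_df))
        return results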
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
classic_correction_algos.py CHANGED
@@ -58,7 +58,7 @@ def apply_classic_algo(
         corrected_fix_y_vals = compare(fixation_array, np.array(word_center_list), **algo_params)
     else:
         raise NotImplementedError(f"{algo} not implemented")
-    corrected_fix_y_vals = np.round(corrected_fix_y_vals, decimals=2)
+    corrected_fix_y_vals = np.round(corrected_fix_y_vals, decimals=0).astype(int).tolist()
     corrected_line_nums = [trial["y_char_unique"].index(y) for y in corrected_fix_y_vals]
    dffix[f"y_{algo}"] = corrected_fix_y_vals
    dffix[f"line_num_{algo}"] = corrected_line_nums
multi_proc_funcs.py CHANGED
@@ -16,6 +16,7 @@ from matplotlib.font_manager import FontProperties
 from matplotlib.patches import Rectangle
 from tqdm.auto import tqdm
 import torch as t
+import logging
 t.classes.__path__ = [] # https://discuss.streamlit.io/t/error-in-torch-with-streamlit/90908/3
 import plotly.express as px
 import copy
@@ -31,6 +32,29 @@ torch.multiprocessing.set_sharing_strategy('file_system') # Needed to make multi
 
 ic.configureOutput(includeContext=True)
 
+LOGGER = logging.getLogger(__name__)
+
+def ensure_dataframe(obj, *, label="data", context: str | None = None):
+    if isinstance(obj, pd.DataFrame):
+        return obj
+    if obj is None:
+        parts = [f"{label} is missing"]
+        if context:
+            parts.append(f"({context})")
+        message = " ".join(parts) + "."
+        LOGGER.error(message)
+        raise ValueError(message)
+    return pd.DataFrame(obj)
+
+
+def _make_hashable(value):
+    if isinstance(value, list):
+        return tuple(_make_hashable(v) for v in value)
+    if isinstance(value, dict):
+        return tuple(sorted((k, _make_hashable(v)) for k, v in value.items()))
+    return value
+
+
 PLOTS_FOLDER = pl.Path("plots")
 event_strs = [
     "EFIX",
@@ -970,7 +994,7 @@ def get_raw_events_df_and_trial(trial, discard_fixations_without_sfix):
 
     if "index" not in chars_df.columns:
         chars_df.reset_index(inplace=True)
-    trial["chars_df"] = chars_df.to_dict()
+    trial["chars_df"] = chars_df
     trial["y_char_unique"] = list(chars_df.char_y_center.sort_values().unique())
     return reorder_columns(events_df), trial
 
@@ -1429,7 +1453,7 @@ def get_all_measures(
     for measure in measures_to_calculate:
         if hasattr(anf, f"{measure}_own"):
             function = getattr(anf, f"{measure}_own")
-            result = function(trial, dffix_copy, prefix, correction_algo)
+            result = function(trial, dffix_copy, prefix, correction_algo, stim_df)
             res_dfs.append(result)
     dfs_list = [df for df in [stim_df] + res_dfs if not df.empty]
     own_measure_df = stim_df
@@ -1478,6 +1502,12 @@
     own_measure_df.to_csv(
         RESULTS_FOLDER / f"{trial['subject']}_{trial['trial_id']}_{correction_algo}_word_measures.csv"
     )
+
+    object_cols = own_measure_df.select_dtypes(include="object").columns
+    if len(object_cols) > 0:
+        for col in object_cols:
+            own_measure_df[col] = own_measure_df[col].map(_make_hashable)
+
     return own_measure_df
 
 
@@ -1838,7 +1868,7 @@ def get_DIST_preds(dffix, trial, models_dict):
         y_pred_DIST = [y_char_unique[idx] for idx in preds]
 
         dffix[f"line_num_{algo_choice}"] = preds
-        dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=2)
+        dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=0).astype(int).tolist()
         dffix[f"y_{algo_choice}_correction"] = (dffix.loc[:, f"y_{algo_choice}"] - dffix.loc[:, "y"]).round(2)
     except Exception as e:
         ic(f"Exception on model(batch) for DIST \n{e}")
@@ -1868,7 +1898,7 @@ def get_DIST_ensemble_preds(
     y_pred_DIST = [y_char_unique[idx] for idx in preds]
 
     dffix[f"line_num_{algo_choice}"] = preds
-    dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=1)
+    dffix[f"y_{algo_choice}"] = np.round(y_pred_DIST, decimals=0).astype(int).tolist()
     dffix[f"y_{algo_choice}_correction"] = (dffix.loc[:, f"y_{algo_choice}"] - dffix.loc[:, "y"]).round(1)
     return dffix
 
@@ -1989,7 +2019,7 @@ def add_popEye_cols_to_dffix(dffix, algo_choice, chars_df, trial, xcol, cols_to_
     dffix["angle_outgoing"] = angle_outgoing
     dffix[f"line_change_{algo_choice}"] = np.concatenate(
         ([0], np.diff(dffix[f"line_num_{algo_choice}"])), axis=0
-    ).astype(int)
+    ).astype(int).tolist()
 
     for i in list(dffix.index):
         if dffix.loc[i, f"line_num_{algo_choice}"] > -1 and not pd.isna(dffix.loc[i, f"line_num_{algo_choice}"]):
@@ -2161,10 +2191,20 @@ def correct_df(
         algo_choices = [algo_choice]
         repeats = range(1)
 
-    chars_df = pd.DataFrame(trial["chars_df"]) if "chars_df" in trial else pd.DataFrame(trial["chars_list"])
+    chars_df_source = trial.get("chars_df")
+    if chars_df_source is None:
+        chars_df_source = trial.get("chars_list")
+    trial_context = f"trial {trial.get('trial_id', 'unknown')}"
+    if trial.get("subject"):
+        trial_context = f"{trial.get('subject')} / {trial_context}"
+    chars_df = ensure_dataframe(
+        chars_df_source,
+        label="Character stimulus data",
+        context=trial_context,
+    )
     if for_multi:
         own_word_measures_dfs_for_algo = []
-        own_sentence_measures_dfs_for_algo = []
+        own_sentence_measures_dfs_for_algo = []
         trial["average_y_corrections"] = []
     for algoIdx in stqdm(repeats, desc="Applying line-assignment algorithms"):
         algo_choice = algo_choices[algoIdx]
@@ -2180,7 +2220,11 @@
             fix_to_plot=["Uncorrected Fixations", "Corrected Fixations"],
             stim_info_to_plot=["Characters", "Word boxes"],
         )
-        savename = f"{trial['subject']}_{trial['trial_id']}_corr_{algo_choice}_fix.png"
+        subject_for_name = trial.get("subject") or (
+            pl.Path(trial["filename"]).stem if trial.get("filename") else "unknown_subject"
+        )
+        trial_id_for_name = trial.get("trial_id") or "unknown_trial"
+        savename = f"{subject_for_name}_{trial_id_for_name}_corr_{algo_choice}_fix.png"
         fig.savefig(RESULTS_FOLDER.joinpath(savename), dpi=300)
         plt.close(fig)
         dffix = add_popEye_cols_to_dffix(dffix, algo_choice, chars_df, trial, "x", cols_to_add=fix_cols_to_add)
@@ -2195,15 +2239,16 @@
                 measures_to_calculate=measures_to_calculate_multi_asc,
                 include_coords=include_coords_multi_asc,
             )
+            if 'item' not in own_word_measures.columns:
+                add_cols_from_trial(trial, own_word_measures, cols=["item", "condition", "trial_id", "subject"])
             own_word_measures_dfs_for_algo.append(own_word_measures)
-            sent_measures_multi = pf.compute_sentence_measures(
-                dffix, pd.DataFrame(trial["chars_df"]), algo_choice, sent_measures_to_calc_multi
-            )
-            own_sentence_measures_dfs_for_algo.append(sent_measures_multi)
+            if sent_measures_to_calc_multi:
+                sent_measures_multi = pf.compute_sentence_measures(dffix, chars_df, algo_choice, sent_measures_to_calc_multi)
+                own_sentence_measures_dfs_for_algo.append(sent_measures_multi)
 
     if for_multi and len(own_word_measures_dfs_for_algo) > 0:
         words_df = (
-            pd.DataFrame(trial["chars_df"])
+            chars_df
             .drop_duplicates(subset="in_word_number", keep="first")
             .loc[:, ["in_word_number", "in_word"]]
             .rename({"in_word_number": "word_number", "in_word": "word"}, axis=1)
@@ -2222,41 +2267,53 @@ def correct_df(
         )
         words_df = reorder_columns(words_df, ["subject", "trial_id", "item", "condition", "word_number", "word"])
 
-        sentence_df = (
-            pd.DataFrame(trial["chars_df"])
-            .drop_duplicates(subset="in_sentence_number", keep="first")
-            .loc[
-                :,
-                [
-                    "in_sentence_number",
-                    "in_sentence",
-                ],
-            ]
-            .rename({"in_sentence_number": "sentence_number", "in_sentence": "sentence"}, axis=1)
-            .reset_index(drop=True)
-        )
-        add_cols_from_trial(trial, sentence_df, cols=["item", "condition", "trial_id", "subject"])
-        sentence_df["subject_trialID"] = [
-            f"{id}_{num}" for id, num in zip(sentence_df["subject"], sentence_df["trial_id"])
-        ]
-        sentence_df = sentence_df.merge(
-            own_sentence_measures_dfs_for_algo[0],
-            how="left",
-            on=["item", "condition", "trial_id", "subject", "sentence_number", "sentence"],
-        )
-        for sent_measure_df in own_sentence_measures_dfs_for_algo[1:]:
-            sentence_df = sentence_df.merge(
-                sent_measure_df,
-                how="left",
-                on=["subject", "trial_id", "item", "condition", "sentence_number", "sentence", "number_of_words"],
-            )
-        sentence_df = reorder_columns(
-            sentence_df, ["subject", "trial_id", "item", "condition", "sentence_number", "sentence", "number_of_words"]
-        )
+        sentence_df = None
+        if own_sentence_measures_dfs_for_algo:
+            sentence_df = (
+                chars_df
+                .drop_duplicates(subset="in_sentence_number", keep="first")
+                .loc[
+                    :,
+                    [
+                        "in_sentence_number",
+                        "in_sentence",
+                    ],
+                ]
+                .rename({"in_sentence_number": "sentence_number", "in_sentence": "sentence"}, axis=1)
+                .reset_index(drop=True)
+            )
+            add_cols_from_trial(trial, sentence_df, cols=["item", "condition", "trial_id", "subject"])
+            sentence_df["subject_trialID"] = [
+                f"{id}_{num}" for id, num in zip(sentence_df["subject"], sentence_df["trial_id"])
+            ]
+            sentence_df = sentence_df.merge(
+                own_sentence_measures_dfs_for_algo[0],
+                how="left",
+                on=["item", "condition", "trial_id", "subject", "sentence_number", "sentence"],
+            )
+            for sent_measure_df in own_sentence_measures_dfs_for_algo[1:]:
+                sentence_df = sentence_df.merge(
+                    sent_measure_df,
+                    how="left",
+                    on=[
+                        "subject",
+                        "trial_id",
+                        "item",
+                        "condition",
+                        "sentence_number",
+                        "sentence",
+                        "number_of_words",
+                    ],
+                )
+            sentence_df = reorder_columns(
+                sentence_df,
+                ["subject", "trial_id", "item", "condition", "sentence_number", "sentence", "number_of_words"],
+            )
 
         trial["own_word_measures_dfs_for_algo"] = words_df
 
-        trial["own_sentence_measures_dfs_for_algo"] = sentence_df
+        if sentence_df is not None:
+            trial["own_sentence_measures_dfs_for_algo"] = sentence_df
     dffix = reorder_columns(dffix)
     if for_multi:
         return dffix
@@ -2313,10 +2370,25 @@ def process_trial_choice(
         merge_distance_threshold=merge_distance_threshold,
         discard_blinks=discard_blinks,
     )
+    if not trial.get("subject"):
+        filename = trial.get("filename")
+        if filename:
+            trial["subject"] = pl.Path(filename).stem
+
     if "chars_list" in trial:
-        chars_df = pd.DataFrame(trial["chars_df"])
+        chars_df_source = trial.get("chars_df")
+        if chars_df_source is None:
+            chars_df_source = trial.get("chars_list")
+        trial_context = f"trial {trial.get('trial_id', 'unknown')}"
+        if trial.get("subject"):
+            trial_context = f"{trial.get('subject')} / {trial_context}"
+        chars_df = ensure_dataframe(
+            chars_df_source,
+            label="Character stimulus data",
+            context=trial_context,
+        )
 
-        trial["chars_df"] = chars_df.to_dict()
+        trial["chars_df"] = chars_df
         trial["y_char_unique"] = list(chars_df.char_y_center.sort_values().unique())
     if algo_choice is not None and ("chars_list" in trial or "words_list" in trial):
         if dffix.shape[0] > 1:
@@ -2335,10 +2407,14 @@ def process_trial_choice(
             )
 
         saccade_df = get_saccade_df(dffix, trial, algo_choice, trial.pop("events_df"))
-        trial["saccade_df"] = saccade_df.to_dict()
+        trial["saccade_df"] = saccade_df
 
         fig = plot_saccade_df(dffix, saccade_df, trial, True, False)
-        fig.savefig(RESULTS_FOLDER / f"{trial['subject']}_{trial['trial_id']}_saccades.png")
+        subject_for_name = trial.get("subject") or (
+            pl.Path(trial["filename"]).stem if trial.get("filename") else "unknown_subject"
+        )
+        trial_id_for_name = trial.get("trial_id") or "unknown_trial"
+        fig.savefig(RESULTS_FOLDER / f"{subject_for_name}_{trial_id_for_name}_saccades.png")
         plt.close(fig)
     else:
        ic(
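Note: two helpers carry most of this diff. `ensure_dataframe` normalizes the `trial["chars_df"]` payload (now stored as a DataFrame rather than a dict) and fails loudly with subject/trial context when it is missing; `_make_hashable` converts nested lists/dicts in object columns to tuples so the measures dataframe survives hash-based operations such as `drop_duplicates`. A behavior sketch with toy values (assumes the module imports cleanly in your environment):

    import pandas as pd

    from multi_proc_funcs import _make_hashable, ensure_dataframe

    # Dict payloads are wrapped; DataFrames pass through; None raises ValueError.
    df = ensure_dataframe({"char": ["a", "b"], "char_y_center": [126, 126]})
    assert isinstance(df, pd.DataFrame)

    # Nested containers become nested tuples, which are hashable.
    cell = _make_hashable({"spans": [[0, 3], [4, 7]]})
    assert cell == (("spans", ((0, 3), (4, 7))),)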
popEye_funcs.py CHANGED
@@ -1258,16 +1258,26 @@ def aggregate_trials(dffix_combined, wordcomb, all_trials_by_subj, algo_choices)
     trial = trial[names].copy()
 
     for index, row in trial.iterrows():
+        # Check if subject and trial exist in all_trials_by_subj
+        if row["subject"] not in all_trials_by_subj:
+            print(f"Warning: Subject '{row['subject']}' not found in all_trials_by_subj. Skipping trial {row['trial_id']}.")
+            continue
+        if row["trial_id"] not in all_trials_by_subj[row["subject"]]:
+            print(f"Warning: Trial '{row['trial_id']}' not found for subject '{row['subject']}'. Skipping.")
+            continue
+
         selected_trial = all_trials_by_subj[row["subject"]][row["trial_id"]]
         info_keys = [
             k for k in selected_trial.keys() if k in ["trial_start_time", "trial_end_time", "question_correct"]
         ]
-        if row["subject"] in all_trials_by_subj and row["trial_id"] in all_trials_by_subj[row["subject"]]:
-            if selected_trial["Fixation Cleaning Stats"]["Discard fixation before or after blinks"]:
-                trial.at[index, "blink"] = selected_trial["Fixation Cleaning Stats"][
-                    "Number of discarded fixations due to blinks"
-                ]
-            for key, value in selected_trial.items():
+
+        # Check if Fixation Cleaning Stats exists
+        if "Fixation Cleaning Stats" in selected_trial:
+            if selected_trial["Fixation Cleaning Stats"].get("Discard fixation before or after blinks", False):
+                trial.at[index, "blink"] = selected_trial["Fixation Cleaning Stats"].get(
+                    "Number of discarded fixations due to blinks", 0
+                )
+        for key, value in selected_trial.items():
             if key in info_keys:
                 trial.at[index, key] = value
 
@@ -1349,10 +1359,45 @@ def aggregate_trials(dffix_combined, wordcomb, all_trials_by_subj, algo_choices)
 
 
 def aggregate_subjects(trials, algo_choices):
-    trial_aggregates = trials.groupby("subject")[["nfix", "blink"]].mean().round(3).reset_index()
-    trial_aggregates = trial_aggregates.merge(
-        trials.groupby("subject")["question_correct"].sum().reset_index(name="n_question_correct"), on="subject"
-    )
+    base_cols = [col for col in ["nfix", "blink"] if col in trials.columns]
+    if base_cols:
+        trial_aggregates = trials.groupby("subject")[base_cols].mean().round(3).reset_index()
+    else:
+        trial_aggregates = trials[["subject"]].drop_duplicates().reset_index(drop=True)
+
+    if "question_correct" in trials.columns:
+        qc_series = trials["question_correct"].copy()
+        if qc_series.dtype == "object":
+            qc_series = qc_series.replace(
+                {
+                    "True": True,
+                    "true": True,
+                    "FALSE": False,
+                    "False": False,
+                    "false": False,
+                    "TRUE": True,
+                    "": pd.NA,
+                    None: pd.NA,
+                }
+            )
+        try:
+            qc_boolean = qc_series.astype("boolean")
+        except (TypeError, ValueError):
+            qc_boolean = qc_series.apply(lambda x: bool(x) if pd.notna(x) else pd.NA).astype("boolean")
+        qc_int = qc_boolean.astype("Int64")
+        question_counts = (
+            qc_int.groupby(trials["subject"])
+            .sum(min_count=0)
+            .fillna(0)
+            .astype("Int64")
+            .rename("n_question_correct")
+            .reset_index()
+        )
+        trial_aggregates = trial_aggregates.merge(question_counts, on="subject", how="left")
+        trial_aggregates["n_question_correct"] = trial_aggregates["n_question_correct"].astype("Int64")
+    else:
+        trial_aggregates["n_question_correct"] = pd.Series(0, index=trial_aggregates.index, dtype="Int64")
+
     trial_aggregates = trial_aggregates.merge(
         trials.groupby("subject")["trial_id"].count().reset_index(name="ntrial"), on="subject"
     )
@@ -1371,7 +1416,8 @@ def aggregate_subjects(trials, algo_choices):
         ]
         if c in trials.columns
     ]
-    trial_aggregates_temp = trials.groupby("subject")[cols_to_do].mean().round(3).reset_index()
-    trial_aggregates = pd.merge(trial_aggregates, trial_aggregates_temp, how="left", on="subject")
+    if cols_to_do:
+        trial_aggregates_temp = trials.groupby("subject")[cols_to_do].mean().round(3).reset_index()
+        trial_aggregates = pd.merge(trial_aggregates, trial_aggregates_temp, how="left", on="subject")
 
     return trial_aggregates
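Note: the new `question_correct` handling defends against mixed string/boolean/missing values coming out of combined ASC files. A toy demonstration of the same coercion path (made-up inputs):

    import pandas as pd

    qc = pd.Series(["True", "false", "", None, "TRUE"]).replace(
        {"True": True, "TRUE": True, "false": False, "": pd.NA, None: pd.NA}
    )
    qc_int = qc.astype("boolean").astype("Int64")
    print(qc_int.sum(min_count=0))  # 2: missing answers contribute nothing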
utils.py CHANGED
@@ -1,6 +1,7 @@
 import pickle
 from io import StringIO
 import re
+import traceback
 from typing import Dict, List
 import zipfile
 import os
@@ -199,6 +200,9 @@ def calc_xdiff_ydiff(line_xcoords_no_pad, line_ycoords_no_pad, line_heights, all
 
 def add_words(chars_list):
 
+    if not chars_list:
+        return [], []
+
     chars_list_reconstructed = []
     words_list = []
     sentence_list = []
@@ -222,8 +226,8 @@ def add_words(chars_list):
         next_c = chars_list[idx+1]["char"] if idx+1 < len(chars_list) else None
         if prev_c and prev_c.isdigit() and next_c and next_c.isdigit():
             return False
-        # Optionally: only treat as boundary if followed by space or end
-        if next_c and next_c not in [None, " ", "\n"]:
+        # Treat as boundary if followed by space, end, or an uppercase letter (for no-space cases like "one.Two")
+        if next_c and next_c not in [None, " ", "\n"] and not (next_c.isupper() and next_c.isalpha()):
             return False
         return True
     for idx, char_dict in enumerate(chars_list):
@@ -253,8 +257,8 @@ def add_words(chars_list):
         word_xmax = chars_list_reconstructed[-1]["char_xmax"]
         word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
         word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
-        word_x_center = round((word_xmax - word_xmin) / 2 + word_xmin, ndigits=2)
-        word_y_center = chars_list_reconstructed[word_start_idx]["char_y_center"]
+        word_x_center = int(round((word_xmax - word_xmin) / 2 + word_xmin))
+        word_y_center = int(round((word_ymax - word_ymin) / 2 + word_ymin))
         word_length = len(word)
         assigned_line = int(chars_list_reconstructed[word_start_idx]["assigned_line"])
         word_dict = dict(
@@ -325,12 +329,12 @@ def add_words(chars_list):
             word_ymax=words_list[-1]["word_ymax"],
             assigned_line=assigned_line,
         )
-        word_x_center = round(
-            (words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"], ndigits=2
-        )
-        word_y_center = round(
-            (words_list[-1]["word_ymax"] - word_dict["word_ymin"]) / 2 + words_list[-1]["word_ymin"], ndigits=2
-        )
+        word_x_center = int(round(
+            (words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"]
+        ))
+        word_y_center = int(round(
+            (words_list[-1]["word_ymax"] - word_dict["word_ymin"]) / 2 + words_list[-1]["word_ymin"]
+        ))
         words_list[-1]["word_x_center"] = word_x_center
         words_list[-1]["word_y_center"] = word_y_center
     else:
@@ -414,14 +418,14 @@ def read_ias_file(ias_file, prefix):
     if words_include_spaces:
         ias_df[f"{prefix}_length"] = ias_df[prefix].map(lambda x: len(x) + 1)
         ias_df[f"{prefix}_width_per_length"] = ias_df[f"{prefix}_width"] / ias_df[f"{prefix}_length"]
-        ias_df[f"{prefix}_xmax"] = (ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_width_per_length"]).round(2)
+        ias_df[f"{prefix}_xmax"] = [int(x) for x in (ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_width_per_length"]).round(0)]
 
-    ias_df[f"{prefix}_x_center"] = (
+    ias_df[f"{prefix}_x_center"] = [int(x) for x in (
         (ias_df[f"{prefix}_xmax"] - ias_df[f"{prefix}_xmin"]) / 2 + ias_df[f"{prefix}_xmin"]
-    ).round(2)
-    ias_df[f"{prefix}_y_center"] = (
+    ).round(0)]
+    ias_df[f"{prefix}_y_center"] = [int(x) for x in (
         (ias_df[f"{prefix}_ymax"] - ias_df[f"{prefix}_ymin"]) / 2 + ias_df[f"{prefix}_ymin"]
-    ).round(2)
+    ).round(0)]
     unique_midlines = list(np.unique(ias_df[f"{prefix}_y_center"]))
     assigned_lines = [unique_midlines.index(x) for x in ias_df[f"{prefix}_y_center"]]
     ias_df["assigned_line"] = assigned_lines
@@ -440,41 +444,46 @@ def get_chars_list_from_words_list(ias_df, prefix="word"):
             char_dict = dict(
                 in_word_number=idx,
                 in_word=word,
-                char_xmin=round(row[f"{prefix}_xmin"] + i_w * letter_width, 2),
-                char_xmax=round(row[f"{prefix}_xmin"] + (i_w + 1) * letter_width, 2),
+                char_xmin=round(row[f"{prefix}_xmin"] + i_w * letter_width, 0),
+                char_xmax=round(row[f"{prefix}_xmin"] + (i_w + 1) * letter_width, 0),
                 char_ymin=row[f"{prefix}_ymin"],
                 char_ymax=row[f"{prefix}_ymax"],
                 char=letter,
             )
 
-            char_dict["char_x_center"] = round(
-                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"], ndigits=2
-            )
-            char_dict["char_y_center"] = round(
-                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"], ndigits=2
-            )
+            char_dict["char_x_center"] = int(round(
+                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
+            ))
+            char_dict["char_y_center"] = int(round(
+                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"]
+            ))
 
             if i_w >= len(word) + 1:
                 break
-            char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            try:
+                char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            except Exception as e:
+                ic(f"Error assigning line for char: {e}\n{traceback.format_exc()}")
             chars_list.append(char_dict)
         if chars_list[-1]["char"] != " " and row.assigned_line == next_row.assigned_line:
             char_dict = dict(
                 char_xmin=chars_list[-1]["char_xmax"],
-                char_xmax=round(chars_list[-1]["char_xmax"] + letter_width, 2),
+                char_xmax=round(chars_list[-1]["char_xmax"] + letter_width, 0),
                 char_ymin=row[f"{prefix}_ymin"],
                 char_ymax=row[f"{prefix}_ymax"],
                 char=" ",
            )
 
-            char_dict["char_x_center"] = round(
-                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"], ndigits=2
-            )
-            char_dict["char_y_center"] = round(
-                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"], ndigits=2
-            )
-
-            char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            char_dict["char_x_center"] = int(round(
+                (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
+            ))
+            char_dict["char_y_center"] = int(round(
+                (row[f"{prefix}_ymax"] - row[f"{prefix}_ymin"]) / 2 + row[f"{prefix}_ymin"]
+            ))
+            try:
+                char_dict["assigned_line"] = unique_midlines.index(char_dict["char_y_center"])
+            except Exception as e:
+                ic(f"Error assigning line for space char: {e}\n{traceback.format_exc()}")
             chars_list.append(char_dict)
     chars_df = pd.DataFrame(chars_list)
     chars_df.loc[:, ["in_word_number", "in_word"]] = chars_df.loc[:, ["in_word_number", "in_word"]].copy().ffill(axis=0)
@@ -736,10 +745,10 @@ def asc_lines_to_trials_by_trail_id(
     if "question_correct" in trials_df.columns:
         paragraph_trials_df = trials_df.loc[trials_df.trial_is == "paragraph", :]
         overall_question_answer_value_counts = (
-            paragraph_trials_df["question_correct"].dropna().astype(int).value_counts().to_dict()
+            paragraph_trials_df["question_correct"].dropna().map(int).value_counts().to_dict()
         )
         overall_question_answer_value_counts_normed = (
-            paragraph_trials_df["question_correct"].dropna().astype(int).value_counts(normalize=True).to_dict()
+            paragraph_trials_df["question_correct"].dropna().map(int).value_counts(normalize=True).to_dict()
        )
     else:
         overall_question_answer_value_counts = None
@@ -841,12 +850,12 @@ def asc_lines_to_trials_by_trail_id(
                     "char_xmax": float(parts[rg_idx + 6 + idx_correction]),
                     "char_ymax": float(parts[rg_idx + 7 + idx_correction]),
                 }
-                char_dict["char_y_center"] = round(
-                    (char_dict["char_ymax"] - char_dict["char_ymin"]) / 2 + char_dict["char_ymin"], ndigits=2
-                )
-                char_dict["char_x_center"] = round(
-                    (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"], ndigits=2
-                )
+                char_dict["char_y_center"] = int(round(
+                    (char_dict["char_ymax"] - char_dict["char_ymin"]) / 2 + char_dict["char_ymin"]
+                ))
+                char_dict["char_x_center"] = int(round(
+                    (char_dict["char_xmax"] - char_dict["char_xmin"]) / 2 + char_dict["char_xmin"]
+                ))
                 chars_list.append(char_dict)
             except Exception as e:
                 ic(f"char_dict creation failed for parts {parts}")
@@ -884,7 +893,7 @@ def asc_lines_to_trials_by_trail_id(
             for idx in range(len(chars_list)):
                 chars_list[idx]["char_y_center"] = round(
                     (chars_list[idx]["char_ymax"] - chars_list[idx]["char_ymin"]) / 2 + chars_list[idx]["char_ymin"],
-                    ndigits=2,
+                    ndigits=0,
                )
                 if chars_list[idx]["char_y_center"] not in line_ycoords:
                     line_ycoords.append(chars_list[idx]["char_y_center"])
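Note: the adjusted boundary rule in add_words now also splits words at a period directly followed by an uppercase letter (for no-space cases like "one.Two") while keeping decimal numbers such as "3.14" intact. A simplified standalone sketch of the rule (hypothetical helper operating on a plain character list instead of the repo's char dicts):

    def is_boundary_like(chars, idx):
        prev_c = chars[idx - 1] if idx > 0 else None
        next_c = chars[idx + 1] if idx + 1 < len(chars) else None
        if prev_c and prev_c.isdigit() and next_c and next_c.isdigit():
            return False  # decimal point inside a number
        if next_c and next_c not in [None, " ", "\n"] and not (next_c.isupper() and next_c.isalpha()):
            return False  # mid-word period continues the word
        return True

    print(is_boundary_like(list("one.Two"), 3))  # True: uppercase follows the period
    print(is_boundary_like(list("3.14"), 1))     # False: digits on both sides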