hugpv commited on
Commit
7af3e73
·
1 Parent(s): 5f3a1bb

Synced with dev version; mainly sentence-measure related changes

Browse files
Files changed (5) hide show
  1. app.py +69 -41
  2. multi_proc_funcs.py +8 -3
  3. popEye_funcs.py +6 -2
  4. process_asc_files_in_multi_p.py +18 -0
  5. utils.py +45 -19
app.py CHANGED
@@ -1483,13 +1483,13 @@ def main():
1483
  events_df_expander_single.markdown("### Fixations")
1484
  events_df_expander_single.dataframe(
1485
  events_df[events_df["msg"] == "FIX"].dropna(how="all", axis=1).copy(),
1486
- use_container_width=True,
1487
  height=200,
1488
  )
1489
  events_df_expander_single.markdown("### Saccades")
1490
  events_df_expander_single.dataframe(
1491
  events_df[events_df["msg"] == "SAC"].dropna(how="all", axis=1).copy(),
1492
- use_container_width=True,
1493
  height=200,
1494
  )
1495
  if not events_df[events_df["msg"] == "BLINK"].empty:
@@ -1498,7 +1498,7 @@ def main():
1498
  blinksdf = blinksdf.drop(
1499
  columns=[c for c in blinksdf.columns if c in ["blink", "blink_after", "blink_before"]]
1500
  )
1501
- events_df_expander_single.dataframe(blinksdf, use_container_width=True, height=200)
1502
  show_cleaning_options(single_file_tab_asc_tab, events_df[events_df["msg"] == "FIX"], "single_asc")
1503
 
1504
  if "dffix_cleaned_single_asc" in st.session_state and "trial_single_asc" in st.session_state:
@@ -1589,12 +1589,12 @@ def main():
1589
  chars_colnames_markdown = read_chars_col_names()
1590
  st.markdown(chars_colnames_markdown)
1591
  df_stim_expander_single.dataframe(
1592
- pd.DataFrame(trial["chars_list"]), use_container_width=True, height=200
1593
  )
1594
  if "words_list" in trial:
1595
  df_stim_expander_single.markdown("### Words dataframe")
1596
  df_stim_expander_single.dataframe(
1597
- pd.DataFrame(trial["words_list"]), use_container_width=True, height=200
1598
  )
1599
  else:
1600
  st.warning("🚨 No stimulus information in session state")
@@ -1682,7 +1682,7 @@ def main():
1682
  font=selected_plotting_font_single_asc,
1683
  lines_in_plot=lines_in_plot_single_asc,
1684
  ),
1685
- use_container_width=False,
1686
  )
1687
  plot_expander_single.markdown("#### Saccades")
1688
 
@@ -1712,11 +1712,11 @@ def main():
1712
  font=selected_plotting_font_single_asc,
1713
  lines_in_plot=lines_in_plot_single_asc,
1714
  ),
1715
- use_container_width=False,
1716
  )
1717
  plot_expander_single.markdown("#### Y-coordinate correction due to line-assignment")
1718
  plot_expander_single.plotly_chart(
1719
- plot_y_corr(dffix, st.session_state["algo_choice_single_asc"]), use_container_width=True
1720
  )
1721
  if "average_y_corrections" in trial:
1722
  plot_expander_single.markdown(
@@ -1822,7 +1822,7 @@ def main():
1822
  with open("word_measures.md", "r") as f:
1823
  word_measure_colnames_markdown = "\n".join(f.readlines())
1824
  st.markdown(word_measure_colnames_markdown)
1825
- st.dataframe(own_word_measures, use_container_width=True, hide_index=True, height=200)
1826
  own_word_measures_csv = convert_df(own_word_measures)
1827
  subject = st.session_state["trial_single_asc"]["subject"]
1828
  trial_id = st.session_state["trial_single_asc"]["trial_id"]
@@ -1868,7 +1868,7 @@ def main():
1868
  st.markdown(sentence_measure_colnames_markdown)
1869
  st.dataframe(
1870
  st.session_state["own_sent_measures_single_asc"],
1871
- use_container_width=True,
1872
  hide_index=True,
1873
  height=200,
1874
  )
@@ -1951,7 +1951,7 @@ def main():
1951
  get_char_measures=False,
1952
  )
1953
 
1954
- st.dataframe(word_measures_df, use_container_width=True, hide_index=True, height=200)
1955
  word_measures_df_csv = convert_df(word_measures_df)
1956
 
1957
  st.download_button(
@@ -1975,7 +1975,7 @@ def main():
1975
 
1976
  if character_measures_df is not None:
1977
  st.dataframe(
1978
- character_measures_df, use_container_width=True, hide_index=True, height=200
1979
  )
1980
  else:
1981
  single_file_tab_asc_tab.warning("🚨 Stimulus information needed for analysis 🚨")
@@ -2006,7 +2006,7 @@ def main():
2006
  accept_multiple_files=False,
2007
  key="single_csv_file_stim_uploaded",
2008
  type={"json", "csv", "txt", "dat","jpeg","png"},
2009
- help="Drag and drop or select a single .json, .csv, .txt, .dat, jpeg or png file that you wish to process as the stimulus file for the uploaded fixation data. If an image is uploaded OCR will be attempted to extract the character bounding boxes. This can be left blank if you chose to use the examples.",
2010
  )
2011
 
2012
  st.checkbox(
@@ -2043,7 +2043,19 @@ def main():
2043
  ]:
2044
  if k in st.session_state:
2045
  del st.session_state[k]
2046
-
 
 
 
 
 
 
 
 
 
 
 
 
2047
  if use_example_or_uploaded_file_choice != "Example Files":
2048
  st.session_state["dffix_single_csv"] = load_csv_delim_agnostic(single_csv_file)
2049
  st.session_state["dffix_col_mappings_guess_single_csv"] = find_col_name_suggestions(
@@ -2063,6 +2075,8 @@ def main():
2063
  elif any([".png" in single_csv_stim_file.name, ".jpeg" in single_csv_stim_file.name]):
2064
  stimdf_single_csv = recognize_text(single_csv_stim_file)
2065
  stimdf_single_csv.to_csv(RESULTS_FOLDER / f"{single_csv_stim_file.name}_stimdf_single_from_OCR.csv")
 
 
2066
  st.session_state["stimdf_single_csv"] = stimdf_single_csv
2067
  colnames_stim = st.session_state["stimdf_single_csv"].columns
2068
  else:
@@ -2071,6 +2085,10 @@ def main():
2071
  st.session_state["chars_df_col_mappings_guess_single_csv"] = find_col_name_suggestions(
2072
  list(colnames_stim), COLNAMES_CUSTOM_CSV_STIM
2073
  )
 
 
 
 
2074
  else:
2075
  with open(EXAMPLE_CUSTOM_JSON_FILE, "r") as json_file:
2076
  json_string = json_file.read()
@@ -2086,7 +2104,7 @@ def main():
2086
  if in_st_nn("dffix_single_csv"):
2087
  st.dataframe(
2088
  st.session_state["dffix_single_csv"],
2089
- use_container_width=True,
2090
  hide_index=True,
2091
  on_select="ignore",
2092
  height=200,
@@ -2097,7 +2115,7 @@ def main():
2097
  else:
2098
  st.dataframe(
2099
  st.session_state["stimdf_single_csv"],
2100
- use_container_width=True,
2101
  hide_index=True,
2102
  on_select="ignore",
2103
  height=200,
@@ -2314,7 +2332,7 @@ def main():
2314
  help="This downloads the corrected fixations dataframe as a .csv file with the filename containing the trial id.",
2315
  )
2316
  with single_file_tab_csv_tab.expander("Show corrected fixation data", expanded=True):
2317
- st.dataframe(dffix, use_container_width=True, hide_index=True, height=200)
2318
  with single_file_tab_csv_tab.expander("Show fixation plots", expanded=True):
2319
 
2320
  plotting_checkboxes_single_single_csv = st.multiselect(
@@ -2332,9 +2350,9 @@ def main():
2332
  to_plot_list=plotting_checkboxes_single_single_csv,
2333
  algo_choice=st.session_state["algo_choice_single_csv"],
2334
  ),
2335
- use_container_width=True,
2336
  )
2337
- st.plotly_chart(plot_y_corr(dffix, st.session_state["algo_choice_single_csv"]), use_container_width=True)
2338
  plotlist = [x for x in dffix.columns if "Unnamed" not in str(x)]
2339
  plot_choice = st.multiselect(
2340
  "Which measures should be visualized?",
@@ -2342,7 +2360,7 @@ def main():
2342
  key="plot_choice_fix_measure",
2343
  default=plotlist[-1],
2344
  )
2345
- st.plotly_chart(plot_fix_measure(dffix, plot_choice, "Index"), use_container_width=True)
2346
 
2347
  if "chars_list" in trial:
2348
  analysis_expander_custom = single_file_tab_csv_tab.expander("Show Analysis results", True)
@@ -2419,7 +2437,7 @@ def main():
2419
  fixations_tuples, textblock_input_dict, trial=trial, get_char_measures=False
2420
  )
2421
 
2422
- st.dataframe(word_measures_df, use_container_width=True, hide_index=True, height=200)
2423
  word_measures_df_csv = convert_df(word_measures_df)
2424
 
2425
  st.download_button(
@@ -2435,7 +2453,7 @@ def main():
2435
  st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
2436
 
2437
  if character_measures_df is not None:
2438
- st.dataframe(character_measures_df, use_container_width=True, hide_index=True, height=200)
2439
 
2440
  with own_analysis_tab_custom:
2441
  st.markdown(
@@ -2450,9 +2468,19 @@ def main():
2450
  save_to_csv=True,
2451
  measures_to_calculate = ALL_MEASURES_OWN
2452
  )
2453
- st.dataframe(own_word_measures, use_container_width=True, hide_index=True, height=200)
2454
  own_word_measures_csv = convert_df(own_word_measures)
2455
 
 
 
 
 
 
 
 
 
 
 
2456
  st.download_button(
2457
  "⏬ Download word measures data",
2458
  own_word_measures_csv,
@@ -2803,14 +2831,14 @@ def main():
2803
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2804
  item_colnames_markdown = read_item_col_names()
2805
  st.markdown(item_colnames_markdown)
2806
- multi_file_tab.dataframe(st.session_state["trials_df"], use_container_width=True, height=200)
2807
  if in_st_nn("subjects_summary_df_multi_asc"):
2808
  multi_file_tab.markdown("### Subject level summary statistics")
2809
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2810
  subject_measure_colnames_markdown = read_subject_meas_col_names()
2811
  st.markdown(subject_measure_colnames_markdown)
2812
  multi_file_tab.dataframe(
2813
- st.session_state["subjects_summary_df_multi_asc"], use_container_width=True, height=200
2814
  )
2815
  if in_st_nn("trials_summary_df_multi_asc"):
2816
  multi_file_tab.markdown("### Trial level summary statistics")
@@ -2818,14 +2846,14 @@ def main():
2818
  trials_colnames_markdown = read_trial_col_names()
2819
  st.markdown(trials_colnames_markdown)
2820
  multi_file_tab.dataframe(
2821
- st.session_state["trials_summary_df_multi_asc"], use_container_width=True, height=200
2822
  )
2823
 
2824
  multi_file_tab.markdown("### Combined fixations dataframe and fixation level features")
2825
  with multi_file_tab.popover("Column name definitions"):
2826
  fix_colnames_markdown = get_fix_colnames_markdown()
2827
  st.markdown(fix_colnames_markdown)
2828
- multi_file_tab.dataframe(st.session_state["all_fix_dfs_concat_multi_asc"], use_container_width=True, height=200)
2829
 
2830
  high_fix_count_dfs = []
2831
  for algo_choice in st.session_state["algo_choice_multi_asc"]:
@@ -2855,26 +2883,26 @@ def main():
2855
  multi_file_tab.markdown(
2856
  "### Words that had a large number of fixations assigned to them and may need to be investigated"
2857
  )
2858
- multi_file_tab.dataframe(high_fix_count_dfs_cat, use_container_width=True, height=200)
2859
  subs_str = "-".join([s for s in st.session_state["all_trials_by_subj"].keys()])
2860
  high_fix_count_dfs_cat.to_csv(RESULTS_FOLDER / f"{subs_str}_words_with_many_fixations.csv")
2861
 
2862
  if "all_correction_stats" in st.session_state:
2863
  multi_file_tab.markdown("### Correction statistics")
2864
- multi_file_tab.dataframe(st.session_state["all_correction_stats"], use_container_width=True, height=200)
2865
  multi_file_tab.markdown("### Combined saccades dataframe and saccade level features")
2866
  with multi_file_tab.popover("Column name definitions"):
2867
  sac_colnames_markdown = get_sac_colnames_markdown()
2868
  st.markdown(sac_colnames_markdown)
2869
  multi_file_tab.dataframe(
2870
- st.session_state["all_sacc_dfs_concat_multi_asc"], use_container_width=True, height=200
2871
  )
2872
  multi_file_tab.markdown("### Combined characters dataframe")
2873
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2874
  chars_colnames_markdown = read_chars_col_names()
2875
  st.markdown(chars_colnames_markdown)
2876
  multi_file_tab.dataframe(
2877
- st.session_state["all_chars_dfs_concat_multi_asc"], use_container_width=True, height=200
2878
  )
2879
 
2880
  if not st.session_state["all_own_word_measures_concat"].empty:
@@ -2883,7 +2911,7 @@ def main():
2883
  word_measure_colnames_markdown = read_word_meas_col_names()
2884
  st.markdown(word_measure_colnames_markdown)
2885
  multi_file_tab.dataframe(
2886
- st.session_state["all_own_word_measures_concat"], use_container_width=True, height=200
2887
  )
2888
  if not st.session_state["all_sentence_dfs_concat_multi_asc"].empty:
2889
  multi_file_tab.markdown("### Combined sentence dataframe and sentence level features")
@@ -2891,7 +2919,7 @@ def main():
2891
  sentence_measure_colnames_markdown = read_sent_meas_col_names()
2892
  st.markdown(sentence_measure_colnames_markdown)
2893
  multi_file_tab.dataframe(
2894
- st.session_state["all_sentence_dfs_concat_multi_asc"], use_container_width=True, height=200
2895
  )
2896
  if "zipfiles_with_results" in st.session_state:
2897
  multi_res_col1, multi_res_col2 = multi_file_tab.columns(2)
@@ -2985,10 +3013,10 @@ def main():
2985
  to_plot_list=plotting_checkboxes_multi,
2986
  font=selecte_plotting_font_multi_asc,
2987
  ),
2988
- use_container_width=True,
2989
  )
2990
  plot_expander_multi.plotly_chart(
2991
- plot_y_corr(dffix_multi, st.session_state["algo_choice_multi_asc"]), use_container_width=True
2992
  )
2993
 
2994
  select_and_show_fix_sacc_feature_plots(
@@ -3060,7 +3088,7 @@ def main():
3060
  fixations_tuples, textblock_input_dict, trial=trial_multi, get_char_measures=False
3061
  )
3062
 
3063
- st.dataframe(word_measures_df, use_container_width=True, hide_index=True, height=200)
3064
  word_measures_df_csv = convert_df(word_measures_df)
3065
 
3066
  st.download_button(
@@ -3082,7 +3110,7 @@ def main():
3082
  st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
3083
 
3084
  if character_measures_df is not None:
3085
- st.dataframe(character_measures_df, use_container_width=True, hide_index=True, height=200)
3086
 
3087
  with own_analysis_tab:
3088
  st.markdown(
@@ -3114,7 +3142,7 @@ def main():
3114
  own_word_measures = reorder_columns(own_word_measures)
3115
  if "question_correct" in own_word_measures.columns:
3116
  own_word_measures = own_word_measures.drop(columns=["question_correct"])
3117
- st.dataframe(own_word_measures, use_container_width=True, hide_index=True, height=200)
3118
  own_word_measures_csv = convert_df(own_word_measures)
3119
  st.download_button(
3120
  "⏬ Download word measures data",
@@ -3142,7 +3170,7 @@ def main():
3142
  )
3143
  st.pyplot(own_word_measures_fig)
3144
  st.markdown("Sentence measures")
3145
- st.dataframe(sent_measures_multi, use_container_width=True, hide_index=True, height=200)
3146
 
3147
  else:
3148
  multi_file_tab.warning("🚨 Stimulus information needed for analysis 🚨")
@@ -3685,7 +3713,7 @@ def select_and_show_fix_sacc_feature_plots(
3685
  x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
3686
  label_start="Fixation",
3687
  ),
3688
- use_container_width=True,
3689
  )
3690
  sacc_feature_plot_col_single_asc.plotly_chart(
3691
  plot_fix_measure(
@@ -3694,7 +3722,7 @@ def select_and_show_fix_sacc_feature_plots(
3694
  x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
3695
  label_start="Saccade",
3696
  ),
3697
- use_container_width=True,
3698
  )
3699
 
3700
 
 
1483
  events_df_expander_single.markdown("### Fixations")
1484
  events_df_expander_single.dataframe(
1485
  events_df[events_df["msg"] == "FIX"].dropna(how="all", axis=1).copy(),
1486
+ width='stretch',
1487
  height=200,
1488
  )
1489
  events_df_expander_single.markdown("### Saccades")
1490
  events_df_expander_single.dataframe(
1491
  events_df[events_df["msg"] == "SAC"].dropna(how="all", axis=1).copy(),
1492
+ width='stretch',
1493
  height=200,
1494
  )
1495
  if not events_df[events_df["msg"] == "BLINK"].empty:
 
1498
  blinksdf = blinksdf.drop(
1499
  columns=[c for c in blinksdf.columns if c in ["blink", "blink_after", "blink_before"]]
1500
  )
1501
+ events_df_expander_single.dataframe(blinksdf, width='stretch', height=200)
1502
  show_cleaning_options(single_file_tab_asc_tab, events_df[events_df["msg"] == "FIX"], "single_asc")
1503
 
1504
  if "dffix_cleaned_single_asc" in st.session_state and "trial_single_asc" in st.session_state:
 
1589
  chars_colnames_markdown = read_chars_col_names()
1590
  st.markdown(chars_colnames_markdown)
1591
  df_stim_expander_single.dataframe(
1592
+ pd.DataFrame(trial["chars_list"]), width='stretch', height=200
1593
  )
1594
  if "words_list" in trial:
1595
  df_stim_expander_single.markdown("### Words dataframe")
1596
  df_stim_expander_single.dataframe(
1597
+ pd.DataFrame(trial["words_list"]), width='stretch', height=200
1598
  )
1599
  else:
1600
  st.warning("🚨 No stimulus information in session state")
 
1682
  font=selected_plotting_font_single_asc,
1683
  lines_in_plot=lines_in_plot_single_asc,
1684
  ),
1685
+ width='content',
1686
  )
1687
  plot_expander_single.markdown("#### Saccades")
1688
 
 
1712
  font=selected_plotting_font_single_asc,
1713
  lines_in_plot=lines_in_plot_single_asc,
1714
  ),
1715
+ width='content',
1716
  )
1717
  plot_expander_single.markdown("#### Y-coordinate correction due to line-assignment")
1718
  plot_expander_single.plotly_chart(
1719
+ plot_y_corr(dffix, st.session_state["algo_choice_single_asc"]), width='stretch'
1720
  )
1721
  if "average_y_corrections" in trial:
1722
  plot_expander_single.markdown(
 
1822
  with open("word_measures.md", "r") as f:
1823
  word_measure_colnames_markdown = "\n".join(f.readlines())
1824
  st.markdown(word_measure_colnames_markdown)
1825
+ st.dataframe(own_word_measures, width='stretch', hide_index=True, height=200)
1826
  own_word_measures_csv = convert_df(own_word_measures)
1827
  subject = st.session_state["trial_single_asc"]["subject"]
1828
  trial_id = st.session_state["trial_single_asc"]["trial_id"]
 
1868
  st.markdown(sentence_measure_colnames_markdown)
1869
  st.dataframe(
1870
  st.session_state["own_sent_measures_single_asc"],
1871
+ width='stretch',
1872
  hide_index=True,
1873
  height=200,
1874
  )
 
1951
  get_char_measures=False,
1952
  )
1953
 
1954
+ st.dataframe(word_measures_df, width='stretch', hide_index=True, height=200)
1955
  word_measures_df_csv = convert_df(word_measures_df)
1956
 
1957
  st.download_button(
 
1975
 
1976
  if character_measures_df is not None:
1977
  st.dataframe(
1978
+ character_measures_df, width='stretch', hide_index=True, height=200
1979
  )
1980
  else:
1981
  single_file_tab_asc_tab.warning("🚨 Stimulus information needed for analysis 🚨")
 
2006
  accept_multiple_files=False,
2007
  key="single_csv_file_stim_uploaded",
2008
  type={"json", "csv", "txt", "dat","jpeg","png"},
2009
+ help="Drag and drop or select a single .json, .csv, .txt, .dat, jpeg or png file that you wish to process as the stimulus file for the uploaded fixation data. If an image is uploaded OCR will be attempted to extract the character bounding boxes, note the filename MUST match the trial name for images(so for trial t1 the image MUST be called t1.png). This can be left blank if you chose to use the examples.",
2010
  )
2011
 
2012
  st.checkbox(
 
2043
  ]:
2044
  if k in st.session_state:
2045
  del st.session_state[k]
2046
+ def stringify_column_keep_decimal_if_needed(series):
2047
+ try:
2048
+ # Try to convert all values to float, then int
2049
+ floats = series.astype(float)
2050
+ ints = floats.astype(int)
2051
+ # If all values are equal as int and float, drop decimals
2052
+ if (floats == ints).all():
2053
+ return ints.astype(str)
2054
+ else:
2055
+ return floats.astype(str)
2056
+ except Exception:
2057
+ # If conversion fails, fallback to original as string
2058
+ return series.astype(str)
2059
  if use_example_or_uploaded_file_choice != "Example Files":
2060
  st.session_state["dffix_single_csv"] = load_csv_delim_agnostic(single_csv_file)
2061
  st.session_state["dffix_col_mappings_guess_single_csv"] = find_col_name_suggestions(
 
2075
  elif any([".png" in single_csv_stim_file.name, ".jpeg" in single_csv_stim_file.name]):
2076
  stimdf_single_csv = recognize_text(single_csv_stim_file)
2077
  stimdf_single_csv.to_csv(RESULTS_FOLDER / f"{single_csv_stim_file.name}_stimdf_single_from_OCR.csv")
2078
+ if 'trial_id' in stimdf_single_csv.columns:
2079
+ stimdf_single_csv['trial_id'] = stimdf_single_csv['trial_id'].astype(str)
2080
  st.session_state["stimdf_single_csv"] = stimdf_single_csv
2081
  colnames_stim = st.session_state["stimdf_single_csv"].columns
2082
  else:
 
2085
  st.session_state["chars_df_col_mappings_guess_single_csv"] = find_col_name_suggestions(
2086
  list(colnames_stim), COLNAMES_CUSTOM_CSV_STIM
2087
  )
2088
+ for colname in COLNAME_CANDIDATES_CUSTOM_CSV_FIX['trial_id_col_name_fix']:
2089
+ if colname in st.session_state["dffix_single_csv"].columns:
2090
+ st.session_state["dffix_single_csv"][colname] = stringify_column_keep_decimal_if_needed(st.session_state["dffix_single_csv"][colname])
2091
+
2092
  else:
2093
  with open(EXAMPLE_CUSTOM_JSON_FILE, "r") as json_file:
2094
  json_string = json_file.read()
 
2104
  if in_st_nn("dffix_single_csv"):
2105
  st.dataframe(
2106
  st.session_state["dffix_single_csv"],
2107
+ width='stretch',
2108
  hide_index=True,
2109
  on_select="ignore",
2110
  height=200,
 
2115
  else:
2116
  st.dataframe(
2117
  st.session_state["stimdf_single_csv"],
2118
+ width='stretch',
2119
  hide_index=True,
2120
  on_select="ignore",
2121
  height=200,
 
2332
  help="This downloads the corrected fixations dataframe as a .csv file with the filename containing the trial id.",
2333
  )
2334
  with single_file_tab_csv_tab.expander("Show corrected fixation data", expanded=True):
2335
+ st.dataframe(dffix, width='stretch', hide_index=True, height=200)
2336
  with single_file_tab_csv_tab.expander("Show fixation plots", expanded=True):
2337
 
2338
  plotting_checkboxes_single_single_csv = st.multiselect(
 
2350
  to_plot_list=plotting_checkboxes_single_single_csv,
2351
  algo_choice=st.session_state["algo_choice_single_csv"],
2352
  ),
2353
+ width='stretch',
2354
  )
2355
+ st.plotly_chart(plot_y_corr(dffix, st.session_state["algo_choice_single_csv"]), width='stretch')
2356
  plotlist = [x for x in dffix.columns if "Unnamed" not in str(x)]
2357
  plot_choice = st.multiselect(
2358
  "Which measures should be visualized?",
 
2360
  key="plot_choice_fix_measure",
2361
  default=plotlist[-1],
2362
  )
2363
+ st.plotly_chart(plot_fix_measure(dffix, plot_choice, "Index"), width='stretch')
2364
 
2365
  if "chars_list" in trial:
2366
  analysis_expander_custom = single_file_tab_csv_tab.expander("Show Analysis results", True)
 
2437
  fixations_tuples, textblock_input_dict, trial=trial, get_char_measures=False
2438
  )
2439
 
2440
+ st.dataframe(word_measures_df, width='stretch', hide_index=True, height=200)
2441
  word_measures_df_csv = convert_df(word_measures_df)
2442
 
2443
  st.download_button(
 
2453
  st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
2454
 
2455
  if character_measures_df is not None:
2456
+ st.dataframe(character_measures_df, width='stretch', hide_index=True, height=200)
2457
 
2458
  with own_analysis_tab_custom:
2459
  st.markdown(
 
2468
  save_to_csv=True,
2469
  measures_to_calculate = ALL_MEASURES_OWN
2470
  )
2471
+ st.dataframe(own_word_measures, width='stretch', hide_index=True, height=200)
2472
  own_word_measures_csv = convert_df(own_word_measures)
2473
 
2474
+ sent_measures_single_csv = compute_sentence_measures(
2475
+ dffix,
2476
+ pd.DataFrame(trial["chars_df"]),
2477
+ st.session_state["algo_choice_custom_eyekit"],
2478
+ ALL_SENT_MEASURES,
2479
+ save_to_csv=True,
2480
+ )
2481
+ st.session_state["own_sent_measures_single_csv"] = sent_measures_single_csv
2482
+ st.dataframe(sent_measures_single_csv, width='stretch', hide_index=True, height=200)
2483
+
2484
  st.download_button(
2485
  "⏬ Download word measures data",
2486
  own_word_measures_csv,
 
2831
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2832
  item_colnames_markdown = read_item_col_names()
2833
  st.markdown(item_colnames_markdown)
2834
+ multi_file_tab.dataframe(st.session_state["trials_df"], width='stretch', height=200)
2835
  if in_st_nn("subjects_summary_df_multi_asc"):
2836
  multi_file_tab.markdown("### Subject level summary statistics")
2837
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2838
  subject_measure_colnames_markdown = read_subject_meas_col_names()
2839
  st.markdown(subject_measure_colnames_markdown)
2840
  multi_file_tab.dataframe(
2841
+ st.session_state["subjects_summary_df_multi_asc"], width='stretch', height=200
2842
  )
2843
  if in_st_nn("trials_summary_df_multi_asc"):
2844
  multi_file_tab.markdown("### Trial level summary statistics")
 
2846
  trials_colnames_markdown = read_trial_col_names()
2847
  st.markdown(trials_colnames_markdown)
2848
  multi_file_tab.dataframe(
2849
+ st.session_state["trials_summary_df_multi_asc"], width='stretch', height=200
2850
  )
2851
 
2852
  multi_file_tab.markdown("### Combined fixations dataframe and fixation level features")
2853
  with multi_file_tab.popover("Column name definitions"):
2854
  fix_colnames_markdown = get_fix_colnames_markdown()
2855
  st.markdown(fix_colnames_markdown)
2856
+ multi_file_tab.dataframe(st.session_state["all_fix_dfs_concat_multi_asc"], width='stretch', height=200)
2857
 
2858
  high_fix_count_dfs = []
2859
  for algo_choice in st.session_state["algo_choice_multi_asc"]:
 
2883
  multi_file_tab.markdown(
2884
  "### Words that had a large number of fixations assigned to them and may need to be investigated"
2885
  )
2886
+ multi_file_tab.dataframe(high_fix_count_dfs_cat, width='stretch', height=200)
2887
  subs_str = "-".join([s for s in st.session_state["all_trials_by_subj"].keys()])
2888
  high_fix_count_dfs_cat.to_csv(RESULTS_FOLDER / f"{subs_str}_words_with_many_fixations.csv")
2889
 
2890
  if "all_correction_stats" in st.session_state:
2891
  multi_file_tab.markdown("### Correction statistics")
2892
+ multi_file_tab.dataframe(st.session_state["all_correction_stats"], width='stretch', height=200)
2893
  multi_file_tab.markdown("### Combined saccades dataframe and saccade level features")
2894
  with multi_file_tab.popover("Column name definitions"):
2895
  sac_colnames_markdown = get_sac_colnames_markdown()
2896
  st.markdown(sac_colnames_markdown)
2897
  multi_file_tab.dataframe(
2898
+ st.session_state["all_sacc_dfs_concat_multi_asc"], width='stretch', height=200
2899
  )
2900
  multi_file_tab.markdown("### Combined characters dataframe")
2901
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2902
  chars_colnames_markdown = read_chars_col_names()
2903
  st.markdown(chars_colnames_markdown)
2904
  multi_file_tab.dataframe(
2905
+ st.session_state["all_chars_dfs_concat_multi_asc"], width='stretch', height=200
2906
  )
2907
 
2908
  if not st.session_state["all_own_word_measures_concat"].empty:
 
2911
  word_measure_colnames_markdown = read_word_meas_col_names()
2912
  st.markdown(word_measure_colnames_markdown)
2913
  multi_file_tab.dataframe(
2914
+ st.session_state["all_own_word_measures_concat"], width='stretch', height=200
2915
  )
2916
  if not st.session_state["all_sentence_dfs_concat_multi_asc"].empty:
2917
  multi_file_tab.markdown("### Combined sentence dataframe and sentence level features")
 
2919
  sentence_measure_colnames_markdown = read_sent_meas_col_names()
2920
  st.markdown(sentence_measure_colnames_markdown)
2921
  multi_file_tab.dataframe(
2922
+ st.session_state["all_sentence_dfs_concat_multi_asc"], width='stretch', height=200
2923
  )
2924
  if "zipfiles_with_results" in st.session_state:
2925
  multi_res_col1, multi_res_col2 = multi_file_tab.columns(2)
 
3013
  to_plot_list=plotting_checkboxes_multi,
3014
  font=selecte_plotting_font_multi_asc,
3015
  ),
3016
+ width='stretch',
3017
  )
3018
  plot_expander_multi.plotly_chart(
3019
+ plot_y_corr(dffix_multi, st.session_state["algo_choice_multi_asc"]), width='stretch'
3020
  )
3021
 
3022
  select_and_show_fix_sacc_feature_plots(
 
3088
  fixations_tuples, textblock_input_dict, trial=trial_multi, get_char_measures=False
3089
  )
3090
 
3091
+ st.dataframe(word_measures_df, width='stretch', hide_index=True, height=200)
3092
  word_measures_df_csv = convert_df(word_measures_df)
3093
 
3094
  st.download_button(
 
3110
  st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
3111
 
3112
  if character_measures_df is not None:
3113
+ st.dataframe(character_measures_df, width='stretch', hide_index=True, height=200)
3114
 
3115
  with own_analysis_tab:
3116
  st.markdown(
 
3142
  own_word_measures = reorder_columns(own_word_measures)
3143
  if "question_correct" in own_word_measures.columns:
3144
  own_word_measures = own_word_measures.drop(columns=["question_correct"])
3145
+ st.dataframe(own_word_measures, width='stretch', hide_index=True, height=200)
3146
  own_word_measures_csv = convert_df(own_word_measures)
3147
  st.download_button(
3148
  "⏬ Download word measures data",
 
3170
  )
3171
  st.pyplot(own_word_measures_fig)
3172
  st.markdown("Sentence measures")
3173
+ st.dataframe(sent_measures_multi, width='stretch', hide_index=True, height=200)
3174
 
3175
  else:
3176
  multi_file_tab.warning("🚨 Stimulus information needed for analysis 🚨")
 
3713
  x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
3714
  label_start="Fixation",
3715
  ),
3716
+ width='stretch',
3717
  )
3718
  sacc_feature_plot_col_single_asc.plotly_chart(
3719
  plot_fix_measure(
 
3722
  x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
3723
  label_start="Saccade",
3724
  ),
3725
+ width='stretch',
3726
  )
3727
 
3728
 
multi_proc_funcs.py CHANGED
@@ -267,7 +267,7 @@ def set_up_models(dist_models_folder):
267
  out_dict = {}
268
  dist_models_with_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_True*.ckpt"))
269
  dist_models_without_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_False*.ckpt"))
270
- DIST_MODEL_DATE_WITH_NORM = dist_models_with_norm[0].stem.split("_")[1]
271
 
272
  models_without_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_without_norm]
273
  models_with_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_with_norm]
@@ -277,7 +277,10 @@ def set_up_models(dist_models_folder):
277
 
278
  models_without_norm_df = [x[0] for x in models_without_norm_df if x[0] is not None]
279
  models_with_norm_df = [x[0] for x in models_with_norm_df if x[0] is not None]
280
-
 
 
 
281
  ensemble_model_avg = models.EnsembleModel(
282
  models_without_norm_df, models_with_norm_df, learning_rate=0.0058, use_simple_average=True
283
  )
@@ -286,9 +289,11 @@ def set_up_models(dist_models_folder):
286
  out_dict["model_cfg_without_norm_df"] = model_cfg_without_norm_df
287
  out_dict["model_cfg_with_norm_df"] = model_cfg_with_norm_df
288
 
289
- single_DIST_model, single_DIST_model_cfg = find_and_load_model(model_date=DIST_MODEL_DATE_WITH_NORM)
290
  out_dict["single_DIST_model"] = single_DIST_model
291
  out_dict["single_DIST_model_cfg"] = single_DIST_model_cfg
 
 
292
  return out_dict
293
 
294
 
 
267
  out_dict = {}
268
  dist_models_with_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_True*.ckpt"))
269
  dist_models_without_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_False*.ckpt"))
270
+ dist_model_date_with_norm = dist_models_with_norm[0].stem.split("_")[1]
271
 
272
  models_without_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_without_norm]
273
  models_with_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_with_norm]
 
277
 
278
  models_without_norm_df = [x[0] for x in models_without_norm_df if x[0] is not None]
279
  models_with_norm_df = [x[0] for x in models_with_norm_df if x[0] is not None]
280
+ if len(models_without_norm_df) == 0:
281
+ ic("No models without normalization found")
282
+ if len(models_with_norm_df) == 0:
283
+ ic("No models with normalization found")
284
  ensemble_model_avg = models.EnsembleModel(
285
  models_without_norm_df, models_with_norm_df, learning_rate=0.0058, use_simple_average=True
286
  )
 
289
  out_dict["model_cfg_without_norm_df"] = model_cfg_without_norm_df
290
  out_dict["model_cfg_with_norm_df"] = model_cfg_with_norm_df
291
 
292
+ single_DIST_model, single_DIST_model_cfg = find_and_load_model(model_date=dist_model_date_with_norm)
293
  out_dict["single_DIST_model"] = single_DIST_model
294
  out_dict["single_DIST_model_cfg"] = single_DIST_model_cfg
295
+ if single_DIST_model is None:
296
+ ic(f"Failed to load single DIST model for {dist_model_date_with_norm}")
297
  return out_dict
298
 
299
 
popEye_funcs.py CHANGED
@@ -2,6 +2,7 @@
2
  Mostly adapted from: https://github.com/sascha2schroeder/popEye
3
  """
4
 
 
5
  import numpy as np
6
  import pandas as pd
7
  from icecream import ic
@@ -886,11 +887,13 @@ def combine_words(fix, wordfirst, wordtmp, algo_choice, measures_to_calculate):
886
 
887
 
888
  def compute_sentence_measures(fix, stimmat, algo_choice, measures_to_calc, save_to_csv=False):
889
- sentitem = stimmat.drop_duplicates(
890
  subset="in_sentence_number", keep="first"
891
  ) # TODO check why there are rows with sent number None
892
  fixin = fix.copy().reset_index(drop=True)
893
-
 
 
894
  fixin["on_sentence_num2"] = fixin[f"on_sentence_num_{algo_choice}"].copy()
895
 
896
  # Recompute sentence number (two fixation exception rule)
@@ -1146,6 +1149,7 @@ def compute_sentence_measures(fix, stimmat, algo_choice, measures_to_calc, save_
1146
  item.rename({"in_sentence_number": f"on_sentence_num_{algo_choice}"}, axis=1),
1147
  on=f"on_sentence_num_{algo_choice}",
1148
  how="left",
 
1149
  )
1150
  sent[f"skip_{algo_choice}"] = 0
1151
  sent.loc[pd.isna(sent[f"nrun_{algo_choice}"]), f"skip_{algo_choice}"] = 1
 
2
  Mostly adapted from: https://github.com/sascha2schroeder/popEye
3
  """
4
 
5
+ from copy import deepcopy
6
  import numpy as np
7
  import pandas as pd
8
  from icecream import ic
 
887
 
888
 
889
  def compute_sentence_measures(fix, stimmat, algo_choice, measures_to_calc, save_to_csv=False):
890
+ sentitem = deepcopy(stimmat).drop_duplicates(
891
  subset="in_sentence_number", keep="first"
892
  ) # TODO check why there are rows with sent number None
893
  fixin = fix.copy().reset_index(drop=True)
894
+ unique_trial_ids = fixin['trial_id'].unique()
895
+ if len(unique_trial_ids) == 1:
896
+ sentitem.loc[:,'trial_id'] = unique_trial_ids[0]
897
  fixin["on_sentence_num2"] = fixin[f"on_sentence_num_{algo_choice}"].copy()
898
 
899
  # Recompute sentence number (two fixation exception rule)
 
1149
  item.rename({"in_sentence_number": f"on_sentence_num_{algo_choice}"}, axis=1),
1150
  on=f"on_sentence_num_{algo_choice}",
1151
  how="left",
1152
+ suffixes=['','from_item']
1153
  )
1154
  sent[f"skip_{algo_choice}"] = 0
1155
  sent.loc[pd.isna(sent[f"nrun_{algo_choice}"]), f"skip_{algo_choice}"] = 1
process_asc_files_in_multi_p.py CHANGED
@@ -1,5 +1,6 @@
1
  from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
2
  import json
 
3
  from sys import platform as _platform
4
  from functools import partial
5
  import multiprocessing
@@ -10,6 +11,17 @@ import sys
10
  import pandas as pd
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
13
  def get_cpu_count():
14
  if os.sys.platform in ("linux", "linux2", "darwin"):
15
  return os.cpu_count()
@@ -103,6 +115,7 @@ def main():
103
  models_dict,
104
  fix_cols_to_add_multi_asc,
105
  ) = json.loads(input_data)
 
106
  if (
107
  "DIST" in algo_choice
108
  or "Wisdom_of_Crowds_with_DIST" in algo_choice
@@ -133,16 +146,21 @@ def main():
133
  models_dict,
134
  fix_cols_to_add_multi_asc,
135
  )
 
136
  out2 = []
137
  for dffix, trial in out:
138
  dffix = dffix.to_dict("records")
139
  trial = make_json_compatible(trial)
140
  out2.append((dffix, trial))
141
  json_data_out = json.dumps(out2)
 
142
  sys.stdout.flush()
143
  print(json_data_out)
144
  except Exception as e:
 
145
  print(json.dumps({"error": str(e)}))
 
 
146
 
147
 
148
  if __name__ == "__main__":
 
1
  from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
2
  import json
3
+ import logging
4
  from sys import platform as _platform
5
  from functools import partial
6
  import multiprocessing
 
11
  import pandas as pd
12
 
13
 
14
+ logger = logging.getLogger('multiproc')
15
+ logger.propagate = False
16
+ logger.setLevel("DEBUG")
17
+ ch = logging.FileHandler("MULTI_LOG.log", "a")
18
+ ch.setFormatter(
19
+ logging.Formatter(
20
+ "%(asctime)s-{%(filename)s:%(lineno)d}-%(levelname)s >>> %(message)s",
21
+ "%m-%d %H:%M:%S",
22
+ )
23
+ )
24
+ logger.addHandler(ch)
25
  def get_cpu_count():
26
  if os.sys.platform in ("linux", "linux2", "darwin"):
27
  return os.cpu_count()
 
115
  models_dict,
116
  fix_cols_to_add_multi_asc,
117
  ) = json.loads(input_data)
118
+ logger.info("Finished loading from json")
119
  if (
120
  "DIST" in algo_choice
121
  or "Wisdom_of_Crowds_with_DIST" in algo_choice
 
146
  models_dict,
147
  fix_cols_to_add_multi_asc,
148
  )
149
+ logger.info("Finished process_asc_files_in_multi_proc")
150
  out2 = []
151
  for dffix, trial in out:
152
  dffix = dffix.to_dict("records")
153
  trial = make_json_compatible(trial)
154
  out2.append((dffix, trial))
155
  json_data_out = json.dumps(out2)
156
+ logger.info("Finished appending")
157
  sys.stdout.flush()
158
  print(json_data_out)
159
  except Exception as e:
160
+ logger.warning(e)
161
  print(json.dumps({"error": str(e)}))
162
+ with open("MULTI_ERROR.log",'w') as f:
163
+ f.write(e)
164
 
165
 
166
  if __name__ == "__main__":
utils.py CHANGED
@@ -198,6 +198,7 @@ def calc_xdiff_ydiff(line_xcoords_no_pad, line_ycoords_no_pad, line_heights, all
198
 
199
 
200
  def add_words(chars_list):
 
201
  chars_list_reconstructed = []
202
  words_list = []
203
  sentence_list = []
@@ -210,6 +211,21 @@ def add_words(chars_list):
210
  on_line_num = -1
211
  line_change_on_next_char = False
212
  num_chars = len(chars_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  for idx, char_dict in enumerate(chars_list):
214
  # check if line change will happen after current char
215
  on_line_num = char_dict["assigned_line"]
@@ -222,23 +238,18 @@ def add_words(chars_list):
222
  word_xmin = chars_list_reconstructed[word_start_idx]["char_xmin"]
223
  if chars_list_reconstructed[-1]["char"] == " " and len(chars_list_reconstructed) != 1:
224
  word_xmax = chars_list_reconstructed[-2]["char_xmax"]
225
-
226
- word = "".join(
227
- [
228
- chars_list_reconstructed[idx]["char"]
229
- for idx in range(word_start_idx, len(chars_list_reconstructed) - 1)
230
- ]
231
- )
232
  elif len(chars_list_reconstructed) == 1:
233
  word_xmax = chars_list_reconstructed[-1]["char_xmax"]
234
  word = " "
235
  else:
236
- word = "".join(
237
- [
238
- chars_list_reconstructed[idx]["char"]
239
- for idx in range(word_start_idx, len(chars_list_reconstructed))
240
- ]
241
- )
242
  word_xmax = chars_list_reconstructed[-1]["char_xmax"]
243
  word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
244
  word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
@@ -269,10 +280,9 @@ def add_words(chars_list):
269
  char_dict["in_word_number"] = len(words_list) - 1
270
  char_dict["in_word"] = word
271
  char_dict["num_letters_from_start_of_word"] = cidx
272
-
273
  word_start_idx = idx + 1
274
-
275
- if chars_list_reconstructed[-1]["char"] in [".", "!", "?"] or idx == (len(chars_list) - 1):
276
  if idx != sentence_start_idx:
277
  chars_df_temp = pd.DataFrame(chars_list_reconstructed[sentence_start_idx:])
278
  line_texts = []
@@ -301,7 +311,6 @@ def add_words(chars_list):
301
  ):
302
  char_dict["in_word_number"] = chars_list_reconstructed[cidx + 1]["in_word_number"]
303
  char_dict["in_word"] = chars_list_reconstructed[cidx + 1]["in_word"]
304
-
305
  last_letter_in_word = words_list[-1]["word"][-1]
306
  last_letter_in_chars_list_reconstructed = char_dict["char"]
307
  if last_letter_in_word != last_letter_in_chars_list_reconstructed:
@@ -316,7 +325,6 @@ def add_words(chars_list):
316
  word_ymax=words_list[-1]["word_ymax"],
317
  assigned_line=assigned_line,
318
  )
319
-
320
  word_x_center = round(
321
  (words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"], ndigits=2
322
  )
@@ -347,7 +355,24 @@ def add_words(chars_list):
347
  chars_list_reconstructed[-1]["in_sentence"] = sentence_list[-1]["sentence_text"]
348
  else:
349
  ic(f"Warning Sentence list empty: {sentence_list}")
350
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  return words_list, chars_list_reconstructed
352
 
353
 
@@ -1520,6 +1545,7 @@ def download_example_ascs(EXAMPLES_FOLDER, EXAMPLES_ASC_ZIP_FILENAME, OSF_DOWNLA
1520
  ic(f"Extracting {EXAMPLES_ASC_ZIP_FILENAME} failed")
1521
 
1522
  EXAMPLE_ASC_FILES = [x for x in EXAMPLES_FOLDER_PATH.glob("*.asc")]
 
1523
  else:
1524
  EXAMPLE_ASC_FILES = []
1525
  return EXAMPLE_ASC_FILES
 
198
 
199
 
200
  def add_words(chars_list):
201
+
202
  chars_list_reconstructed = []
203
  words_list = []
204
  sentence_list = []
 
211
  on_line_num = -1
212
  line_change_on_next_char = False
213
  num_chars = len(chars_list)
214
+ # Helper: is this char a sentence boundary?
215
+ def is_sentence_boundary(idx):
216
+ c = chars_list_reconstructed[idx]["char"]
217
+ if c not in ".!?":
218
+ return False
219
+ # Check if previous and next chars are digits (for numbers like 41.2)
220
+ prev_c = chars_list_reconstructed[idx-1]["char"] if idx > 0 else None
221
+ if prev_c == chars_list[idx-1]["char"]:
222
+ next_c = chars_list[idx+1]["char"] if idx+1 < len(chars_list) else None
223
+ if prev_c and prev_c.isdigit() and next_c and next_c.isdigit():
224
+ return False
225
+ # Optionally: only treat as boundary if followed by space or end
226
+ if next_c and next_c not in [None, " ", "\n"]:
227
+ return False
228
+ return True
229
  for idx, char_dict in enumerate(chars_list):
230
  # check if line change will happen after current char
231
  on_line_num = char_dict["assigned_line"]
 
238
  word_xmin = chars_list_reconstructed[word_start_idx]["char_xmin"]
239
  if chars_list_reconstructed[-1]["char"] == " " and len(chars_list_reconstructed) != 1:
240
  word_xmax = chars_list_reconstructed[-2]["char_xmax"]
241
+ word = "".join([
242
+ chars_list_reconstructed[idx]["char"]
243
+ for idx in range(word_start_idx, len(chars_list_reconstructed) - 1)
244
+ ])
 
 
 
245
  elif len(chars_list_reconstructed) == 1:
246
  word_xmax = chars_list_reconstructed[-1]["char_xmax"]
247
  word = " "
248
  else:
249
+ word = "".join([
250
+ chars_list_reconstructed[idx]["char"]
251
+ for idx in range(word_start_idx, len(chars_list_reconstructed))
252
+ ])
 
 
253
  word_xmax = chars_list_reconstructed[-1]["char_xmax"]
254
  word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
255
  word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
 
280
  char_dict["in_word_number"] = len(words_list) - 1
281
  char_dict["in_word"] = word
282
  char_dict["num_letters_from_start_of_word"] = cidx
 
283
  word_start_idx = idx + 1
284
+ # Improved sentence boundary detection
285
+ if is_sentence_boundary(len(chars_list_reconstructed)-1) or idx == (len(chars_list) - 1):
286
  if idx != sentence_start_idx:
287
  chars_df_temp = pd.DataFrame(chars_list_reconstructed[sentence_start_idx:])
288
  line_texts = []
 
311
  ):
312
  char_dict["in_word_number"] = chars_list_reconstructed[cidx + 1]["in_word_number"]
313
  char_dict["in_word"] = chars_list_reconstructed[cidx + 1]["in_word"]
 
314
  last_letter_in_word = words_list[-1]["word"][-1]
315
  last_letter_in_chars_list_reconstructed = char_dict["char"]
316
  if last_letter_in_word != last_letter_in_chars_list_reconstructed:
 
325
  word_ymax=words_list[-1]["word_ymax"],
326
  assigned_line=assigned_line,
327
  )
 
328
  word_x_center = round(
329
  (words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"], ndigits=2
330
  )
 
355
  chars_list_reconstructed[-1]["in_sentence"] = sentence_list[-1]["sentence_text"]
356
  else:
357
  ic(f"Warning Sentence list empty: {sentence_list}")
358
+ # Add 'in_sentence' and 'in_sentence_number' keys to words_list using 'in_word_number' from chars_list_reconstructed
359
+ # Build a mapping from in_word_number to (in_sentence, in_sentence_number)
360
+ word_to_sentence = {}
361
+ word_to_sentence_number = {}
362
+ for char in chars_list_reconstructed:
363
+ if "in_word_number" in char and "in_sentence" in char and "in_sentence_number" in char:
364
+ word_to_sentence[char["in_word_number"]] = char["in_sentence"]
365
+ word_to_sentence_number[char["in_word_number"]] = char["in_sentence_number"]
366
+ for word in words_list:
367
+ word_num = word.get("word_number")
368
+ if word_num in word_to_sentence:
369
+ word["in_sentence"] = word_to_sentence[word_num]
370
+ else:
371
+ word["in_sentence"] = None
372
+ if word_num in word_to_sentence_number:
373
+ word["in_sentence_number"] = word_to_sentence_number[word_num]
374
+ else:
375
+ word["in_sentence_number"] = None
376
  return words_list, chars_list_reconstructed
377
 
378
 
 
1545
  ic(f"Extracting {EXAMPLES_ASC_ZIP_FILENAME} failed")
1546
 
1547
  EXAMPLE_ASC_FILES = [x for x in EXAMPLES_FOLDER_PATH.glob("*.asc")]
1548
+
1549
  else:
1550
  EXAMPLE_ASC_FILES = []
1551
  return EXAMPLE_ASC_FILES