hugpv commited on
Commit
7af3e73
·
1 Parent(s): 5f3a1bb

Synced with dev version; mainly sentence-measure related changes

Browse files
Files changed (5) hide show
  1. app.py +69 -41
  2. multi_proc_funcs.py +8 -3
  3. popEye_funcs.py +6 -2
  4. process_asc_files_in_multi_p.py +18 -0
  5. utils.py +45 -19
app.py CHANGED
@@ -1483,13 +1483,13 @@ def main():
1483
  events_df_expander_single.markdown("### Fixations")
1484
  events_df_expander_single.dataframe(
1485
  events_df[events_df["msg"] == "FIX"].dropna(how="all", axis=1).copy(),
1486
- use_container_width=True,
1487
  height=200,
1488
  )
1489
  events_df_expander_single.markdown("### Saccades")
1490
  events_df_expander_single.dataframe(
1491
  events_df[events_df["msg"] == "SAC"].dropna(how="all", axis=1).copy(),
1492
- use_container_width=True,
1493
  height=200,
1494
  )
1495
  if not events_df[events_df["msg"] == "BLINK"].empty:
@@ -1498,7 +1498,7 @@ def main():
1498
  blinksdf = blinksdf.drop(
1499
  columns=[c for c in blinksdf.columns if c in ["blink", "blink_after", "blink_before"]]
1500
  )
1501
- events_df_expander_single.dataframe(blinksdf, use_container_width=True, height=200)
1502
  show_cleaning_options(single_file_tab_asc_tab, events_df[events_df["msg"] == "FIX"], "single_asc")
1503
 
1504
  if "dffix_cleaned_single_asc" in st.session_state and "trial_single_asc" in st.session_state:
@@ -1589,12 +1589,12 @@ def main():
1589
  chars_colnames_markdown = read_chars_col_names()
1590
  st.markdown(chars_colnames_markdown)
1591
  df_stim_expander_single.dataframe(
1592
- pd.DataFrame(trial["chars_list"]), use_container_width=True, height=200
1593
  )
1594
  if "words_list" in trial:
1595
  df_stim_expander_single.markdown("### Words dataframe")
1596
  df_stim_expander_single.dataframe(
1597
- pd.DataFrame(trial["words_list"]), use_container_width=True, height=200
1598
  )
1599
  else:
1600
  st.warning("🚨 No stimulus information in session state")
@@ -1682,7 +1682,7 @@ def main():
1682
  font=selected_plotting_font_single_asc,
1683
  lines_in_plot=lines_in_plot_single_asc,
1684
  ),
1685
- use_container_width=False,
1686
  )
1687
  plot_expander_single.markdown("#### Saccades")
1688
 
@@ -1712,11 +1712,11 @@ def main():
1712
  font=selected_plotting_font_single_asc,
1713
  lines_in_plot=lines_in_plot_single_asc,
1714
  ),
1715
- use_container_width=False,
1716
  )
1717
  plot_expander_single.markdown("#### Y-coordinate correction due to line-assignment")
1718
  plot_expander_single.plotly_chart(
1719
- plot_y_corr(dffix, st.session_state["algo_choice_single_asc"]), use_container_width=True
1720
  )
1721
  if "average_y_corrections" in trial:
1722
  plot_expander_single.markdown(
@@ -1822,7 +1822,7 @@ def main():
1822
  with open("word_measures.md", "r") as f:
1823
  word_measure_colnames_markdown = "\n".join(f.readlines())
1824
  st.markdown(word_measure_colnames_markdown)
1825
- st.dataframe(own_word_measures, use_container_width=True, hide_index=True, height=200)
1826
  own_word_measures_csv = convert_df(own_word_measures)
1827
  subject = st.session_state["trial_single_asc"]["subject"]
1828
  trial_id = st.session_state["trial_single_asc"]["trial_id"]
@@ -1868,7 +1868,7 @@ def main():
1868
  st.markdown(sentence_measure_colnames_markdown)
1869
  st.dataframe(
1870
  st.session_state["own_sent_measures_single_asc"],
1871
- use_container_width=True,
1872
  hide_index=True,
1873
  height=200,
1874
  )
@@ -1951,7 +1951,7 @@ def main():
1951
  get_char_measures=False,
1952
  )
1953
 
1954
- st.dataframe(word_measures_df, use_container_width=True, hide_index=True, height=200)
1955
  word_measures_df_csv = convert_df(word_measures_df)
1956
 
1957
  st.download_button(
@@ -1975,7 +1975,7 @@ def main():
1975
 
1976
  if character_measures_df is not None:
1977
  st.dataframe(
1978
- character_measures_df, use_container_width=True, hide_index=True, height=200
1979
  )
1980
  else:
1981
  single_file_tab_asc_tab.warning("🚨 Stimulus information needed for analysis 🚨")
@@ -2006,7 +2006,7 @@ def main():
2006
  accept_multiple_files=False,
2007
  key="single_csv_file_stim_uploaded",
2008
  type={"json", "csv", "txt", "dat","jpeg","png"},
2009
- help="Drag and drop or select a single .json, .csv, .txt, .dat, jpeg or png file that you wish to process as the stimulus file for the uploaded fixation data. If an image is uploaded OCR will be attempted to extract the character bounding boxes. This can be left blank if you chose to use the examples.",
2010
  )
2011
 
2012
  st.checkbox(
@@ -2043,7 +2043,19 @@ def main():
2043
  ]:
2044
  if k in st.session_state:
2045
  del st.session_state[k]
2046
-
 
 
 
 
 
 
 
 
 
 
 
 
2047
  if use_example_or_uploaded_file_choice != "Example Files":
2048
  st.session_state["dffix_single_csv"] = load_csv_delim_agnostic(single_csv_file)
2049
  st.session_state["dffix_col_mappings_guess_single_csv"] = find_col_name_suggestions(
@@ -2063,6 +2075,8 @@ def main():
2063
  elif any([".png" in single_csv_stim_file.name, ".jpeg" in single_csv_stim_file.name]):
2064
  stimdf_single_csv = recognize_text(single_csv_stim_file)
2065
  stimdf_single_csv.to_csv(RESULTS_FOLDER / f"{single_csv_stim_file.name}_stimdf_single_from_OCR.csv")
 
 
2066
  st.session_state["stimdf_single_csv"] = stimdf_single_csv
2067
  colnames_stim = st.session_state["stimdf_single_csv"].columns
2068
  else:
@@ -2071,6 +2085,10 @@ def main():
2071
  st.session_state["chars_df_col_mappings_guess_single_csv"] = find_col_name_suggestions(
2072
  list(colnames_stim), COLNAMES_CUSTOM_CSV_STIM
2073
  )
 
 
 
 
2074
  else:
2075
  with open(EXAMPLE_CUSTOM_JSON_FILE, "r") as json_file:
2076
  json_string = json_file.read()
@@ -2086,7 +2104,7 @@ def main():
2086
  if in_st_nn("dffix_single_csv"):
2087
  st.dataframe(
2088
  st.session_state["dffix_single_csv"],
2089
- use_container_width=True,
2090
  hide_index=True,
2091
  on_select="ignore",
2092
  height=200,
@@ -2097,7 +2115,7 @@ def main():
2097
  else:
2098
  st.dataframe(
2099
  st.session_state["stimdf_single_csv"],
2100
- use_container_width=True,
2101
  hide_index=True,
2102
  on_select="ignore",
2103
  height=200,
@@ -2314,7 +2332,7 @@ def main():
2314
  help="This downloads the corrected fixations dataframe as a .csv file with the filename containing the trial id.",
2315
  )
2316
  with single_file_tab_csv_tab.expander("Show corrected fixation data", expanded=True):
2317
- st.dataframe(dffix, use_container_width=True, hide_index=True, height=200)
2318
  with single_file_tab_csv_tab.expander("Show fixation plots", expanded=True):
2319
 
2320
  plotting_checkboxes_single_single_csv = st.multiselect(
@@ -2332,9 +2350,9 @@ def main():
2332
  to_plot_list=plotting_checkboxes_single_single_csv,
2333
  algo_choice=st.session_state["algo_choice_single_csv"],
2334
  ),
2335
- use_container_width=True,
2336
  )
2337
- st.plotly_chart(plot_y_corr(dffix, st.session_state["algo_choice_single_csv"]), use_container_width=True)
2338
  plotlist = [x for x in dffix.columns if "Unnamed" not in str(x)]
2339
  plot_choice = st.multiselect(
2340
  "Which measures should be visualized?",
@@ -2342,7 +2360,7 @@ def main():
2342
  key="plot_choice_fix_measure",
2343
  default=plotlist[-1],
2344
  )
2345
- st.plotly_chart(plot_fix_measure(dffix, plot_choice, "Index"), use_container_width=True)
2346
 
2347
  if "chars_list" in trial:
2348
  analysis_expander_custom = single_file_tab_csv_tab.expander("Show Analysis results", True)
@@ -2419,7 +2437,7 @@ def main():
2419
  fixations_tuples, textblock_input_dict, trial=trial, get_char_measures=False
2420
  )
2421
 
2422
- st.dataframe(word_measures_df, use_container_width=True, hide_index=True, height=200)
2423
  word_measures_df_csv = convert_df(word_measures_df)
2424
 
2425
  st.download_button(
@@ -2435,7 +2453,7 @@ def main():
2435
  st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
2436
 
2437
  if character_measures_df is not None:
2438
- st.dataframe(character_measures_df, use_container_width=True, hide_index=True, height=200)
2439
 
2440
  with own_analysis_tab_custom:
2441
  st.markdown(
@@ -2450,9 +2468,19 @@ def main():
2450
  save_to_csv=True,
2451
  measures_to_calculate = ALL_MEASURES_OWN
2452
  )
2453
- st.dataframe(own_word_measures, use_container_width=True, hide_index=True, height=200)
2454
  own_word_measures_csv = convert_df(own_word_measures)
2455
 
 
 
 
 
 
 
 
 
 
 
2456
  st.download_button(
2457
  "⏬ Download word measures data",
2458
  own_word_measures_csv,
@@ -2803,14 +2831,14 @@ def main():
2803
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2804
  item_colnames_markdown = read_item_col_names()
2805
  st.markdown(item_colnames_markdown)
2806
- multi_file_tab.dataframe(st.session_state["trials_df"], use_container_width=True, height=200)
2807
  if in_st_nn("subjects_summary_df_multi_asc"):
2808
  multi_file_tab.markdown("### Subject level summary statistics")
2809
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2810
  subject_measure_colnames_markdown = read_subject_meas_col_names()
2811
  st.markdown(subject_measure_colnames_markdown)
2812
  multi_file_tab.dataframe(
2813
- st.session_state["subjects_summary_df_multi_asc"], use_container_width=True, height=200
2814
  )
2815
  if in_st_nn("trials_summary_df_multi_asc"):
2816
  multi_file_tab.markdown("### Trial level summary statistics")
@@ -2818,14 +2846,14 @@ def main():
2818
  trials_colnames_markdown = read_trial_col_names()
2819
  st.markdown(trials_colnames_markdown)
2820
  multi_file_tab.dataframe(
2821
- st.session_state["trials_summary_df_multi_asc"], use_container_width=True, height=200
2822
  )
2823
 
2824
  multi_file_tab.markdown("### Combined fixations dataframe and fixation level features")
2825
  with multi_file_tab.popover("Column name definitions"):
2826
  fix_colnames_markdown = get_fix_colnames_markdown()
2827
  st.markdown(fix_colnames_markdown)
2828
- multi_file_tab.dataframe(st.session_state["all_fix_dfs_concat_multi_asc"], use_container_width=True, height=200)
2829
 
2830
  high_fix_count_dfs = []
2831
  for algo_choice in st.session_state["algo_choice_multi_asc"]:
@@ -2855,26 +2883,26 @@ def main():
2855
  multi_file_tab.markdown(
2856
  "### Words that had a large number of fixations assigned to them and may need to be investigated"
2857
  )
2858
- multi_file_tab.dataframe(high_fix_count_dfs_cat, use_container_width=True, height=200)
2859
  subs_str = "-".join([s for s in st.session_state["all_trials_by_subj"].keys()])
2860
  high_fix_count_dfs_cat.to_csv(RESULTS_FOLDER / f"{subs_str}_words_with_many_fixations.csv")
2861
 
2862
  if "all_correction_stats" in st.session_state:
2863
  multi_file_tab.markdown("### Correction statistics")
2864
- multi_file_tab.dataframe(st.session_state["all_correction_stats"], use_container_width=True, height=200)
2865
  multi_file_tab.markdown("### Combined saccades dataframe and saccade level features")
2866
  with multi_file_tab.popover("Column name definitions"):
2867
  sac_colnames_markdown = get_sac_colnames_markdown()
2868
  st.markdown(sac_colnames_markdown)
2869
  multi_file_tab.dataframe(
2870
- st.session_state["all_sacc_dfs_concat_multi_asc"], use_container_width=True, height=200
2871
  )
2872
  multi_file_tab.markdown("### Combined characters dataframe")
2873
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2874
  chars_colnames_markdown = read_chars_col_names()
2875
  st.markdown(chars_colnames_markdown)
2876
  multi_file_tab.dataframe(
2877
- st.session_state["all_chars_dfs_concat_multi_asc"], use_container_width=True, height=200
2878
  )
2879
 
2880
  if not st.session_state["all_own_word_measures_concat"].empty:
@@ -2883,7 +2911,7 @@ def main():
2883
  word_measure_colnames_markdown = read_word_meas_col_names()
2884
  st.markdown(word_measure_colnames_markdown)
2885
  multi_file_tab.dataframe(
2886
- st.session_state["all_own_word_measures_concat"], use_container_width=True, height=200
2887
  )
2888
  if not st.session_state["all_sentence_dfs_concat_multi_asc"].empty:
2889
  multi_file_tab.markdown("### Combined sentence dataframe and sentence level features")
@@ -2891,7 +2919,7 @@ def main():
2891
  sentence_measure_colnames_markdown = read_sent_meas_col_names()
2892
  st.markdown(sentence_measure_colnames_markdown)
2893
  multi_file_tab.dataframe(
2894
- st.session_state["all_sentence_dfs_concat_multi_asc"], use_container_width=True, height=200
2895
  )
2896
  if "zipfiles_with_results" in st.session_state:
2897
  multi_res_col1, multi_res_col2 = multi_file_tab.columns(2)
@@ -2985,10 +3013,10 @@ def main():
2985
  to_plot_list=plotting_checkboxes_multi,
2986
  font=selecte_plotting_font_multi_asc,
2987
  ),
2988
- use_container_width=True,
2989
  )
2990
  plot_expander_multi.plotly_chart(
2991
- plot_y_corr(dffix_multi, st.session_state["algo_choice_multi_asc"]), use_container_width=True
2992
  )
2993
 
2994
  select_and_show_fix_sacc_feature_plots(
@@ -3060,7 +3088,7 @@ def main():
3060
  fixations_tuples, textblock_input_dict, trial=trial_multi, get_char_measures=False
3061
  )
3062
 
3063
- st.dataframe(word_measures_df, use_container_width=True, hide_index=True, height=200)
3064
  word_measures_df_csv = convert_df(word_measures_df)
3065
 
3066
  st.download_button(
@@ -3082,7 +3110,7 @@ def main():
3082
  st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
3083
 
3084
  if character_measures_df is not None:
3085
- st.dataframe(character_measures_df, use_container_width=True, hide_index=True, height=200)
3086
 
3087
  with own_analysis_tab:
3088
  st.markdown(
@@ -3114,7 +3142,7 @@ def main():
3114
  own_word_measures = reorder_columns(own_word_measures)
3115
  if "question_correct" in own_word_measures.columns:
3116
  own_word_measures = own_word_measures.drop(columns=["question_correct"])
3117
- st.dataframe(own_word_measures, use_container_width=True, hide_index=True, height=200)
3118
  own_word_measures_csv = convert_df(own_word_measures)
3119
  st.download_button(
3120
  "⏬ Download word measures data",
@@ -3142,7 +3170,7 @@ def main():
3142
  )
3143
  st.pyplot(own_word_measures_fig)
3144
  st.markdown("Sentence measures")
3145
- st.dataframe(sent_measures_multi, use_container_width=True, hide_index=True, height=200)
3146
 
3147
  else:
3148
  multi_file_tab.warning("🚨 Stimulus information needed for analysis 🚨")
@@ -3685,7 +3713,7 @@ def select_and_show_fix_sacc_feature_plots(
3685
  x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
3686
  label_start="Fixation",
3687
  ),
3688
- use_container_width=True,
3689
  )
3690
  sacc_feature_plot_col_single_asc.plotly_chart(
3691
  plot_fix_measure(
@@ -3694,7 +3722,7 @@ def select_and_show_fix_sacc_feature_plots(
3694
  x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
3695
  label_start="Saccade",
3696
  ),
3697
- use_container_width=True,
3698
  )
3699
 
3700
 
 
1483
  events_df_expander_single.markdown("### Fixations")
1484
  events_df_expander_single.dataframe(
1485
  events_df[events_df["msg"] == "FIX"].dropna(how="all", axis=1).copy(),
1486
+ width='stretch',
1487
  height=200,
1488
  )
1489
  events_df_expander_single.markdown("### Saccades")
1490
  events_df_expander_single.dataframe(
1491
  events_df[events_df["msg"] == "SAC"].dropna(how="all", axis=1).copy(),
1492
+ width='stretch',
1493
  height=200,
1494
  )
1495
  if not events_df[events_df["msg"] == "BLINK"].empty:
 
1498
  blinksdf = blinksdf.drop(
1499
  columns=[c for c in blinksdf.columns if c in ["blink", "blink_after", "blink_before"]]
1500
  )
1501
+ events_df_expander_single.dataframe(blinksdf, width='stretch', height=200)
1502
  show_cleaning_options(single_file_tab_asc_tab, events_df[events_df["msg"] == "FIX"], "single_asc")
1503
 
1504
  if "dffix_cleaned_single_asc" in st.session_state and "trial_single_asc" in st.session_state:
 
1589
  chars_colnames_markdown = read_chars_col_names()
1590
  st.markdown(chars_colnames_markdown)
1591
  df_stim_expander_single.dataframe(
1592
+ pd.DataFrame(trial["chars_list"]), width='stretch', height=200
1593
  )
1594
  if "words_list" in trial:
1595
  df_stim_expander_single.markdown("### Words dataframe")
1596
  df_stim_expander_single.dataframe(
1597
+ pd.DataFrame(trial["words_list"]), width='stretch', height=200
1598
  )
1599
  else:
1600
  st.warning("🚨 No stimulus information in session state")
 
1682
  font=selected_plotting_font_single_asc,
1683
  lines_in_plot=lines_in_plot_single_asc,
1684
  ),
1685
+ width='content',
1686
  )
1687
  plot_expander_single.markdown("#### Saccades")
1688
 
 
1712
  font=selected_plotting_font_single_asc,
1713
  lines_in_plot=lines_in_plot_single_asc,
1714
  ),
1715
+ width='content',
1716
  )
1717
  plot_expander_single.markdown("#### Y-coordinate correction due to line-assignment")
1718
  plot_expander_single.plotly_chart(
1719
+ plot_y_corr(dffix, st.session_state["algo_choice_single_asc"]), width='stretch'
1720
  )
1721
  if "average_y_corrections" in trial:
1722
  plot_expander_single.markdown(
 
1822
  with open("word_measures.md", "r") as f:
1823
  word_measure_colnames_markdown = "\n".join(f.readlines())
1824
  st.markdown(word_measure_colnames_markdown)
1825
+ st.dataframe(own_word_measures, width='stretch', hide_index=True, height=200)
1826
  own_word_measures_csv = convert_df(own_word_measures)
1827
  subject = st.session_state["trial_single_asc"]["subject"]
1828
  trial_id = st.session_state["trial_single_asc"]["trial_id"]
 
1868
  st.markdown(sentence_measure_colnames_markdown)
1869
  st.dataframe(
1870
  st.session_state["own_sent_measures_single_asc"],
1871
+ width='stretch',
1872
  hide_index=True,
1873
  height=200,
1874
  )
 
1951
  get_char_measures=False,
1952
  )
1953
 
1954
+ st.dataframe(word_measures_df, width='stretch', hide_index=True, height=200)
1955
  word_measures_df_csv = convert_df(word_measures_df)
1956
 
1957
  st.download_button(
 
1975
 
1976
  if character_measures_df is not None:
1977
  st.dataframe(
1978
+ character_measures_df, width='stretch', hide_index=True, height=200
1979
  )
1980
  else:
1981
  single_file_tab_asc_tab.warning("🚨 Stimulus information needed for analysis 🚨")
 
2006
  accept_multiple_files=False,
2007
  key="single_csv_file_stim_uploaded",
2008
  type={"json", "csv", "txt", "dat","jpeg","png"},
2009
+ help="Drag and drop or select a single .json, .csv, .txt, .dat, jpeg or png file that you wish to process as the stimulus file for the uploaded fixation data. If an image is uploaded OCR will be attempted to extract the character bounding boxes, note the filename MUST match the trial name for images(so for trial t1 the image MUST be called t1.png). This can be left blank if you chose to use the examples.",
2010
  )
2011
 
2012
  st.checkbox(
 
2043
  ]:
2044
  if k in st.session_state:
2045
  del st.session_state[k]
2046
+ def stringify_column_keep_decimal_if_needed(series):
2047
+ try:
2048
+ # Try to convert all values to float, then int
2049
+ floats = series.astype(float)
2050
+ ints = floats.astype(int)
2051
+ # If all values are equal as int and float, drop decimals
2052
+ if (floats == ints).all():
2053
+ return ints.astype(str)
2054
+ else:
2055
+ return floats.astype(str)
2056
+ except Exception:
2057
+ # If conversion fails, fallback to original as string
2058
+ return series.astype(str)
2059
  if use_example_or_uploaded_file_choice != "Example Files":
2060
  st.session_state["dffix_single_csv"] = load_csv_delim_agnostic(single_csv_file)
2061
  st.session_state["dffix_col_mappings_guess_single_csv"] = find_col_name_suggestions(
 
2075
  elif any([".png" in single_csv_stim_file.name, ".jpeg" in single_csv_stim_file.name]):
2076
  stimdf_single_csv = recognize_text(single_csv_stim_file)
2077
  stimdf_single_csv.to_csv(RESULTS_FOLDER / f"{single_csv_stim_file.name}_stimdf_single_from_OCR.csv")
2078
+ if 'trial_id' in stimdf_single_csv.columns:
2079
+ stimdf_single_csv['trial_id'] = stimdf_single_csv['trial_id'].astype(str)
2080
  st.session_state["stimdf_single_csv"] = stimdf_single_csv
2081
  colnames_stim = st.session_state["stimdf_single_csv"].columns
2082
  else:
 
2085
  st.session_state["chars_df_col_mappings_guess_single_csv"] = find_col_name_suggestions(
2086
  list(colnames_stim), COLNAMES_CUSTOM_CSV_STIM
2087
  )
2088
+ for colname in COLNAME_CANDIDATES_CUSTOM_CSV_FIX['trial_id_col_name_fix']:
2089
+ if colname in st.session_state["dffix_single_csv"].columns:
2090
+ st.session_state["dffix_single_csv"][colname] = stringify_column_keep_decimal_if_needed(st.session_state["dffix_single_csv"][colname])
2091
+
2092
  else:
2093
  with open(EXAMPLE_CUSTOM_JSON_FILE, "r") as json_file:
2094
  json_string = json_file.read()
 
2104
  if in_st_nn("dffix_single_csv"):
2105
  st.dataframe(
2106
  st.session_state["dffix_single_csv"],
2107
+ width='stretch',
2108
  hide_index=True,
2109
  on_select="ignore",
2110
  height=200,
 
2115
  else:
2116
  st.dataframe(
2117
  st.session_state["stimdf_single_csv"],
2118
+ width='stretch',
2119
  hide_index=True,
2120
  on_select="ignore",
2121
  height=200,
 
2332
  help="This downloads the corrected fixations dataframe as a .csv file with the filename containing the trial id.",
2333
  )
2334
  with single_file_tab_csv_tab.expander("Show corrected fixation data", expanded=True):
2335
+ st.dataframe(dffix, width='stretch', hide_index=True, height=200)
2336
  with single_file_tab_csv_tab.expander("Show fixation plots", expanded=True):
2337
 
2338
  plotting_checkboxes_single_single_csv = st.multiselect(
 
2350
  to_plot_list=plotting_checkboxes_single_single_csv,
2351
  algo_choice=st.session_state["algo_choice_single_csv"],
2352
  ),
2353
+ width='stretch',
2354
  )
2355
+ st.plotly_chart(plot_y_corr(dffix, st.session_state["algo_choice_single_csv"]), width='stretch')
2356
  plotlist = [x for x in dffix.columns if "Unnamed" not in str(x)]
2357
  plot_choice = st.multiselect(
2358
  "Which measures should be visualized?",
 
2360
  key="plot_choice_fix_measure",
2361
  default=plotlist[-1],
2362
  )
2363
+ st.plotly_chart(plot_fix_measure(dffix, plot_choice, "Index"), width='stretch')
2364
 
2365
  if "chars_list" in trial:
2366
  analysis_expander_custom = single_file_tab_csv_tab.expander("Show Analysis results", True)
 
2437
  fixations_tuples, textblock_input_dict, trial=trial, get_char_measures=False
2438
  )
2439
 
2440
+ st.dataframe(word_measures_df, width='stretch', hide_index=True, height=200)
2441
  word_measures_df_csv = convert_df(word_measures_df)
2442
 
2443
  st.download_button(
 
2453
  st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
2454
 
2455
  if character_measures_df is not None:
2456
+ st.dataframe(character_measures_df, width='stretch', hide_index=True, height=200)
2457
 
2458
  with own_analysis_tab_custom:
2459
  st.markdown(
 
2468
  save_to_csv=True,
2469
  measures_to_calculate = ALL_MEASURES_OWN
2470
  )
2471
+ st.dataframe(own_word_measures, width='stretch', hide_index=True, height=200)
2472
  own_word_measures_csv = convert_df(own_word_measures)
2473
 
2474
+ sent_measures_single_csv = compute_sentence_measures(
2475
+ dffix,
2476
+ pd.DataFrame(trial["chars_df"]),
2477
+ st.session_state["algo_choice_custom_eyekit"],
2478
+ ALL_SENT_MEASURES,
2479
+ save_to_csv=True,
2480
+ )
2481
+ st.session_state["own_sent_measures_single_csv"] = sent_measures_single_csv
2482
+ st.dataframe(sent_measures_single_csv, width='stretch', hide_index=True, height=200)
2483
+
2484
  st.download_button(
2485
  "⏬ Download word measures data",
2486
  own_word_measures_csv,
 
2831
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2832
  item_colnames_markdown = read_item_col_names()
2833
  st.markdown(item_colnames_markdown)
2834
+ multi_file_tab.dataframe(st.session_state["trials_df"], width='stretch', height=200)
2835
  if in_st_nn("subjects_summary_df_multi_asc"):
2836
  multi_file_tab.markdown("### Subject level summary statistics")
2837
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2838
  subject_measure_colnames_markdown = read_subject_meas_col_names()
2839
  st.markdown(subject_measure_colnames_markdown)
2840
  multi_file_tab.dataframe(
2841
+ st.session_state["subjects_summary_df_multi_asc"], width='stretch', height=200
2842
  )
2843
  if in_st_nn("trials_summary_df_multi_asc"):
2844
  multi_file_tab.markdown("### Trial level summary statistics")
 
2846
  trials_colnames_markdown = read_trial_col_names()
2847
  st.markdown(trials_colnames_markdown)
2848
  multi_file_tab.dataframe(
2849
+ st.session_state["trials_summary_df_multi_asc"], width='stretch', height=200
2850
  )
2851
 
2852
  multi_file_tab.markdown("### Combined fixations dataframe and fixation level features")
2853
  with multi_file_tab.popover("Column name definitions"):
2854
  fix_colnames_markdown = get_fix_colnames_markdown()
2855
  st.markdown(fix_colnames_markdown)
2856
+ multi_file_tab.dataframe(st.session_state["all_fix_dfs_concat_multi_asc"], width='stretch', height=200)
2857
 
2858
  high_fix_count_dfs = []
2859
  for algo_choice in st.session_state["algo_choice_multi_asc"]:
 
2883
  multi_file_tab.markdown(
2884
  "### Words that had a large number of fixations assigned to them and may need to be investigated"
2885
  )
2886
+ multi_file_tab.dataframe(high_fix_count_dfs_cat, width='stretch', height=200)
2887
  subs_str = "-".join([s for s in st.session_state["all_trials_by_subj"].keys()])
2888
  high_fix_count_dfs_cat.to_csv(RESULTS_FOLDER / f"{subs_str}_words_with_many_fixations.csv")
2889
 
2890
  if "all_correction_stats" in st.session_state:
2891
  multi_file_tab.markdown("### Correction statistics")
2892
+ multi_file_tab.dataframe(st.session_state["all_correction_stats"], width='stretch', height=200)
2893
  multi_file_tab.markdown("### Combined saccades dataframe and saccade level features")
2894
  with multi_file_tab.popover("Column name definitions"):
2895
  sac_colnames_markdown = get_sac_colnames_markdown()
2896
  st.markdown(sac_colnames_markdown)
2897
  multi_file_tab.dataframe(
2898
+ st.session_state["all_sacc_dfs_concat_multi_asc"], width='stretch', height=200
2899
  )
2900
  multi_file_tab.markdown("### Combined characters dataframe")
2901
  with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
2902
  chars_colnames_markdown = read_chars_col_names()
2903
  st.markdown(chars_colnames_markdown)
2904
  multi_file_tab.dataframe(
2905
+ st.session_state["all_chars_dfs_concat_multi_asc"], width='stretch', height=200
2906
  )
2907
 
2908
  if not st.session_state["all_own_word_measures_concat"].empty:
 
2911
  word_measure_colnames_markdown = read_word_meas_col_names()
2912
  st.markdown(word_measure_colnames_markdown)
2913
  multi_file_tab.dataframe(
2914
+ st.session_state["all_own_word_measures_concat"], width='stretch', height=200
2915
  )
2916
  if not st.session_state["all_sentence_dfs_concat_multi_asc"].empty:
2917
  multi_file_tab.markdown("### Combined sentence dataframe and sentence level features")
 
2919
  sentence_measure_colnames_markdown = read_sent_meas_col_names()
2920
  st.markdown(sentence_measure_colnames_markdown)
2921
  multi_file_tab.dataframe(
2922
+ st.session_state["all_sentence_dfs_concat_multi_asc"], width='stretch', height=200
2923
  )
2924
  if "zipfiles_with_results" in st.session_state:
2925
  multi_res_col1, multi_res_col2 = multi_file_tab.columns(2)
 
3013
  to_plot_list=plotting_checkboxes_multi,
3014
  font=selecte_plotting_font_multi_asc,
3015
  ),
3016
+ width='stretch',
3017
  )
3018
  plot_expander_multi.plotly_chart(
3019
+ plot_y_corr(dffix_multi, st.session_state["algo_choice_multi_asc"]), width='stretch'
3020
  )
3021
 
3022
  select_and_show_fix_sacc_feature_plots(
 
3088
  fixations_tuples, textblock_input_dict, trial=trial_multi, get_char_measures=False
3089
  )
3090
 
3091
+ st.dataframe(word_measures_df, width='stretch', hide_index=True, height=200)
3092
  word_measures_df_csv = convert_df(word_measures_df)
3093
 
3094
  st.download_button(
 
3110
  st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
3111
 
3112
  if character_measures_df is not None:
3113
+ st.dataframe(character_measures_df, width='stretch', hide_index=True, height=200)
3114
 
3115
  with own_analysis_tab:
3116
  st.markdown(
 
3142
  own_word_measures = reorder_columns(own_word_measures)
3143
  if "question_correct" in own_word_measures.columns:
3144
  own_word_measures = own_word_measures.drop(columns=["question_correct"])
3145
+ st.dataframe(own_word_measures, width='stretch', hide_index=True, height=200)
3146
  own_word_measures_csv = convert_df(own_word_measures)
3147
  st.download_button(
3148
  "⏬ Download word measures data",
 
3170
  )
3171
  st.pyplot(own_word_measures_fig)
3172
  st.markdown("Sentence measures")
3173
+ st.dataframe(sent_measures_multi, width='stretch', hide_index=True, height=200)
3174
 
3175
  else:
3176
  multi_file_tab.warning("🚨 Stimulus information needed for analysis 🚨")
 
3713
  x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
3714
  label_start="Fixation",
3715
  ),
3716
+ width='stretch',
3717
  )
3718
  sacc_feature_plot_col_single_asc.plotly_chart(
3719
  plot_fix_measure(
 
3722
  x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
3723
  label_start="Saccade",
3724
  ),
3725
+ width='stretch',
3726
  )
3727
 
3728
 
multi_proc_funcs.py CHANGED
@@ -267,7 +267,7 @@ def set_up_models(dist_models_folder):
267
  out_dict = {}
268
  dist_models_with_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_True*.ckpt"))
269
  dist_models_without_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_False*.ckpt"))
270
- DIST_MODEL_DATE_WITH_NORM = dist_models_with_norm[0].stem.split("_")[1]
271
 
272
  models_without_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_without_norm]
273
  models_with_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_with_norm]
@@ -277,7 +277,10 @@ def set_up_models(dist_models_folder):
277
 
278
  models_without_norm_df = [x[0] for x in models_without_norm_df if x[0] is not None]
279
  models_with_norm_df = [x[0] for x in models_with_norm_df if x[0] is not None]
280
-
 
 
 
281
  ensemble_model_avg = models.EnsembleModel(
282
  models_without_norm_df, models_with_norm_df, learning_rate=0.0058, use_simple_average=True
283
  )
@@ -286,9 +289,11 @@ def set_up_models(dist_models_folder):
286
  out_dict["model_cfg_without_norm_df"] = model_cfg_without_norm_df
287
  out_dict["model_cfg_with_norm_df"] = model_cfg_with_norm_df
288
 
289
- single_DIST_model, single_DIST_model_cfg = find_and_load_model(model_date=DIST_MODEL_DATE_WITH_NORM)
290
  out_dict["single_DIST_model"] = single_DIST_model
291
  out_dict["single_DIST_model_cfg"] = single_DIST_model_cfg
 
 
292
  return out_dict
293
 
294
 
 
267
  out_dict = {}
268
  dist_models_with_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_True*.ckpt"))
269
  dist_models_without_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_False*.ckpt"))
270
+ dist_model_date_with_norm = dist_models_with_norm[0].stem.split("_")[1]
271
 
272
  models_without_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_without_norm]
273
  models_with_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_with_norm]
 
277
 
278
  models_without_norm_df = [x[0] for x in models_without_norm_df if x[0] is not None]
279
  models_with_norm_df = [x[0] for x in models_with_norm_df if x[0] is not None]
280
+ if len(models_without_norm_df) == 0:
281
+ ic("No models without normalization found")
282
+ if len(models_with_norm_df) == 0:
283
+ ic("No models with normalization found")
284
  ensemble_model_avg = models.EnsembleModel(
285
  models_without_norm_df, models_with_norm_df, learning_rate=0.0058, use_simple_average=True
286
  )
 
289
  out_dict["model_cfg_without_norm_df"] = model_cfg_without_norm_df
290
  out_dict["model_cfg_with_norm_df"] = model_cfg_with_norm_df
291
 
292
+ single_DIST_model, single_DIST_model_cfg = find_and_load_model(model_date=dist_model_date_with_norm)
293
  out_dict["single_DIST_model"] = single_DIST_model
294
  out_dict["single_DIST_model_cfg"] = single_DIST_model_cfg
295
+ if single_DIST_model is None:
296
+ ic(f"Failed to load single DIST model for {dist_model_date_with_norm}")
297
  return out_dict
298
 
299
 
popEye_funcs.py CHANGED
@@ -2,6 +2,7 @@
2
  Mostly adapted from: https://github.com/sascha2schroeder/popEye
3
  """
4
 
 
5
  import numpy as np
6
  import pandas as pd
7
  from icecream import ic
@@ -886,11 +887,13 @@ def combine_words(fix, wordfirst, wordtmp, algo_choice, measures_to_calculate):
886
 
887
 
888
  def compute_sentence_measures(fix, stimmat, algo_choice, measures_to_calc, save_to_csv=False):
889
- sentitem = stimmat.drop_duplicates(
890
  subset="in_sentence_number", keep="first"
891
  ) # TODO check why there are rows with sent number None
892
  fixin = fix.copy().reset_index(drop=True)
893
-
 
 
894
  fixin["on_sentence_num2"] = fixin[f"on_sentence_num_{algo_choice}"].copy()
895
 
896
  # Recompute sentence number (two fixation exception rule)
@@ -1146,6 +1149,7 @@ def compute_sentence_measures(fix, stimmat, algo_choice, measures_to_calc, save_
1146
  item.rename({"in_sentence_number": f"on_sentence_num_{algo_choice}"}, axis=1),
1147
  on=f"on_sentence_num_{algo_choice}",
1148
  how="left",
 
1149
  )
1150
  sent[f"skip_{algo_choice}"] = 0
1151
  sent.loc[pd.isna(sent[f"nrun_{algo_choice}"]), f"skip_{algo_choice}"] = 1
 
2
  Mostly adapted from: https://github.com/sascha2schroeder/popEye
3
  """
4
 
5
+ from copy import deepcopy
6
  import numpy as np
7
  import pandas as pd
8
  from icecream import ic
 
887
 
888
 
889
  def compute_sentence_measures(fix, stimmat, algo_choice, measures_to_calc, save_to_csv=False):
890
+ sentitem = deepcopy(stimmat).drop_duplicates(
891
  subset="in_sentence_number", keep="first"
892
  ) # TODO check why there are rows with sent number None
893
  fixin = fix.copy().reset_index(drop=True)
894
+ unique_trial_ids = fixin['trial_id'].unique()
895
+ if len(unique_trial_ids) == 1:
896
+ sentitem.loc[:,'trial_id'] = unique_trial_ids[0]
897
  fixin["on_sentence_num2"] = fixin[f"on_sentence_num_{algo_choice}"].copy()
898
 
899
  # Recompute sentence number (two fixation exception rule)
 
1149
  item.rename({"in_sentence_number": f"on_sentence_num_{algo_choice}"}, axis=1),
1150
  on=f"on_sentence_num_{algo_choice}",
1151
  how="left",
1152
+ suffixes=['','from_item']
1153
  )
1154
  sent[f"skip_{algo_choice}"] = 0
1155
  sent.loc[pd.isna(sent[f"nrun_{algo_choice}"]), f"skip_{algo_choice}"] = 1
process_asc_files_in_multi_p.py CHANGED
@@ -1,5 +1,6 @@
1
  from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
2
  import json
 
3
  from sys import platform as _platform
4
  from functools import partial
5
  import multiprocessing
@@ -10,6 +11,17 @@ import sys
10
  import pandas as pd
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
13
  def get_cpu_count():
14
  if os.sys.platform in ("linux", "linux2", "darwin"):
15
  return os.cpu_count()
@@ -103,6 +115,7 @@ def main():
103
  models_dict,
104
  fix_cols_to_add_multi_asc,
105
  ) = json.loads(input_data)
 
106
  if (
107
  "DIST" in algo_choice
108
  or "Wisdom_of_Crowds_with_DIST" in algo_choice
@@ -133,16 +146,21 @@ def main():
133
  models_dict,
134
  fix_cols_to_add_multi_asc,
135
  )
 
136
  out2 = []
137
  for dffix, trial in out:
138
  dffix = dffix.to_dict("records")
139
  trial = make_json_compatible(trial)
140
  out2.append((dffix, trial))
141
  json_data_out = json.dumps(out2)
 
142
  sys.stdout.flush()
143
  print(json_data_out)
144
  except Exception as e:
 
145
  print(json.dumps({"error": str(e)}))
 
 
146
 
147
 
148
  if __name__ == "__main__":
 
1
  from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
2
  import json
3
+ import logging
4
  from sys import platform as _platform
5
  from functools import partial
6
  import multiprocessing
 
11
  import pandas as pd
12
 
13
 
14
+ logger = logging.getLogger('multiproc')
15
+ logger.propagate = False
16
+ logger.setLevel("DEBUG")
17
+ ch = logging.FileHandler("MULTI_LOG.log", "a")
18
+ ch.setFormatter(
19
+ logging.Formatter(
20
+ "%(asctime)s-{%(filename)s:%(lineno)d}-%(levelname)s >>> %(message)s",
21
+ "%m-%d %H:%M:%S",
22
+ )
23
+ )
24
+ logger.addHandler(ch)
25
  def get_cpu_count():
26
  if os.sys.platform in ("linux", "linux2", "darwin"):
27
  return os.cpu_count()
 
115
  models_dict,
116
  fix_cols_to_add_multi_asc,
117
  ) = json.loads(input_data)
118
+ logger.info("Finished loading from json")
119
  if (
120
  "DIST" in algo_choice
121
  or "Wisdom_of_Crowds_with_DIST" in algo_choice
 
146
  models_dict,
147
  fix_cols_to_add_multi_asc,
148
  )
149
+ logger.info("Finished process_asc_files_in_multi_proc")
150
  out2 = []
151
  for dffix, trial in out:
152
  dffix = dffix.to_dict("records")
153
  trial = make_json_compatible(trial)
154
  out2.append((dffix, trial))
155
  json_data_out = json.dumps(out2)
156
+ logger.info("Finished appending")
157
  sys.stdout.flush()
158
  print(json_data_out)
159
  except Exception as e:
160
+ logger.warning(e)
161
  print(json.dumps({"error": str(e)}))
162
+ with open("MULTI_ERROR.log",'w') as f:
163
+ f.write(e)
164
 
165
 
166
  if __name__ == "__main__":
utils.py CHANGED
@@ -198,6 +198,7 @@ def calc_xdiff_ydiff(line_xcoords_no_pad, line_ycoords_no_pad, line_heights, all
198
 
199
 
200
  def add_words(chars_list):
 
201
  chars_list_reconstructed = []
202
  words_list = []
203
  sentence_list = []
@@ -210,6 +211,21 @@ def add_words(chars_list):
210
  on_line_num = -1
211
  line_change_on_next_char = False
212
  num_chars = len(chars_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  for idx, char_dict in enumerate(chars_list):
214
  # check if line change will happen after current char
215
  on_line_num = char_dict["assigned_line"]
@@ -222,23 +238,18 @@ def add_words(chars_list):
222
  word_xmin = chars_list_reconstructed[word_start_idx]["char_xmin"]
223
  if chars_list_reconstructed[-1]["char"] == " " and len(chars_list_reconstructed) != 1:
224
  word_xmax = chars_list_reconstructed[-2]["char_xmax"]
225
-
226
- word = "".join(
227
- [
228
- chars_list_reconstructed[idx]["char"]
229
- for idx in range(word_start_idx, len(chars_list_reconstructed) - 1)
230
- ]
231
- )
232
  elif len(chars_list_reconstructed) == 1:
233
  word_xmax = chars_list_reconstructed[-1]["char_xmax"]
234
  word = " "
235
  else:
236
- word = "".join(
237
- [
238
- chars_list_reconstructed[idx]["char"]
239
- for idx in range(word_start_idx, len(chars_list_reconstructed))
240
- ]
241
- )
242
  word_xmax = chars_list_reconstructed[-1]["char_xmax"]
243
  word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
244
  word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
@@ -269,10 +280,9 @@ def add_words(chars_list):
269
  char_dict["in_word_number"] = len(words_list) - 1
270
  char_dict["in_word"] = word
271
  char_dict["num_letters_from_start_of_word"] = cidx
272
-
273
  word_start_idx = idx + 1
274
-
275
- if chars_list_reconstructed[-1]["char"] in [".", "!", "?"] or idx == (len(chars_list) - 1):
276
  if idx != sentence_start_idx:
277
  chars_df_temp = pd.DataFrame(chars_list_reconstructed[sentence_start_idx:])
278
  line_texts = []
@@ -301,7 +311,6 @@ def add_words(chars_list):
301
  ):
302
  char_dict["in_word_number"] = chars_list_reconstructed[cidx + 1]["in_word_number"]
303
  char_dict["in_word"] = chars_list_reconstructed[cidx + 1]["in_word"]
304
-
305
  last_letter_in_word = words_list[-1]["word"][-1]
306
  last_letter_in_chars_list_reconstructed = char_dict["char"]
307
  if last_letter_in_word != last_letter_in_chars_list_reconstructed:
@@ -316,7 +325,6 @@ def add_words(chars_list):
316
  word_ymax=words_list[-1]["word_ymax"],
317
  assigned_line=assigned_line,
318
  )
319
-
320
  word_x_center = round(
321
  (words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"], ndigits=2
322
  )
@@ -347,7 +355,24 @@ def add_words(chars_list):
347
  chars_list_reconstructed[-1]["in_sentence"] = sentence_list[-1]["sentence_text"]
348
  else:
349
  ic(f"Warning Sentence list empty: {sentence_list}")
350
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  return words_list, chars_list_reconstructed
352
 
353
 
@@ -1520,6 +1545,7 @@ def download_example_ascs(EXAMPLES_FOLDER, EXAMPLES_ASC_ZIP_FILENAME, OSF_DOWNLA
1520
  ic(f"Extracting {EXAMPLES_ASC_ZIP_FILENAME} failed")
1521
 
1522
  EXAMPLE_ASC_FILES = [x for x in EXAMPLES_FOLDER_PATH.glob("*.asc")]
 
1523
  else:
1524
  EXAMPLE_ASC_FILES = []
1525
  return EXAMPLE_ASC_FILES
 
198
 
199
 
200
  def add_words(chars_list):
201
+
202
  chars_list_reconstructed = []
203
  words_list = []
204
  sentence_list = []
 
211
  on_line_num = -1
212
  line_change_on_next_char = False
213
  num_chars = len(chars_list)
214
+ # Helper: is this char a sentence boundary?
215
+ def is_sentence_boundary(idx):
216
+ c = chars_list_reconstructed[idx]["char"]
217
+ if c not in ".!?":
218
+ return False
219
+ # Check if previous and next chars are digits (for numbers like 41.2)
220
+ prev_c = chars_list_reconstructed[idx-1]["char"] if idx > 0 else None
221
+ if prev_c == chars_list[idx-1]["char"]:
222
+ next_c = chars_list[idx+1]["char"] if idx+1 < len(chars_list) else None
223
+ if prev_c and prev_c.isdigit() and next_c and next_c.isdigit():
224
+ return False
225
+ # Optionally: only treat as boundary if followed by space or end
226
+ if next_c and next_c not in [None, " ", "\n"]:
227
+ return False
228
+ return True
229
  for idx, char_dict in enumerate(chars_list):
230
  # check if line change will happen after current char
231
  on_line_num = char_dict["assigned_line"]
 
238
  word_xmin = chars_list_reconstructed[word_start_idx]["char_xmin"]
239
  if chars_list_reconstructed[-1]["char"] == " " and len(chars_list_reconstructed) != 1:
240
  word_xmax = chars_list_reconstructed[-2]["char_xmax"]
241
+ word = "".join([
242
+ chars_list_reconstructed[idx]["char"]
243
+ for idx in range(word_start_idx, len(chars_list_reconstructed) - 1)
244
+ ])
 
 
 
245
  elif len(chars_list_reconstructed) == 1:
246
  word_xmax = chars_list_reconstructed[-1]["char_xmax"]
247
  word = " "
248
  else:
249
+ word = "".join([
250
+ chars_list_reconstructed[idx]["char"]
251
+ for idx in range(word_start_idx, len(chars_list_reconstructed))
252
+ ])
 
 
253
  word_xmax = chars_list_reconstructed[-1]["char_xmax"]
254
  word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
255
  word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
 
280
  char_dict["in_word_number"] = len(words_list) - 1
281
  char_dict["in_word"] = word
282
  char_dict["num_letters_from_start_of_word"] = cidx
 
283
  word_start_idx = idx + 1
284
+ # Improved sentence boundary detection
285
+ if is_sentence_boundary(len(chars_list_reconstructed)-1) or idx == (len(chars_list) - 1):
286
  if idx != sentence_start_idx:
287
  chars_df_temp = pd.DataFrame(chars_list_reconstructed[sentence_start_idx:])
288
  line_texts = []
 
311
  ):
312
  char_dict["in_word_number"] = chars_list_reconstructed[cidx + 1]["in_word_number"]
313
  char_dict["in_word"] = chars_list_reconstructed[cidx + 1]["in_word"]
 
314
  last_letter_in_word = words_list[-1]["word"][-1]
315
  last_letter_in_chars_list_reconstructed = char_dict["char"]
316
  if last_letter_in_word != last_letter_in_chars_list_reconstructed:
 
325
  word_ymax=words_list[-1]["word_ymax"],
326
  assigned_line=assigned_line,
327
  )
 
328
  word_x_center = round(
329
  (words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"], ndigits=2
330
  )
 
355
  chars_list_reconstructed[-1]["in_sentence"] = sentence_list[-1]["sentence_text"]
356
  else:
357
  ic(f"Warning Sentence list empty: {sentence_list}")
358
+ # Add 'in_sentence' and 'in_sentence_number' keys to words_list using 'in_word_number' from chars_list_reconstructed
359
+ # Build a mapping from in_word_number to (in_sentence, in_sentence_number)
360
+ word_to_sentence = {}
361
+ word_to_sentence_number = {}
362
+ for char in chars_list_reconstructed:
363
+ if "in_word_number" in char and "in_sentence" in char and "in_sentence_number" in char:
364
+ word_to_sentence[char["in_word_number"]] = char["in_sentence"]
365
+ word_to_sentence_number[char["in_word_number"]] = char["in_sentence_number"]
366
+ for word in words_list:
367
+ word_num = word.get("word_number")
368
+ if word_num in word_to_sentence:
369
+ word["in_sentence"] = word_to_sentence[word_num]
370
+ else:
371
+ word["in_sentence"] = None
372
+ if word_num in word_to_sentence_number:
373
+ word["in_sentence_number"] = word_to_sentence_number[word_num]
374
+ else:
375
+ word["in_sentence_number"] = None
376
  return words_list, chars_list_reconstructed
377
 
378
 
 
1545
  ic(f"Extracting {EXAMPLES_ASC_ZIP_FILENAME} failed")
1546
 
1547
  EXAMPLE_ASC_FILES = [x for x in EXAMPLES_FOLDER_PATH.glob("*.asc")]
1548
+
1549
  else:
1550
  EXAMPLE_ASC_FILES = []
1551
  return EXAMPLE_ASC_FILES