synced with dev version mainly sent measure related
Browse files- app.py +69 -41
- multi_proc_funcs.py +8 -3
- popEye_funcs.py +6 -2
- process_asc_files_in_multi_p.py +18 -0
- utils.py +45 -19
app.py
CHANGED
|
@@ -1483,13 +1483,13 @@ def main():
|
|
| 1483 |
events_df_expander_single.markdown("### Fixations")
|
| 1484 |
events_df_expander_single.dataframe(
|
| 1485 |
events_df[events_df["msg"] == "FIX"].dropna(how="all", axis=1).copy(),
|
| 1486 |
-
|
| 1487 |
height=200,
|
| 1488 |
)
|
| 1489 |
events_df_expander_single.markdown("### Saccades")
|
| 1490 |
events_df_expander_single.dataframe(
|
| 1491 |
events_df[events_df["msg"] == "SAC"].dropna(how="all", axis=1).copy(),
|
| 1492 |
-
|
| 1493 |
height=200,
|
| 1494 |
)
|
| 1495 |
if not events_df[events_df["msg"] == "BLINK"].empty:
|
|
@@ -1498,7 +1498,7 @@ def main():
|
|
| 1498 |
blinksdf = blinksdf.drop(
|
| 1499 |
columns=[c for c in blinksdf.columns if c in ["blink", "blink_after", "blink_before"]]
|
| 1500 |
)
|
| 1501 |
-
events_df_expander_single.dataframe(blinksdf,
|
| 1502 |
show_cleaning_options(single_file_tab_asc_tab, events_df[events_df["msg"] == "FIX"], "single_asc")
|
| 1503 |
|
| 1504 |
if "dffix_cleaned_single_asc" in st.session_state and "trial_single_asc" in st.session_state:
|
|
@@ -1589,12 +1589,12 @@ def main():
|
|
| 1589 |
chars_colnames_markdown = read_chars_col_names()
|
| 1590 |
st.markdown(chars_colnames_markdown)
|
| 1591 |
df_stim_expander_single.dataframe(
|
| 1592 |
-
pd.DataFrame(trial["chars_list"]),
|
| 1593 |
)
|
| 1594 |
if "words_list" in trial:
|
| 1595 |
df_stim_expander_single.markdown("### Words dataframe")
|
| 1596 |
df_stim_expander_single.dataframe(
|
| 1597 |
-
pd.DataFrame(trial["words_list"]),
|
| 1598 |
)
|
| 1599 |
else:
|
| 1600 |
st.warning("🚨 No stimulus information in session state")
|
|
@@ -1682,7 +1682,7 @@ def main():
|
|
| 1682 |
font=selected_plotting_font_single_asc,
|
| 1683 |
lines_in_plot=lines_in_plot_single_asc,
|
| 1684 |
),
|
| 1685 |
-
|
| 1686 |
)
|
| 1687 |
plot_expander_single.markdown("#### Saccades")
|
| 1688 |
|
|
@@ -1712,11 +1712,11 @@ def main():
|
|
| 1712 |
font=selected_plotting_font_single_asc,
|
| 1713 |
lines_in_plot=lines_in_plot_single_asc,
|
| 1714 |
),
|
| 1715 |
-
|
| 1716 |
)
|
| 1717 |
plot_expander_single.markdown("#### Y-coordinate correction due to line-assignment")
|
| 1718 |
plot_expander_single.plotly_chart(
|
| 1719 |
-
plot_y_corr(dffix, st.session_state["algo_choice_single_asc"]),
|
| 1720 |
)
|
| 1721 |
if "average_y_corrections" in trial:
|
| 1722 |
plot_expander_single.markdown(
|
|
@@ -1822,7 +1822,7 @@ def main():
|
|
| 1822 |
with open("word_measures.md", "r") as f:
|
| 1823 |
word_measure_colnames_markdown = "\n".join(f.readlines())
|
| 1824 |
st.markdown(word_measure_colnames_markdown)
|
| 1825 |
-
st.dataframe(own_word_measures,
|
| 1826 |
own_word_measures_csv = convert_df(own_word_measures)
|
| 1827 |
subject = st.session_state["trial_single_asc"]["subject"]
|
| 1828 |
trial_id = st.session_state["trial_single_asc"]["trial_id"]
|
|
@@ -1868,7 +1868,7 @@ def main():
|
|
| 1868 |
st.markdown(sentence_measure_colnames_markdown)
|
| 1869 |
st.dataframe(
|
| 1870 |
st.session_state["own_sent_measures_single_asc"],
|
| 1871 |
-
|
| 1872 |
hide_index=True,
|
| 1873 |
height=200,
|
| 1874 |
)
|
|
@@ -1951,7 +1951,7 @@ def main():
|
|
| 1951 |
get_char_measures=False,
|
| 1952 |
)
|
| 1953 |
|
| 1954 |
-
st.dataframe(word_measures_df,
|
| 1955 |
word_measures_df_csv = convert_df(word_measures_df)
|
| 1956 |
|
| 1957 |
st.download_button(
|
|
@@ -1975,7 +1975,7 @@ def main():
|
|
| 1975 |
|
| 1976 |
if character_measures_df is not None:
|
| 1977 |
st.dataframe(
|
| 1978 |
-
character_measures_df,
|
| 1979 |
)
|
| 1980 |
else:
|
| 1981 |
single_file_tab_asc_tab.warning("🚨 Stimulus information needed for analysis 🚨")
|
|
@@ -2006,7 +2006,7 @@ def main():
|
|
| 2006 |
accept_multiple_files=False,
|
| 2007 |
key="single_csv_file_stim_uploaded",
|
| 2008 |
type={"json", "csv", "txt", "dat","jpeg","png"},
|
| 2009 |
-
help="Drag and drop or select a single .json, .csv, .txt, .dat, jpeg or png file that you wish to process as the stimulus file for the uploaded fixation data. If an image is uploaded OCR will be attempted to extract the character bounding boxes. This can be left blank if you chose to use the examples.",
|
| 2010 |
)
|
| 2011 |
|
| 2012 |
st.checkbox(
|
|
@@ -2043,7 +2043,19 @@ def main():
|
|
| 2043 |
]:
|
| 2044 |
if k in st.session_state:
|
| 2045 |
del st.session_state[k]
|
| 2046 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2047 |
if use_example_or_uploaded_file_choice != "Example Files":
|
| 2048 |
st.session_state["dffix_single_csv"] = load_csv_delim_agnostic(single_csv_file)
|
| 2049 |
st.session_state["dffix_col_mappings_guess_single_csv"] = find_col_name_suggestions(
|
|
@@ -2063,6 +2075,8 @@ def main():
|
|
| 2063 |
elif any([".png" in single_csv_stim_file.name, ".jpeg" in single_csv_stim_file.name]):
|
| 2064 |
stimdf_single_csv = recognize_text(single_csv_stim_file)
|
| 2065 |
stimdf_single_csv.to_csv(RESULTS_FOLDER / f"{single_csv_stim_file.name}_stimdf_single_from_OCR.csv")
|
|
|
|
|
|
|
| 2066 |
st.session_state["stimdf_single_csv"] = stimdf_single_csv
|
| 2067 |
colnames_stim = st.session_state["stimdf_single_csv"].columns
|
| 2068 |
else:
|
|
@@ -2071,6 +2085,10 @@ def main():
|
|
| 2071 |
st.session_state["chars_df_col_mappings_guess_single_csv"] = find_col_name_suggestions(
|
| 2072 |
list(colnames_stim), COLNAMES_CUSTOM_CSV_STIM
|
| 2073 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2074 |
else:
|
| 2075 |
with open(EXAMPLE_CUSTOM_JSON_FILE, "r") as json_file:
|
| 2076 |
json_string = json_file.read()
|
|
@@ -2086,7 +2104,7 @@ def main():
|
|
| 2086 |
if in_st_nn("dffix_single_csv"):
|
| 2087 |
st.dataframe(
|
| 2088 |
st.session_state["dffix_single_csv"],
|
| 2089 |
-
|
| 2090 |
hide_index=True,
|
| 2091 |
on_select="ignore",
|
| 2092 |
height=200,
|
|
@@ -2097,7 +2115,7 @@ def main():
|
|
| 2097 |
else:
|
| 2098 |
st.dataframe(
|
| 2099 |
st.session_state["stimdf_single_csv"],
|
| 2100 |
-
|
| 2101 |
hide_index=True,
|
| 2102 |
on_select="ignore",
|
| 2103 |
height=200,
|
|
@@ -2314,7 +2332,7 @@ def main():
|
|
| 2314 |
help="This downloads the corrected fixations dataframe as a .csv file with the filename containing the trial id.",
|
| 2315 |
)
|
| 2316 |
with single_file_tab_csv_tab.expander("Show corrected fixation data", expanded=True):
|
| 2317 |
-
st.dataframe(dffix,
|
| 2318 |
with single_file_tab_csv_tab.expander("Show fixation plots", expanded=True):
|
| 2319 |
|
| 2320 |
plotting_checkboxes_single_single_csv = st.multiselect(
|
|
@@ -2332,9 +2350,9 @@ def main():
|
|
| 2332 |
to_plot_list=plotting_checkboxes_single_single_csv,
|
| 2333 |
algo_choice=st.session_state["algo_choice_single_csv"],
|
| 2334 |
),
|
| 2335 |
-
|
| 2336 |
)
|
| 2337 |
-
st.plotly_chart(plot_y_corr(dffix, st.session_state["algo_choice_single_csv"]),
|
| 2338 |
plotlist = [x for x in dffix.columns if "Unnamed" not in str(x)]
|
| 2339 |
plot_choice = st.multiselect(
|
| 2340 |
"Which measures should be visualized?",
|
|
@@ -2342,7 +2360,7 @@ def main():
|
|
| 2342 |
key="plot_choice_fix_measure",
|
| 2343 |
default=plotlist[-1],
|
| 2344 |
)
|
| 2345 |
-
st.plotly_chart(plot_fix_measure(dffix, plot_choice, "Index"),
|
| 2346 |
|
| 2347 |
if "chars_list" in trial:
|
| 2348 |
analysis_expander_custom = single_file_tab_csv_tab.expander("Show Analysis results", True)
|
|
@@ -2419,7 +2437,7 @@ def main():
|
|
| 2419 |
fixations_tuples, textblock_input_dict, trial=trial, get_char_measures=False
|
| 2420 |
)
|
| 2421 |
|
| 2422 |
-
st.dataframe(word_measures_df,
|
| 2423 |
word_measures_df_csv = convert_df(word_measures_df)
|
| 2424 |
|
| 2425 |
st.download_button(
|
|
@@ -2435,7 +2453,7 @@ def main():
|
|
| 2435 |
st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
|
| 2436 |
|
| 2437 |
if character_measures_df is not None:
|
| 2438 |
-
st.dataframe(character_measures_df,
|
| 2439 |
|
| 2440 |
with own_analysis_tab_custom:
|
| 2441 |
st.markdown(
|
|
@@ -2450,9 +2468,19 @@ def main():
|
|
| 2450 |
save_to_csv=True,
|
| 2451 |
measures_to_calculate = ALL_MEASURES_OWN
|
| 2452 |
)
|
| 2453 |
-
st.dataframe(own_word_measures,
|
| 2454 |
own_word_measures_csv = convert_df(own_word_measures)
|
| 2455 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2456 |
st.download_button(
|
| 2457 |
"⏬ Download word measures data",
|
| 2458 |
own_word_measures_csv,
|
|
@@ -2803,14 +2831,14 @@ def main():
|
|
| 2803 |
with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
|
| 2804 |
item_colnames_markdown = read_item_col_names()
|
| 2805 |
st.markdown(item_colnames_markdown)
|
| 2806 |
-
multi_file_tab.dataframe(st.session_state["trials_df"],
|
| 2807 |
if in_st_nn("subjects_summary_df_multi_asc"):
|
| 2808 |
multi_file_tab.markdown("### Subject level summary statistics")
|
| 2809 |
with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
|
| 2810 |
subject_measure_colnames_markdown = read_subject_meas_col_names()
|
| 2811 |
st.markdown(subject_measure_colnames_markdown)
|
| 2812 |
multi_file_tab.dataframe(
|
| 2813 |
-
st.session_state["subjects_summary_df_multi_asc"],
|
| 2814 |
)
|
| 2815 |
if in_st_nn("trials_summary_df_multi_asc"):
|
| 2816 |
multi_file_tab.markdown("### Trial level summary statistics")
|
|
@@ -2818,14 +2846,14 @@ def main():
|
|
| 2818 |
trials_colnames_markdown = read_trial_col_names()
|
| 2819 |
st.markdown(trials_colnames_markdown)
|
| 2820 |
multi_file_tab.dataframe(
|
| 2821 |
-
st.session_state["trials_summary_df_multi_asc"],
|
| 2822 |
)
|
| 2823 |
|
| 2824 |
multi_file_tab.markdown("### Combined fixations dataframe and fixation level features")
|
| 2825 |
with multi_file_tab.popover("Column name definitions"):
|
| 2826 |
fix_colnames_markdown = get_fix_colnames_markdown()
|
| 2827 |
st.markdown(fix_colnames_markdown)
|
| 2828 |
-
multi_file_tab.dataframe(st.session_state["all_fix_dfs_concat_multi_asc"],
|
| 2829 |
|
| 2830 |
high_fix_count_dfs = []
|
| 2831 |
for algo_choice in st.session_state["algo_choice_multi_asc"]:
|
|
@@ -2855,26 +2883,26 @@ def main():
|
|
| 2855 |
multi_file_tab.markdown(
|
| 2856 |
"### Words that had a large number of fixations assigned to them and may need to be investigated"
|
| 2857 |
)
|
| 2858 |
-
multi_file_tab.dataframe(high_fix_count_dfs_cat,
|
| 2859 |
subs_str = "-".join([s for s in st.session_state["all_trials_by_subj"].keys()])
|
| 2860 |
high_fix_count_dfs_cat.to_csv(RESULTS_FOLDER / f"{subs_str}_words_with_many_fixations.csv")
|
| 2861 |
|
| 2862 |
if "all_correction_stats" in st.session_state:
|
| 2863 |
multi_file_tab.markdown("### Correction statistics")
|
| 2864 |
-
multi_file_tab.dataframe(st.session_state["all_correction_stats"],
|
| 2865 |
multi_file_tab.markdown("### Combined saccades dataframe and saccade level features")
|
| 2866 |
with multi_file_tab.popover("Column name definitions"):
|
| 2867 |
sac_colnames_markdown = get_sac_colnames_markdown()
|
| 2868 |
st.markdown(sac_colnames_markdown)
|
| 2869 |
multi_file_tab.dataframe(
|
| 2870 |
-
st.session_state["all_sacc_dfs_concat_multi_asc"],
|
| 2871 |
)
|
| 2872 |
multi_file_tab.markdown("### Combined characters dataframe")
|
| 2873 |
with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
|
| 2874 |
chars_colnames_markdown = read_chars_col_names()
|
| 2875 |
st.markdown(chars_colnames_markdown)
|
| 2876 |
multi_file_tab.dataframe(
|
| 2877 |
-
st.session_state["all_chars_dfs_concat_multi_asc"],
|
| 2878 |
)
|
| 2879 |
|
| 2880 |
if not st.session_state["all_own_word_measures_concat"].empty:
|
|
@@ -2883,7 +2911,7 @@ def main():
|
|
| 2883 |
word_measure_colnames_markdown = read_word_meas_col_names()
|
| 2884 |
st.markdown(word_measure_colnames_markdown)
|
| 2885 |
multi_file_tab.dataframe(
|
| 2886 |
-
st.session_state["all_own_word_measures_concat"],
|
| 2887 |
)
|
| 2888 |
if not st.session_state["all_sentence_dfs_concat_multi_asc"].empty:
|
| 2889 |
multi_file_tab.markdown("### Combined sentence dataframe and sentence level features")
|
|
@@ -2891,7 +2919,7 @@ def main():
|
|
| 2891 |
sentence_measure_colnames_markdown = read_sent_meas_col_names()
|
| 2892 |
st.markdown(sentence_measure_colnames_markdown)
|
| 2893 |
multi_file_tab.dataframe(
|
| 2894 |
-
st.session_state["all_sentence_dfs_concat_multi_asc"],
|
| 2895 |
)
|
| 2896 |
if "zipfiles_with_results" in st.session_state:
|
| 2897 |
multi_res_col1, multi_res_col2 = multi_file_tab.columns(2)
|
|
@@ -2985,10 +3013,10 @@ def main():
|
|
| 2985 |
to_plot_list=plotting_checkboxes_multi,
|
| 2986 |
font=selecte_plotting_font_multi_asc,
|
| 2987 |
),
|
| 2988 |
-
|
| 2989 |
)
|
| 2990 |
plot_expander_multi.plotly_chart(
|
| 2991 |
-
plot_y_corr(dffix_multi, st.session_state["algo_choice_multi_asc"]),
|
| 2992 |
)
|
| 2993 |
|
| 2994 |
select_and_show_fix_sacc_feature_plots(
|
|
@@ -3060,7 +3088,7 @@ def main():
|
|
| 3060 |
fixations_tuples, textblock_input_dict, trial=trial_multi, get_char_measures=False
|
| 3061 |
)
|
| 3062 |
|
| 3063 |
-
st.dataframe(word_measures_df,
|
| 3064 |
word_measures_df_csv = convert_df(word_measures_df)
|
| 3065 |
|
| 3066 |
st.download_button(
|
|
@@ -3082,7 +3110,7 @@ def main():
|
|
| 3082 |
st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
|
| 3083 |
|
| 3084 |
if character_measures_df is not None:
|
| 3085 |
-
st.dataframe(character_measures_df,
|
| 3086 |
|
| 3087 |
with own_analysis_tab:
|
| 3088 |
st.markdown(
|
|
@@ -3114,7 +3142,7 @@ def main():
|
|
| 3114 |
own_word_measures = reorder_columns(own_word_measures)
|
| 3115 |
if "question_correct" in own_word_measures.columns:
|
| 3116 |
own_word_measures = own_word_measures.drop(columns=["question_correct"])
|
| 3117 |
-
st.dataframe(own_word_measures,
|
| 3118 |
own_word_measures_csv = convert_df(own_word_measures)
|
| 3119 |
st.download_button(
|
| 3120 |
"⏬ Download word measures data",
|
|
@@ -3142,7 +3170,7 @@ def main():
|
|
| 3142 |
)
|
| 3143 |
st.pyplot(own_word_measures_fig)
|
| 3144 |
st.markdown("Sentence measures")
|
| 3145 |
-
st.dataframe(sent_measures_multi,
|
| 3146 |
|
| 3147 |
else:
|
| 3148 |
multi_file_tab.warning("🚨 Stimulus information needed for analysis 🚨")
|
|
@@ -3685,7 +3713,7 @@ def select_and_show_fix_sacc_feature_plots(
|
|
| 3685 |
x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
|
| 3686 |
label_start="Fixation",
|
| 3687 |
),
|
| 3688 |
-
|
| 3689 |
)
|
| 3690 |
sacc_feature_plot_col_single_asc.plotly_chart(
|
| 3691 |
plot_fix_measure(
|
|
@@ -3694,7 +3722,7 @@ def select_and_show_fix_sacc_feature_plots(
|
|
| 3694 |
x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
|
| 3695 |
label_start="Saccade",
|
| 3696 |
),
|
| 3697 |
-
|
| 3698 |
)
|
| 3699 |
|
| 3700 |
|
|
|
|
| 1483 |
events_df_expander_single.markdown("### Fixations")
|
| 1484 |
events_df_expander_single.dataframe(
|
| 1485 |
events_df[events_df["msg"] == "FIX"].dropna(how="all", axis=1).copy(),
|
| 1486 |
+
width='stretch',
|
| 1487 |
height=200,
|
| 1488 |
)
|
| 1489 |
events_df_expander_single.markdown("### Saccades")
|
| 1490 |
events_df_expander_single.dataframe(
|
| 1491 |
events_df[events_df["msg"] == "SAC"].dropna(how="all", axis=1).copy(),
|
| 1492 |
+
width='stretch',
|
| 1493 |
height=200,
|
| 1494 |
)
|
| 1495 |
if not events_df[events_df["msg"] == "BLINK"].empty:
|
|
|
|
| 1498 |
blinksdf = blinksdf.drop(
|
| 1499 |
columns=[c for c in blinksdf.columns if c in ["blink", "blink_after", "blink_before"]]
|
| 1500 |
)
|
| 1501 |
+
events_df_expander_single.dataframe(blinksdf, width='stretch', height=200)
|
| 1502 |
show_cleaning_options(single_file_tab_asc_tab, events_df[events_df["msg"] == "FIX"], "single_asc")
|
| 1503 |
|
| 1504 |
if "dffix_cleaned_single_asc" in st.session_state and "trial_single_asc" in st.session_state:
|
|
|
|
| 1589 |
chars_colnames_markdown = read_chars_col_names()
|
| 1590 |
st.markdown(chars_colnames_markdown)
|
| 1591 |
df_stim_expander_single.dataframe(
|
| 1592 |
+
pd.DataFrame(trial["chars_list"]), width='stretch', height=200
|
| 1593 |
)
|
| 1594 |
if "words_list" in trial:
|
| 1595 |
df_stim_expander_single.markdown("### Words dataframe")
|
| 1596 |
df_stim_expander_single.dataframe(
|
| 1597 |
+
pd.DataFrame(trial["words_list"]), width='stretch', height=200
|
| 1598 |
)
|
| 1599 |
else:
|
| 1600 |
st.warning("🚨 No stimulus information in session state")
|
|
|
|
| 1682 |
font=selected_plotting_font_single_asc,
|
| 1683 |
lines_in_plot=lines_in_plot_single_asc,
|
| 1684 |
),
|
| 1685 |
+
width='content',
|
| 1686 |
)
|
| 1687 |
plot_expander_single.markdown("#### Saccades")
|
| 1688 |
|
|
|
|
| 1712 |
font=selected_plotting_font_single_asc,
|
| 1713 |
lines_in_plot=lines_in_plot_single_asc,
|
| 1714 |
),
|
| 1715 |
+
width='content',
|
| 1716 |
)
|
| 1717 |
plot_expander_single.markdown("#### Y-coordinate correction due to line-assignment")
|
| 1718 |
plot_expander_single.plotly_chart(
|
| 1719 |
+
plot_y_corr(dffix, st.session_state["algo_choice_single_asc"]), width='stretch'
|
| 1720 |
)
|
| 1721 |
if "average_y_corrections" in trial:
|
| 1722 |
plot_expander_single.markdown(
|
|
|
|
| 1822 |
with open("word_measures.md", "r") as f:
|
| 1823 |
word_measure_colnames_markdown = "\n".join(f.readlines())
|
| 1824 |
st.markdown(word_measure_colnames_markdown)
|
| 1825 |
+
st.dataframe(own_word_measures, width='stretch', hide_index=True, height=200)
|
| 1826 |
own_word_measures_csv = convert_df(own_word_measures)
|
| 1827 |
subject = st.session_state["trial_single_asc"]["subject"]
|
| 1828 |
trial_id = st.session_state["trial_single_asc"]["trial_id"]
|
|
|
|
| 1868 |
st.markdown(sentence_measure_colnames_markdown)
|
| 1869 |
st.dataframe(
|
| 1870 |
st.session_state["own_sent_measures_single_asc"],
|
| 1871 |
+
width='stretch',
|
| 1872 |
hide_index=True,
|
| 1873 |
height=200,
|
| 1874 |
)
|
|
|
|
| 1951 |
get_char_measures=False,
|
| 1952 |
)
|
| 1953 |
|
| 1954 |
+
st.dataframe(word_measures_df, width='stretch', hide_index=True, height=200)
|
| 1955 |
word_measures_df_csv = convert_df(word_measures_df)
|
| 1956 |
|
| 1957 |
st.download_button(
|
|
|
|
| 1975 |
|
| 1976 |
if character_measures_df is not None:
|
| 1977 |
st.dataframe(
|
| 1978 |
+
character_measures_df, width='stretch', hide_index=True, height=200
|
| 1979 |
)
|
| 1980 |
else:
|
| 1981 |
single_file_tab_asc_tab.warning("🚨 Stimulus information needed for analysis 🚨")
|
|
|
|
| 2006 |
accept_multiple_files=False,
|
| 2007 |
key="single_csv_file_stim_uploaded",
|
| 2008 |
type={"json", "csv", "txt", "dat","jpeg","png"},
|
| 2009 |
+
help="Drag and drop or select a single .json, .csv, .txt, .dat, jpeg or png file that you wish to process as the stimulus file for the uploaded fixation data. If an image is uploaded OCR will be attempted to extract the character bounding boxes, note the filename MUST match the trial name for images(so for trial t1 the image MUST be called t1.png). This can be left blank if you chose to use the examples.",
|
| 2010 |
)
|
| 2011 |
|
| 2012 |
st.checkbox(
|
|
|
|
| 2043 |
]:
|
| 2044 |
if k in st.session_state:
|
| 2045 |
del st.session_state[k]
|
| 2046 |
+
def stringify_column_keep_decimal_if_needed(series):
|
| 2047 |
+
try:
|
| 2048 |
+
# Try to convert all values to float, then int
|
| 2049 |
+
floats = series.astype(float)
|
| 2050 |
+
ints = floats.astype(int)
|
| 2051 |
+
# If all values are equal as int and float, drop decimals
|
| 2052 |
+
if (floats == ints).all():
|
| 2053 |
+
return ints.astype(str)
|
| 2054 |
+
else:
|
| 2055 |
+
return floats.astype(str)
|
| 2056 |
+
except Exception:
|
| 2057 |
+
# If conversion fails, fallback to original as string
|
| 2058 |
+
return series.astype(str)
|
| 2059 |
if use_example_or_uploaded_file_choice != "Example Files":
|
| 2060 |
st.session_state["dffix_single_csv"] = load_csv_delim_agnostic(single_csv_file)
|
| 2061 |
st.session_state["dffix_col_mappings_guess_single_csv"] = find_col_name_suggestions(
|
|
|
|
| 2075 |
elif any([".png" in single_csv_stim_file.name, ".jpeg" in single_csv_stim_file.name]):
|
| 2076 |
stimdf_single_csv = recognize_text(single_csv_stim_file)
|
| 2077 |
stimdf_single_csv.to_csv(RESULTS_FOLDER / f"{single_csv_stim_file.name}_stimdf_single_from_OCR.csv")
|
| 2078 |
+
if 'trial_id' in stimdf_single_csv.columns:
|
| 2079 |
+
stimdf_single_csv['trial_id'] = stimdf_single_csv['trial_id'].astype(str)
|
| 2080 |
st.session_state["stimdf_single_csv"] = stimdf_single_csv
|
| 2081 |
colnames_stim = st.session_state["stimdf_single_csv"].columns
|
| 2082 |
else:
|
|
|
|
| 2085 |
st.session_state["chars_df_col_mappings_guess_single_csv"] = find_col_name_suggestions(
|
| 2086 |
list(colnames_stim), COLNAMES_CUSTOM_CSV_STIM
|
| 2087 |
)
|
| 2088 |
+
for colname in COLNAME_CANDIDATES_CUSTOM_CSV_FIX['trial_id_col_name_fix']:
|
| 2089 |
+
if colname in st.session_state["dffix_single_csv"].columns:
|
| 2090 |
+
st.session_state["dffix_single_csv"][colname] = stringify_column_keep_decimal_if_needed(st.session_state["dffix_single_csv"][colname])
|
| 2091 |
+
|
| 2092 |
else:
|
| 2093 |
with open(EXAMPLE_CUSTOM_JSON_FILE, "r") as json_file:
|
| 2094 |
json_string = json_file.read()
|
|
|
|
| 2104 |
if in_st_nn("dffix_single_csv"):
|
| 2105 |
st.dataframe(
|
| 2106 |
st.session_state["dffix_single_csv"],
|
| 2107 |
+
width='stretch',
|
| 2108 |
hide_index=True,
|
| 2109 |
on_select="ignore",
|
| 2110 |
height=200,
|
|
|
|
| 2115 |
else:
|
| 2116 |
st.dataframe(
|
| 2117 |
st.session_state["stimdf_single_csv"],
|
| 2118 |
+
width='stretch',
|
| 2119 |
hide_index=True,
|
| 2120 |
on_select="ignore",
|
| 2121 |
height=200,
|
|
|
|
| 2332 |
help="This downloads the corrected fixations dataframe as a .csv file with the filename containing the trial id.",
|
| 2333 |
)
|
| 2334 |
with single_file_tab_csv_tab.expander("Show corrected fixation data", expanded=True):
|
| 2335 |
+
st.dataframe(dffix, width='stretch', hide_index=True, height=200)
|
| 2336 |
with single_file_tab_csv_tab.expander("Show fixation plots", expanded=True):
|
| 2337 |
|
| 2338 |
plotting_checkboxes_single_single_csv = st.multiselect(
|
|
|
|
| 2350 |
to_plot_list=plotting_checkboxes_single_single_csv,
|
| 2351 |
algo_choice=st.session_state["algo_choice_single_csv"],
|
| 2352 |
),
|
| 2353 |
+
width='stretch',
|
| 2354 |
)
|
| 2355 |
+
st.plotly_chart(plot_y_corr(dffix, st.session_state["algo_choice_single_csv"]), width='stretch')
|
| 2356 |
plotlist = [x for x in dffix.columns if "Unnamed" not in str(x)]
|
| 2357 |
plot_choice = st.multiselect(
|
| 2358 |
"Which measures should be visualized?",
|
|
|
|
| 2360 |
key="plot_choice_fix_measure",
|
| 2361 |
default=plotlist[-1],
|
| 2362 |
)
|
| 2363 |
+
st.plotly_chart(plot_fix_measure(dffix, plot_choice, "Index"), width='stretch')
|
| 2364 |
|
| 2365 |
if "chars_list" in trial:
|
| 2366 |
analysis_expander_custom = single_file_tab_csv_tab.expander("Show Analysis results", True)
|
|
|
|
| 2437 |
fixations_tuples, textblock_input_dict, trial=trial, get_char_measures=False
|
| 2438 |
)
|
| 2439 |
|
| 2440 |
+
st.dataframe(word_measures_df, width='stretch', hide_index=True, height=200)
|
| 2441 |
word_measures_df_csv = convert_df(word_measures_df)
|
| 2442 |
|
| 2443 |
st.download_button(
|
|
|
|
| 2453 |
st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
|
| 2454 |
|
| 2455 |
if character_measures_df is not None:
|
| 2456 |
+
st.dataframe(character_measures_df, width='stretch', hide_index=True, height=200)
|
| 2457 |
|
| 2458 |
with own_analysis_tab_custom:
|
| 2459 |
st.markdown(
|
|
|
|
| 2468 |
save_to_csv=True,
|
| 2469 |
measures_to_calculate = ALL_MEASURES_OWN
|
| 2470 |
)
|
| 2471 |
+
st.dataframe(own_word_measures, width='stretch', hide_index=True, height=200)
|
| 2472 |
own_word_measures_csv = convert_df(own_word_measures)
|
| 2473 |
|
| 2474 |
+
sent_measures_single_csv = compute_sentence_measures(
|
| 2475 |
+
dffix,
|
| 2476 |
+
pd.DataFrame(trial["chars_df"]),
|
| 2477 |
+
st.session_state["algo_choice_custom_eyekit"],
|
| 2478 |
+
ALL_SENT_MEASURES,
|
| 2479 |
+
save_to_csv=True,
|
| 2480 |
+
)
|
| 2481 |
+
st.session_state["own_sent_measures_single_csv"] = sent_measures_single_csv
|
| 2482 |
+
st.dataframe(sent_measures_single_csv, width='stretch', hide_index=True, height=200)
|
| 2483 |
+
|
| 2484 |
st.download_button(
|
| 2485 |
"⏬ Download word measures data",
|
| 2486 |
own_word_measures_csv,
|
|
|
|
| 2831 |
with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
|
| 2832 |
item_colnames_markdown = read_item_col_names()
|
| 2833 |
st.markdown(item_colnames_markdown)
|
| 2834 |
+
multi_file_tab.dataframe(st.session_state["trials_df"], width='stretch', height=200)
|
| 2835 |
if in_st_nn("subjects_summary_df_multi_asc"):
|
| 2836 |
multi_file_tab.markdown("### Subject level summary statistics")
|
| 2837 |
with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
|
| 2838 |
subject_measure_colnames_markdown = read_subject_meas_col_names()
|
| 2839 |
st.markdown(subject_measure_colnames_markdown)
|
| 2840 |
multi_file_tab.dataframe(
|
| 2841 |
+
st.session_state["subjects_summary_df_multi_asc"], width='stretch', height=200
|
| 2842 |
)
|
| 2843 |
if in_st_nn("trials_summary_df_multi_asc"):
|
| 2844 |
multi_file_tab.markdown("### Trial level summary statistics")
|
|
|
|
| 2846 |
trials_colnames_markdown = read_trial_col_names()
|
| 2847 |
st.markdown(trials_colnames_markdown)
|
| 2848 |
multi_file_tab.dataframe(
|
| 2849 |
+
st.session_state["trials_summary_df_multi_asc"], width='stretch', height=200
|
| 2850 |
)
|
| 2851 |
|
| 2852 |
multi_file_tab.markdown("### Combined fixations dataframe and fixation level features")
|
| 2853 |
with multi_file_tab.popover("Column name definitions"):
|
| 2854 |
fix_colnames_markdown = get_fix_colnames_markdown()
|
| 2855 |
st.markdown(fix_colnames_markdown)
|
| 2856 |
+
multi_file_tab.dataframe(st.session_state["all_fix_dfs_concat_multi_asc"], width='stretch', height=200)
|
| 2857 |
|
| 2858 |
high_fix_count_dfs = []
|
| 2859 |
for algo_choice in st.session_state["algo_choice_multi_asc"]:
|
|
|
|
| 2883 |
multi_file_tab.markdown(
|
| 2884 |
"### Words that had a large number of fixations assigned to them and may need to be investigated"
|
| 2885 |
)
|
| 2886 |
+
multi_file_tab.dataframe(high_fix_count_dfs_cat, width='stretch', height=200)
|
| 2887 |
subs_str = "-".join([s for s in st.session_state["all_trials_by_subj"].keys()])
|
| 2888 |
high_fix_count_dfs_cat.to_csv(RESULTS_FOLDER / f"{subs_str}_words_with_many_fixations.csv")
|
| 2889 |
|
| 2890 |
if "all_correction_stats" in st.session_state:
|
| 2891 |
multi_file_tab.markdown("### Correction statistics")
|
| 2892 |
+
multi_file_tab.dataframe(st.session_state["all_correction_stats"], width='stretch', height=200)
|
| 2893 |
multi_file_tab.markdown("### Combined saccades dataframe and saccade level features")
|
| 2894 |
with multi_file_tab.popover("Column name definitions"):
|
| 2895 |
sac_colnames_markdown = get_sac_colnames_markdown()
|
| 2896 |
st.markdown(sac_colnames_markdown)
|
| 2897 |
multi_file_tab.dataframe(
|
| 2898 |
+
st.session_state["all_sacc_dfs_concat_multi_asc"], width='stretch', height=200
|
| 2899 |
)
|
| 2900 |
multi_file_tab.markdown("### Combined characters dataframe")
|
| 2901 |
with multi_file_tab.popover("Column names definitions", help="Show column names and their definitions."):
|
| 2902 |
chars_colnames_markdown = read_chars_col_names()
|
| 2903 |
st.markdown(chars_colnames_markdown)
|
| 2904 |
multi_file_tab.dataframe(
|
| 2905 |
+
st.session_state["all_chars_dfs_concat_multi_asc"], width='stretch', height=200
|
| 2906 |
)
|
| 2907 |
|
| 2908 |
if not st.session_state["all_own_word_measures_concat"].empty:
|
|
|
|
| 2911 |
word_measure_colnames_markdown = read_word_meas_col_names()
|
| 2912 |
st.markdown(word_measure_colnames_markdown)
|
| 2913 |
multi_file_tab.dataframe(
|
| 2914 |
+
st.session_state["all_own_word_measures_concat"], width='stretch', height=200
|
| 2915 |
)
|
| 2916 |
if not st.session_state["all_sentence_dfs_concat_multi_asc"].empty:
|
| 2917 |
multi_file_tab.markdown("### Combined sentence dataframe and sentence level features")
|
|
|
|
| 2919 |
sentence_measure_colnames_markdown = read_sent_meas_col_names()
|
| 2920 |
st.markdown(sentence_measure_colnames_markdown)
|
| 2921 |
multi_file_tab.dataframe(
|
| 2922 |
+
st.session_state["all_sentence_dfs_concat_multi_asc"], width='stretch', height=200
|
| 2923 |
)
|
| 2924 |
if "zipfiles_with_results" in st.session_state:
|
| 2925 |
multi_res_col1, multi_res_col2 = multi_file_tab.columns(2)
|
|
|
|
| 3013 |
to_plot_list=plotting_checkboxes_multi,
|
| 3014 |
font=selecte_plotting_font_multi_asc,
|
| 3015 |
),
|
| 3016 |
+
width='stretch',
|
| 3017 |
)
|
| 3018 |
plot_expander_multi.plotly_chart(
|
| 3019 |
+
plot_y_corr(dffix_multi, st.session_state["algo_choice_multi_asc"]), width='stretch'
|
| 3020 |
)
|
| 3021 |
|
| 3022 |
select_and_show_fix_sacc_feature_plots(
|
|
|
|
| 3088 |
fixations_tuples, textblock_input_dict, trial=trial_multi, get_char_measures=False
|
| 3089 |
)
|
| 3090 |
|
| 3091 |
+
st.dataframe(word_measures_df, width='stretch', hide_index=True, height=200)
|
| 3092 |
word_measures_df_csv = convert_df(word_measures_df)
|
| 3093 |
|
| 3094 |
st.download_button(
|
|
|
|
| 3110 |
st.image(ekm.plot_with_measure(fixations_tuples, textblock_input_dict, screen_size, measure_words))
|
| 3111 |
|
| 3112 |
if character_measures_df is not None:
|
| 3113 |
+
st.dataframe(character_measures_df, width='stretch', hide_index=True, height=200)
|
| 3114 |
|
| 3115 |
with own_analysis_tab:
|
| 3116 |
st.markdown(
|
|
|
|
| 3142 |
own_word_measures = reorder_columns(own_word_measures)
|
| 3143 |
if "question_correct" in own_word_measures.columns:
|
| 3144 |
own_word_measures = own_word_measures.drop(columns=["question_correct"])
|
| 3145 |
+
st.dataframe(own_word_measures, width='stretch', hide_index=True, height=200)
|
| 3146 |
own_word_measures_csv = convert_df(own_word_measures)
|
| 3147 |
st.download_button(
|
| 3148 |
"⏬ Download word measures data",
|
|
|
|
| 3170 |
)
|
| 3171 |
st.pyplot(own_word_measures_fig)
|
| 3172 |
st.markdown("Sentence measures")
|
| 3173 |
+
st.dataframe(sent_measures_multi, width='stretch', hide_index=True, height=200)
|
| 3174 |
|
| 3175 |
else:
|
| 3176 |
multi_file_tab.warning("🚨 Stimulus information needed for analysis 🚨")
|
|
|
|
| 3713 |
x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
|
| 3714 |
label_start="Fixation",
|
| 3715 |
),
|
| 3716 |
+
width='stretch',
|
| 3717 |
)
|
| 3718 |
sacc_feature_plot_col_single_asc.plotly_chart(
|
| 3719 |
plot_fix_measure(
|
|
|
|
| 3722 |
x_axis_selection=st.session_state[plot_choice_fix_sac_feature_x_axis_name],
|
| 3723 |
label_start="Saccade",
|
| 3724 |
),
|
| 3725 |
+
width='stretch',
|
| 3726 |
)
|
| 3727 |
|
| 3728 |
|
multi_proc_funcs.py
CHANGED
|
@@ -267,7 +267,7 @@ def set_up_models(dist_models_folder):
|
|
| 267 |
out_dict = {}
|
| 268 |
dist_models_with_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_True*.ckpt"))
|
| 269 |
dist_models_without_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_False*.ckpt"))
|
| 270 |
-
|
| 271 |
|
| 272 |
models_without_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_without_norm]
|
| 273 |
models_with_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_with_norm]
|
|
@@ -277,7 +277,10 @@ def set_up_models(dist_models_folder):
|
|
| 277 |
|
| 278 |
models_without_norm_df = [x[0] for x in models_without_norm_df if x[0] is not None]
|
| 279 |
models_with_norm_df = [x[0] for x in models_with_norm_df if x[0] is not None]
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
| 281 |
ensemble_model_avg = models.EnsembleModel(
|
| 282 |
models_without_norm_df, models_with_norm_df, learning_rate=0.0058, use_simple_average=True
|
| 283 |
)
|
|
@@ -286,9 +289,11 @@ def set_up_models(dist_models_folder):
|
|
| 286 |
out_dict["model_cfg_without_norm_df"] = model_cfg_without_norm_df
|
| 287 |
out_dict["model_cfg_with_norm_df"] = model_cfg_with_norm_df
|
| 288 |
|
| 289 |
-
single_DIST_model, single_DIST_model_cfg = find_and_load_model(model_date=
|
| 290 |
out_dict["single_DIST_model"] = single_DIST_model
|
| 291 |
out_dict["single_DIST_model_cfg"] = single_DIST_model_cfg
|
|
|
|
|
|
|
| 292 |
return out_dict
|
| 293 |
|
| 294 |
|
|
|
|
| 267 |
out_dict = {}
|
| 268 |
dist_models_with_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_True*.ckpt"))
|
| 269 |
dist_models_without_norm = list(dist_models_folder.glob("*normalize_by_line_height_and_width_False*.ckpt"))
|
| 270 |
+
dist_model_date_with_norm = dist_models_with_norm[0].stem.split("_")[1]
|
| 271 |
|
| 272 |
models_without_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_without_norm]
|
| 273 |
models_with_norm_df = [find_and_load_model(m_file.stem.split("_")[1]) for m_file in dist_models_with_norm]
|
|
|
|
| 277 |
|
| 278 |
models_without_norm_df = [x[0] for x in models_without_norm_df if x[0] is not None]
|
| 279 |
models_with_norm_df = [x[0] for x in models_with_norm_df if x[0] is not None]
|
| 280 |
+
if len(models_without_norm_df) == 0:
|
| 281 |
+
ic("No models without normalization found")
|
| 282 |
+
if len(models_with_norm_df) == 0:
|
| 283 |
+
ic("No models with normalization found")
|
| 284 |
ensemble_model_avg = models.EnsembleModel(
|
| 285 |
models_without_norm_df, models_with_norm_df, learning_rate=0.0058, use_simple_average=True
|
| 286 |
)
|
|
|
|
| 289 |
out_dict["model_cfg_without_norm_df"] = model_cfg_without_norm_df
|
| 290 |
out_dict["model_cfg_with_norm_df"] = model_cfg_with_norm_df
|
| 291 |
|
| 292 |
+
single_DIST_model, single_DIST_model_cfg = find_and_load_model(model_date=dist_model_date_with_norm)
|
| 293 |
out_dict["single_DIST_model"] = single_DIST_model
|
| 294 |
out_dict["single_DIST_model_cfg"] = single_DIST_model_cfg
|
| 295 |
+
if single_DIST_model is None:
|
| 296 |
+
ic(f"Failed to load single DIST model for {dist_model_date_with_norm}")
|
| 297 |
return out_dict
|
| 298 |
|
| 299 |
|
popEye_funcs.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
Mostly adapted from: https://github.com/sascha2schroeder/popEye
|
| 3 |
"""
|
| 4 |
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
import pandas as pd
|
| 7 |
from icecream import ic
|
|
@@ -886,11 +887,13 @@ def combine_words(fix, wordfirst, wordtmp, algo_choice, measures_to_calculate):
|
|
| 886 |
|
| 887 |
|
| 888 |
def compute_sentence_measures(fix, stimmat, algo_choice, measures_to_calc, save_to_csv=False):
|
| 889 |
-
sentitem = stimmat.drop_duplicates(
|
| 890 |
subset="in_sentence_number", keep="first"
|
| 891 |
) # TODO check why there are rows with sent number None
|
| 892 |
fixin = fix.copy().reset_index(drop=True)
|
| 893 |
-
|
|
|
|
|
|
|
| 894 |
fixin["on_sentence_num2"] = fixin[f"on_sentence_num_{algo_choice}"].copy()
|
| 895 |
|
| 896 |
# Recompute sentence number (two fixation exception rule)
|
|
@@ -1146,6 +1149,7 @@ def compute_sentence_measures(fix, stimmat, algo_choice, measures_to_calc, save_
|
|
| 1146 |
item.rename({"in_sentence_number": f"on_sentence_num_{algo_choice}"}, axis=1),
|
| 1147 |
on=f"on_sentence_num_{algo_choice}",
|
| 1148 |
how="left",
|
|
|
|
| 1149 |
)
|
| 1150 |
sent[f"skip_{algo_choice}"] = 0
|
| 1151 |
sent.loc[pd.isna(sent[f"nrun_{algo_choice}"]), f"skip_{algo_choice}"] = 1
|
|
|
|
| 2 |
Mostly adapted from: https://github.com/sascha2schroeder/popEye
|
| 3 |
"""
|
| 4 |
|
| 5 |
+
from copy import deepcopy
|
| 6 |
import numpy as np
|
| 7 |
import pandas as pd
|
| 8 |
from icecream import ic
|
|
|
|
| 887 |
|
| 888 |
|
| 889 |
def compute_sentence_measures(fix, stimmat, algo_choice, measures_to_calc, save_to_csv=False):
|
| 890 |
+
sentitem = deepcopy(stimmat).drop_duplicates(
|
| 891 |
subset="in_sentence_number", keep="first"
|
| 892 |
) # TODO check why there are rows with sent number None
|
| 893 |
fixin = fix.copy().reset_index(drop=True)
|
| 894 |
+
unique_trial_ids = fixin['trial_id'].unique()
|
| 895 |
+
if len(unique_trial_ids) == 1:
|
| 896 |
+
sentitem.loc[:,'trial_id'] = unique_trial_ids[0]
|
| 897 |
fixin["on_sentence_num2"] = fixin[f"on_sentence_num_{algo_choice}"].copy()
|
| 898 |
|
| 899 |
# Recompute sentence number (two fixation exception rule)
|
|
|
|
| 1149 |
item.rename({"in_sentence_number": f"on_sentence_num_{algo_choice}"}, axis=1),
|
| 1150 |
on=f"on_sentence_num_{algo_choice}",
|
| 1151 |
how="left",
|
| 1152 |
+
suffixes=['','from_item']
|
| 1153 |
)
|
| 1154 |
sent[f"skip_{algo_choice}"] = 0
|
| 1155 |
sent.loc[pd.isna(sent[f"nrun_{algo_choice}"]), f"skip_{algo_choice}"] = 1
|
process_asc_files_in_multi_p.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
| 2 |
import json
|
|
|
|
| 3 |
from sys import platform as _platform
|
| 4 |
from functools import partial
|
| 5 |
import multiprocessing
|
|
@@ -10,6 +11,17 @@ import sys
|
|
| 10 |
import pandas as pd
|
| 11 |
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
def get_cpu_count():
|
| 14 |
if os.sys.platform in ("linux", "linux2", "darwin"):
|
| 15 |
return os.cpu_count()
|
|
@@ -103,6 +115,7 @@ def main():
|
|
| 103 |
models_dict,
|
| 104 |
fix_cols_to_add_multi_asc,
|
| 105 |
) = json.loads(input_data)
|
|
|
|
| 106 |
if (
|
| 107 |
"DIST" in algo_choice
|
| 108 |
or "Wisdom_of_Crowds_with_DIST" in algo_choice
|
|
@@ -133,16 +146,21 @@ def main():
|
|
| 133 |
models_dict,
|
| 134 |
fix_cols_to_add_multi_asc,
|
| 135 |
)
|
|
|
|
| 136 |
out2 = []
|
| 137 |
for dffix, trial in out:
|
| 138 |
dffix = dffix.to_dict("records")
|
| 139 |
trial = make_json_compatible(trial)
|
| 140 |
out2.append((dffix, trial))
|
| 141 |
json_data_out = json.dumps(out2)
|
|
|
|
| 142 |
sys.stdout.flush()
|
| 143 |
print(json_data_out)
|
| 144 |
except Exception as e:
|
|
|
|
| 145 |
print(json.dumps({"error": str(e)}))
|
|
|
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
if __name__ == "__main__":
|
|
|
|
| 1 |
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
| 2 |
import json
|
| 3 |
+
import logging
|
| 4 |
from sys import platform as _platform
|
| 5 |
from functools import partial
|
| 6 |
import multiprocessing
|
|
|
|
| 11 |
import pandas as pd
|
| 12 |
|
| 13 |
|
| 14 |
+
logger = logging.getLogger('multiproc')
|
| 15 |
+
logger.propagate = False
|
| 16 |
+
logger.setLevel("DEBUG")
|
| 17 |
+
ch = logging.FileHandler("MULTI_LOG.log", "a")
|
| 18 |
+
ch.setFormatter(
|
| 19 |
+
logging.Formatter(
|
| 20 |
+
"%(asctime)s-{%(filename)s:%(lineno)d}-%(levelname)s >>> %(message)s",
|
| 21 |
+
"%m-%d %H:%M:%S",
|
| 22 |
+
)
|
| 23 |
+
)
|
| 24 |
+
logger.addHandler(ch)
|
| 25 |
def get_cpu_count():
|
| 26 |
if os.sys.platform in ("linux", "linux2", "darwin"):
|
| 27 |
return os.cpu_count()
|
|
|
|
| 115 |
models_dict,
|
| 116 |
fix_cols_to_add_multi_asc,
|
| 117 |
) = json.loads(input_data)
|
| 118 |
+
logger.info("Finished loading from json")
|
| 119 |
if (
|
| 120 |
"DIST" in algo_choice
|
| 121 |
or "Wisdom_of_Crowds_with_DIST" in algo_choice
|
|
|
|
| 146 |
models_dict,
|
| 147 |
fix_cols_to_add_multi_asc,
|
| 148 |
)
|
| 149 |
+
logger.info("Finished process_asc_files_in_multi_proc")
|
| 150 |
out2 = []
|
| 151 |
for dffix, trial in out:
|
| 152 |
dffix = dffix.to_dict("records")
|
| 153 |
trial = make_json_compatible(trial)
|
| 154 |
out2.append((dffix, trial))
|
| 155 |
json_data_out = json.dumps(out2)
|
| 156 |
+
logger.info("Finished appending")
|
| 157 |
sys.stdout.flush()
|
| 158 |
print(json_data_out)
|
| 159 |
except Exception as e:
|
| 160 |
+
logger.warning(e)
|
| 161 |
print(json.dumps({"error": str(e)}))
|
| 162 |
+
with open("MULTI_ERROR.log",'w') as f:
|
| 163 |
+
f.write(e)
|
| 164 |
|
| 165 |
|
| 166 |
if __name__ == "__main__":
|
utils.py
CHANGED
|
@@ -198,6 +198,7 @@ def calc_xdiff_ydiff(line_xcoords_no_pad, line_ycoords_no_pad, line_heights, all
|
|
| 198 |
|
| 199 |
|
| 200 |
def add_words(chars_list):
|
|
|
|
| 201 |
chars_list_reconstructed = []
|
| 202 |
words_list = []
|
| 203 |
sentence_list = []
|
|
@@ -210,6 +211,21 @@ def add_words(chars_list):
|
|
| 210 |
on_line_num = -1
|
| 211 |
line_change_on_next_char = False
|
| 212 |
num_chars = len(chars_list)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
for idx, char_dict in enumerate(chars_list):
|
| 214 |
# check if line change will happen after current char
|
| 215 |
on_line_num = char_dict["assigned_line"]
|
|
@@ -222,23 +238,18 @@ def add_words(chars_list):
|
|
| 222 |
word_xmin = chars_list_reconstructed[word_start_idx]["char_xmin"]
|
| 223 |
if chars_list_reconstructed[-1]["char"] == " " and len(chars_list_reconstructed) != 1:
|
| 224 |
word_xmax = chars_list_reconstructed[-2]["char_xmax"]
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
for idx in range(word_start_idx, len(chars_list_reconstructed) - 1)
|
| 230 |
-
]
|
| 231 |
-
)
|
| 232 |
elif len(chars_list_reconstructed) == 1:
|
| 233 |
word_xmax = chars_list_reconstructed[-1]["char_xmax"]
|
| 234 |
word = " "
|
| 235 |
else:
|
| 236 |
-
word = "".join(
|
| 237 |
-
[
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
]
|
| 241 |
-
)
|
| 242 |
word_xmax = chars_list_reconstructed[-1]["char_xmax"]
|
| 243 |
word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
|
| 244 |
word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
|
|
@@ -269,10 +280,9 @@ def add_words(chars_list):
|
|
| 269 |
char_dict["in_word_number"] = len(words_list) - 1
|
| 270 |
char_dict["in_word"] = word
|
| 271 |
char_dict["num_letters_from_start_of_word"] = cidx
|
| 272 |
-
|
| 273 |
word_start_idx = idx + 1
|
| 274 |
-
|
| 275 |
-
if chars_list_reconstructed
|
| 276 |
if idx != sentence_start_idx:
|
| 277 |
chars_df_temp = pd.DataFrame(chars_list_reconstructed[sentence_start_idx:])
|
| 278 |
line_texts = []
|
|
@@ -301,7 +311,6 @@ def add_words(chars_list):
|
|
| 301 |
):
|
| 302 |
char_dict["in_word_number"] = chars_list_reconstructed[cidx + 1]["in_word_number"]
|
| 303 |
char_dict["in_word"] = chars_list_reconstructed[cidx + 1]["in_word"]
|
| 304 |
-
|
| 305 |
last_letter_in_word = words_list[-1]["word"][-1]
|
| 306 |
last_letter_in_chars_list_reconstructed = char_dict["char"]
|
| 307 |
if last_letter_in_word != last_letter_in_chars_list_reconstructed:
|
|
@@ -316,7 +325,6 @@ def add_words(chars_list):
|
|
| 316 |
word_ymax=words_list[-1]["word_ymax"],
|
| 317 |
assigned_line=assigned_line,
|
| 318 |
)
|
| 319 |
-
|
| 320 |
word_x_center = round(
|
| 321 |
(words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"], ndigits=2
|
| 322 |
)
|
|
@@ -347,7 +355,24 @@ def add_words(chars_list):
|
|
| 347 |
chars_list_reconstructed[-1]["in_sentence"] = sentence_list[-1]["sentence_text"]
|
| 348 |
else:
|
| 349 |
ic(f"Warning Sentence list empty: {sentence_list}")
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
return words_list, chars_list_reconstructed
|
| 352 |
|
| 353 |
|
|
@@ -1520,6 +1545,7 @@ def download_example_ascs(EXAMPLES_FOLDER, EXAMPLES_ASC_ZIP_FILENAME, OSF_DOWNLA
|
|
| 1520 |
ic(f"Extracting {EXAMPLES_ASC_ZIP_FILENAME} failed")
|
| 1521 |
|
| 1522 |
EXAMPLE_ASC_FILES = [x for x in EXAMPLES_FOLDER_PATH.glob("*.asc")]
|
|
|
|
| 1523 |
else:
|
| 1524 |
EXAMPLE_ASC_FILES = []
|
| 1525 |
return EXAMPLE_ASC_FILES
|
|
|
|
| 198 |
|
| 199 |
|
| 200 |
def add_words(chars_list):
|
| 201 |
+
|
| 202 |
chars_list_reconstructed = []
|
| 203 |
words_list = []
|
| 204 |
sentence_list = []
|
|
|
|
| 211 |
on_line_num = -1
|
| 212 |
line_change_on_next_char = False
|
| 213 |
num_chars = len(chars_list)
|
| 214 |
+
# Helper: is this char a sentence boundary?
|
| 215 |
+
def is_sentence_boundary(idx):
|
| 216 |
+
c = chars_list_reconstructed[idx]["char"]
|
| 217 |
+
if c not in ".!?":
|
| 218 |
+
return False
|
| 219 |
+
# Check if previous and next chars are digits (for numbers like 41.2)
|
| 220 |
+
prev_c = chars_list_reconstructed[idx-1]["char"] if idx > 0 else None
|
| 221 |
+
if prev_c == chars_list[idx-1]["char"]:
|
| 222 |
+
next_c = chars_list[idx+1]["char"] if idx+1 < len(chars_list) else None
|
| 223 |
+
if prev_c and prev_c.isdigit() and next_c and next_c.isdigit():
|
| 224 |
+
return False
|
| 225 |
+
# Optionally: only treat as boundary if followed by space or end
|
| 226 |
+
if next_c and next_c not in [None, " ", "\n"]:
|
| 227 |
+
return False
|
| 228 |
+
return True
|
| 229 |
for idx, char_dict in enumerate(chars_list):
|
| 230 |
# check if line change will happen after current char
|
| 231 |
on_line_num = char_dict["assigned_line"]
|
|
|
|
| 238 |
word_xmin = chars_list_reconstructed[word_start_idx]["char_xmin"]
|
| 239 |
if chars_list_reconstructed[-1]["char"] == " " and len(chars_list_reconstructed) != 1:
|
| 240 |
word_xmax = chars_list_reconstructed[-2]["char_xmax"]
|
| 241 |
+
word = "".join([
|
| 242 |
+
chars_list_reconstructed[idx]["char"]
|
| 243 |
+
for idx in range(word_start_idx, len(chars_list_reconstructed) - 1)
|
| 244 |
+
])
|
|
|
|
|
|
|
|
|
|
| 245 |
elif len(chars_list_reconstructed) == 1:
|
| 246 |
word_xmax = chars_list_reconstructed[-1]["char_xmax"]
|
| 247 |
word = " "
|
| 248 |
else:
|
| 249 |
+
word = "".join([
|
| 250 |
+
chars_list_reconstructed[idx]["char"]
|
| 251 |
+
for idx in range(word_start_idx, len(chars_list_reconstructed))
|
| 252 |
+
])
|
|
|
|
|
|
|
| 253 |
word_xmax = chars_list_reconstructed[-1]["char_xmax"]
|
| 254 |
word_ymin = chars_list_reconstructed[word_start_idx]["char_ymin"]
|
| 255 |
word_ymax = chars_list_reconstructed[word_start_idx]["char_ymax"]
|
|
|
|
| 280 |
char_dict["in_word_number"] = len(words_list) - 1
|
| 281 |
char_dict["in_word"] = word
|
| 282 |
char_dict["num_letters_from_start_of_word"] = cidx
|
|
|
|
| 283 |
word_start_idx = idx + 1
|
| 284 |
+
# Improved sentence boundary detection
|
| 285 |
+
if is_sentence_boundary(len(chars_list_reconstructed)-1) or idx == (len(chars_list) - 1):
|
| 286 |
if idx != sentence_start_idx:
|
| 287 |
chars_df_temp = pd.DataFrame(chars_list_reconstructed[sentence_start_idx:])
|
| 288 |
line_texts = []
|
|
|
|
| 311 |
):
|
| 312 |
char_dict["in_word_number"] = chars_list_reconstructed[cidx + 1]["in_word_number"]
|
| 313 |
char_dict["in_word"] = chars_list_reconstructed[cidx + 1]["in_word"]
|
|
|
|
| 314 |
last_letter_in_word = words_list[-1]["word"][-1]
|
| 315 |
last_letter_in_chars_list_reconstructed = char_dict["char"]
|
| 316 |
if last_letter_in_word != last_letter_in_chars_list_reconstructed:
|
|
|
|
| 325 |
word_ymax=words_list[-1]["word_ymax"],
|
| 326 |
assigned_line=assigned_line,
|
| 327 |
)
|
|
|
|
| 328 |
word_x_center = round(
|
| 329 |
(words_list[-1]["word_xmax"] - words_list[-1]["word_xmin"]) / 2 + words_list[-1]["word_xmin"], ndigits=2
|
| 330 |
)
|
|
|
|
| 355 |
chars_list_reconstructed[-1]["in_sentence"] = sentence_list[-1]["sentence_text"]
|
| 356 |
else:
|
| 357 |
ic(f"Warning Sentence list empty: {sentence_list}")
|
| 358 |
+
# Add 'in_sentence' and 'in_sentence_number' keys to words_list using 'in_word_number' from chars_list_reconstructed
|
| 359 |
+
# Build a mapping from in_word_number to (in_sentence, in_sentence_number)
|
| 360 |
+
word_to_sentence = {}
|
| 361 |
+
word_to_sentence_number = {}
|
| 362 |
+
for char in chars_list_reconstructed:
|
| 363 |
+
if "in_word_number" in char and "in_sentence" in char and "in_sentence_number" in char:
|
| 364 |
+
word_to_sentence[char["in_word_number"]] = char["in_sentence"]
|
| 365 |
+
word_to_sentence_number[char["in_word_number"]] = char["in_sentence_number"]
|
| 366 |
+
for word in words_list:
|
| 367 |
+
word_num = word.get("word_number")
|
| 368 |
+
if word_num in word_to_sentence:
|
| 369 |
+
word["in_sentence"] = word_to_sentence[word_num]
|
| 370 |
+
else:
|
| 371 |
+
word["in_sentence"] = None
|
| 372 |
+
if word_num in word_to_sentence_number:
|
| 373 |
+
word["in_sentence_number"] = word_to_sentence_number[word_num]
|
| 374 |
+
else:
|
| 375 |
+
word["in_sentence_number"] = None
|
| 376 |
return words_list, chars_list_reconstructed
|
| 377 |
|
| 378 |
|
|
|
|
| 1545 |
ic(f"Extracting {EXAMPLES_ASC_ZIP_FILENAME} failed")
|
| 1546 |
|
| 1547 |
EXAMPLE_ASC_FILES = [x for x in EXAMPLES_FOLDER_PATH.glob("*.asc")]
|
| 1548 |
+
|
| 1549 |
else:
|
| 1550 |
EXAMPLE_ASC_FILES = []
|
| 1551 |
return EXAMPLE_ASC_FILES
|