"""Streamlit entry point for the Auto-Visualizer app.

Flow of one script run:
  1. Upload a dataset (csv / xlsx / json / xml) from the sidebar and read it
     with pandas.
  2. Optionally run the cleanliness checker and preview the dataframe.
  3. Pick a target column, detect the task type, parse date-time columns and
     sort the remaining columns into categorical / discrete / continuous.
  4. On "Generate All Plots", build every plot once, cache the figures in
     ``st.session_state`` and redisplay them on later reruns.

NOTE(review): the source this was rebuilt from had lost its indentation, so
the nesting and the sidebar/main-area placement below are reconstructed from
the statement order - confirm against the running app.
"""
import itertools

import streamlit as st
import numpy as np
import pandas as pd
from AutoVisualizer.processing import (
    check_dataset_cleanliness,
    task_type,
    is_probably_categorical,
    is_discrete,
    is_continuous,
    parse_datetime_columns,
)
from AutoVisualizer.categorical_viz import (
    combine_figures_as_subplots,
    generate_count_plots,
    generate_bar_plots,
    generate_grouped_bar_plots,
    generate_pie_plots,
    generate_categorical_correlation_heatmap,
)
from AutoVisualizer.numerical_viz import (
    generate_box_plots,
    generate_numeric_correlation_heatmap,
    generate_scatter_plots,
    generate_histograms,
    generate_line_plots,
)

st.set_page_config(page_title="Auto-Visualizer", page_icon="📊", layout="wide")

# Keys of the per-session plot cache stored in st.session_state.all_plots.
PLOT_KEYS = (
    'count_plots',
    'bar_plots',
    'grp_bar_plots',
    'pie_plots',
    'box_plots',
    'heat_maps',
    'scatter_plots',
    'histograms',
    'line_plots',
)
# Pill label -> frequency code handed to generate_line_plots.
TIME_GROUPING_OPTIONS = {"Daily": "D", "Weekly": "W", "Monthly": "ME", "Yearly": "YE"}
DEFAULT_FREQ = "ME"
# Substrings that mark an extracted date-time column as usable for line plots.
DATE_RELATED_KEYWORDS = ('_year', '_month', '_day', '_weekday')
# Cardinality limits above which a categorical column is skipped.
MAX_CATEGORIES = 20
MAX_BOX_CATEGORIES = 10

CLEAN_DATA_NOTICE = (
    " ⚠️ **Heads up!** For the best experience, please upload a clean dataset. "
    "This app is designed for *visualizing data*, not cleaning it. "
    "📌 *Tip:* Use the quick checker below to spot potential issues. \n"
)


def _empty_plot_store():
    """Return a fresh plot cache with an empty list under every plot key."""
    return {key: [] for key in PLOT_KEYS}


def _read_dataset(uploaded_file):
    """Read the uploaded file into a DataFrame, dispatching on its extension.

    The extension is compared case-insensitively; anything that is not
    csv / xlsx / json is read as XML (the only other type the uploader
    accepts). Errors raised by the pandas readers propagate to the caller.
    """
    file_name = uploaded_file.name.lower()
    if file_name.endswith(".csv"):
        return pd.read_csv(uploaded_file)
    if file_name.endswith(".xlsx"):
        return pd.read_excel(uploaded_file)
    if file_name.endswith(".json"):
        return pd.read_json(uploaded_file)
    return pd.read_xml(uploaded_file)


def _show_plots(title, figures):
    """Render *figures* as one combined chart under *title*; skip if empty."""
    if not figures:
        return
    st.subheader(title)
    st.plotly_chart(combine_figures_as_subplots(figures), use_container_width=True)


# Initialize session state for storing plots
if 'plots_generated' not in st.session_state:
    st.session_state.plots_generated = False
    st.session_state.all_plots = _empty_plot_store()

with st.sidebar:
    # Upload the dataset file
    uploaded_file = st.file_uploader("Upload your dataset file:", ["csv", "xlsx", "json", "xml"])

# Nothing below makes sense without a dataset.
if uploaded_file is None:
    st.stop()

try:
    # Read the dataset through pandas
    df = _read_dataset(uploaded_file)
except Exception as e:  # UI boundary: report any reader failure to the user
    st.write("Error:", e)
    # Without a dataframe the rest of the script would fail on an unbound
    # `df`, so end this run here.
    st.stop()

with st.sidebar:
    st.info(CLEAN_DATA_NOTICE)
    if st.button("Run Cleanliness Check"):
        # Remember the click so the result stays visible on later reruns.
        st.session_state.run_clean_check = True
    st.divider()

if st.session_state.get("run_clean_check", False):
    with st.expander("➡️ See Cleanliness Checker Result"):
        check_dataset_cleanliness(df)

st.markdown("Your Dataset:")
st.dataframe(df, height=210)
st.divider()

feature_list = list(df.columns)
target_selector = ["No Target"] + feature_list

with st.sidebar:
    target_col = st.selectbox("Specify the target column in your dataset:", target_selector)
    task = task_type(df, target_col)
    st.write(f"🔍 Task identified: **{task}**")

df, date_time_ls, extracted_datetime = parse_datetime_columns(df)
# Date-time columns are handled separately from the ordinary features.
feature_list = [x for x in feature_list if x not in date_time_ls]

categorical_ls = []
discrete_ls = []
continuous_ls = []
for feature in feature_list:
    if is_probably_categorical(df[feature]):
        categorical_ls.append(feature)
    elif is_discrete(df[feature]):
        discrete_ls.append(feature)
    elif is_continuous(df[feature]):
        continuous_ls.append(feature)

# Extracted date-time parts only ever join the categorical group.
for feature in extracted_datetime:
    if is_probably_categorical(df[feature]):
        categorical_ls.append(feature)


@st.dialog("Identified/Extracted Features from your Dataset:-")
def open_dialog():
    """Show one popover per non-empty feature group found in the dataset."""
    feature_groups = [
        ("Categorical Features", categorical_ls),
        ("Discrete Features", discrete_ls),
        ("Continuous Features", continuous_ls),
        ("Date-Time Features", date_time_ls),
    ]
    for label, items in feature_groups:
        if items:
            with st.popover(label, use_container_width=True):
                st.code("\n".join([f"• {item}" for item in items]))
    # The extracted-features popover is tied to the presence of date-time
    # columns, not to whether anything was actually extracted.
    if date_time_ls:
        with st.popover("Extracted features from your Date-Time like features", use_container_width=True):
            st.code("\n".join([f"• {item}" for item in extracted_datetime]))


with st.sidebar:
    if st.button("See Your Feature Details"):
        open_dialog()

# Inputs shared by the generation pass and the interactive regeneration below.
numeric_ls = discrete_ls + continuous_ls
default_hue = categorical_ls[0] if categorical_ls else None
# Every unordered pair of continuous features, in column order.
feature_pairs = list(itertools.combinations(continuous_ls, 2))
date_component_cols = [
    col for col in extracted_datetime
    if any(key in col for key in DATE_RELATED_KEYWORDS)
]

# Cached figures belong to one dataset/target combination; drop them as soon
# as either changes so plots of a previous upload are never shown.
dataset_key = (uploaded_file.name, uploaded_file.size, target_col)
if st.session_state.get('plots_source') != dataset_key:
    st.session_state.plots_source = dataset_key
    st.session_state.plots_generated = False
    st.session_state.all_plots = _empty_plot_store()

# Generate all plots in background when button is clicked
if st.button("Generate All Plots") or st.session_state.plots_generated:
    if not st.session_state.plots_generated:
        with st.spinner("Generating all plots (please wait)..."):
            # Generate and store all plots
            plots = _empty_plot_store()

            if categorical_ls:
                low_cardinality = [
                    col for col in categorical_ls
                    if df[col].nunique() <= MAX_CATEGORIES
                ]
                plots['count_plots'] = [
                    p for x_col in low_cardinality
                    for p in generate_count_plots(df, x_col)
                ]
                plots['bar_plots'] = [
                    p for x_col in low_cardinality
                    for p in generate_bar_plots(df, x_col, numeric_ls)
                ]
                plots['grp_bar_plots'] = generate_grouped_bar_plots(df, categorical_ls, numeric_ls)
                plots['pie_plots'] = [
                    p for x_col in low_cardinality
                    for p in generate_pie_plots(df, x_col)
                ]

            if continuous_ls:
                plots['box_plots'] = [
                    p for x_col in categorical_ls
                    if df[x_col].nunique() <= MAX_BOX_CATEGORIES
                    for p in generate_box_plots(df, x_col, continuous_ls)
                ]

                if task == 'Regression' and categorical_ls:
                    plots['heat_maps'].extend(
                        generate_categorical_correlation_heatmap(df, target_col, categorical_ls)
                    )
                plots['heat_maps'].extend(generate_numeric_correlation_heatmap(df[continuous_ls]))

                if feature_pairs:
                    hue = st.session_state.get('selection') or default_hue
                    plots['scatter_plots'] = generate_scatter_plots(df, feature_pairs, hue)
                    # Remember what the cached figures were drawn with.
                    st.session_state.scatter_hue = hue

                plots['histograms'] = generate_histograms(df, continuous_ls)

                if date_time_ls and date_component_cols:
                    freq = TIME_GROUPING_OPTIONS.get(
                        st.session_state.get('time_choice'), DEFAULT_FREQ
                    )
                    plots['line_plots'] = generate_line_plots(
                        df, date_component_cols, continuous_ls, freq
                    )
                    st.session_state.line_freq = freq

            st.session_state.all_plots = plots
            st.session_state.plots_generated = True
        st.rerun()  # Refresh to display all plots

    # Display all plots after generation is complete
    plots = st.session_state.all_plots

    if categorical_ls:
        st.header("📊 Categorical Plots")
        _show_plots("Count Plots :-", plots['count_plots'])
        _show_plots("Bar Plots :-", plots['bar_plots'])
        _show_plots("Grouped Bar Plots :-", plots['grp_bar_plots'])
        _show_plots("Pie Charts :-", plots['pie_plots'])

    if continuous_ls:
        st.header("📊 Numerical Plots")
        _show_plots("Box Plots :-", plots['box_plots'])
        _show_plots("Heat Maps :-", plots['heat_maps'])

        if feature_pairs and plots['scatter_plots']:
            st.subheader("Scatter Plots")
            selection = st.pills("Highlight using a categorical feature :- ", categorical_ls, key='selection')
            hue = selection or default_hue
            # The figures are cached, so redraw them when the user picks a
            # different highlight feature; otherwise the pills do nothing.
            if hue != st.session_state.get('scatter_hue'):
                plots['scatter_plots'] = generate_scatter_plots(df, feature_pairs, hue)
                st.session_state.scatter_hue = hue
            if plots['scatter_plots']:
                st.plotly_chart(
                    combine_figures_as_subplots(plots['scatter_plots']),
                    use_container_width=True,
                )

        _show_plots("Histograms", plots['histograms'])

        if date_time_ls and plots['line_plots']:
            st.subheader("Line Plots :-")
            time_choice = st.pills(
                "Choose time interval for grouping :- ",
                ["Daily", "Weekly", "Monthly", "Yearly"],
                key='time_choice',
            )
            freq = TIME_GROUPING_OPTIONS.get(time_choice, DEFAULT_FREQ)
            # Same reasoning as for the scatter plots: regroup on change.
            if freq != st.session_state.get('line_freq'):
                plots['line_plots'] = generate_line_plots(
                    df, date_component_cols, continuous_ls, freq
                )
                st.session_state.line_freq = freq
            if plots['line_plots']:
                st.plotly_chart(
                    combine_figures_as_subplots(plots['line_plots']),
                    use_container_width=True,
                )