Spaces:
Running
on
Zero
Running
on
Zero
add user feedback system (#3)
Browse files- add user feedback system (c435b2e30ec6086adad543d3f083fe3b830b9e66)
- app.py +127 -35
- scheduler.py +136 -0
- style.css +8 -0
app.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
from collections import defaultdict
|
|
|
|
|
|
|
| 4 |
|
| 5 |
import gradio as gr
|
| 6 |
import requests
|
|
@@ -13,6 +15,7 @@ from guidance.models import Transformers
|
|
| 13 |
from transformers import AutoTokenizer, GPT2LMHeadModel, set_seed
|
| 14 |
|
| 15 |
from schema import GDCCohortSchema # isort: skip
|
|
|
|
| 16 |
|
| 17 |
EXAMPLE_INPUTS = [
|
| 18 |
"bam files for TCGA-BRCA",
|
|
@@ -23,7 +26,8 @@ EXAMPLE_INPUTS = [
|
|
| 23 |
GDC_CASES_API_ENDPOINT = "https://api.gdc.cancer.gov/cases"
|
| 24 |
MODEL_NAME = "uc-ctds/gdc-cohort-llm-gpt2-s1M"
|
| 25 |
TOKENIZER_NAME = MODEL_NAME
|
| 26 |
-
|
|
|
|
| 27 |
|
| 28 |
with open("config.yaml", "r") as f:
|
| 29 |
CONFIG = yaml.safe_load(f)
|
|
@@ -45,9 +49,23 @@ FACETS_STR = ",".join(
|
|
| 45 |
]
|
| 46 |
)
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
tok = AutoTokenizer.from_pretrained(TOKENIZER_NAME, token=
|
| 50 |
-
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, token=
|
| 51 |
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
|
| 52 |
model = model.eval()
|
| 53 |
|
|
@@ -81,11 +99,11 @@ def generate_filter(query: str) -> str:
|
|
| 81 |
return cohort_filter
|
| 82 |
|
| 83 |
|
| 84 |
-
def _prepare_value_count(value, count):
|
| 85 |
return f"{value} [{count}]"
|
| 86 |
|
| 87 |
|
| 88 |
-
def _get_base_value(value_count):
|
| 89 |
value = value_count
|
| 90 |
if " [" in value:
|
| 91 |
value = value[: value.rfind(" [")]
|
|
@@ -183,7 +201,7 @@ def _convert_cohort_filter_to_active_selections(cohort_filter: str) -> list[str]
|
|
| 183 |
active_choices.append(f"{card_name.upper()}: {value}")
|
| 184 |
elif isinstance(default_values, dict):
|
| 185 |
# range-slider, maybe other options in the future?
|
| 186 |
-
assert default_values["type"] == "range", f"Expected range slider for card {card_name}"
|
| 187 |
assert isinstance(values, int), "values should be integer for range op"
|
| 188 |
if ">=" in field:
|
| 189 |
if values != default_values["min"]:
|
|
@@ -281,9 +299,13 @@ def _convert_cohort_filter_to_cards(cohort_filter: str, api_data: dict) -> list[
|
|
| 281 |
return card_updates
|
| 282 |
|
| 283 |
|
| 284 |
-
def update_elements_from_filtered_api_call(cohort_filter: str):
|
| 285 |
# return updates for:
|
| 286 |
-
# counter (text)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
|
| 288 |
# --- Execute API Call ---
|
| 289 |
patched_cohort_filter = _patch_range_filters_for_facet_endpoint(cohort_filter)
|
|
@@ -309,10 +331,12 @@ def update_elements_from_filtered_api_call(cohort_filter: str):
|
|
| 309 |
return [
|
| 310 |
gr.update(value=f"{case_count} Cases"), # case counter
|
| 311 |
gr.update(choices=active_choices, value=active_choices), # actives
|
|
|
|
|
|
|
| 312 |
] + card_updates
|
| 313 |
|
| 314 |
|
| 315 |
-
def update_json_from_cards(*selected_filters_per_card):
|
| 316 |
ops = []
|
| 317 |
for card_name, selected_filters in zip(CARD_NAMES, selected_filters_per_card):
|
| 318 |
# use the default values to determine card type (checkbox, range, etc)
|
|
@@ -368,7 +392,7 @@ def update_json_from_cards(*selected_filters_per_card):
|
|
| 368 |
return gr.update(value=filter_json)
|
| 369 |
|
| 370 |
|
| 371 |
-
def update_json_from_active(active_selections: list[str]):
|
| 372 |
grouped_selections = defaultdict(list)
|
| 373 |
for k_v in active_selections:
|
| 374 |
idx = k_v.find(": ")
|
|
@@ -431,11 +455,11 @@ def update_json_from_active(active_selections: list[str]):
|
|
| 431 |
return update_json_from_cards(*selected_filters_per_card)
|
| 432 |
|
| 433 |
|
| 434 |
-
def get_default_filter():
|
| 435 |
return json.dumps({"op": "and", "content": []}, indent=4)
|
| 436 |
|
| 437 |
|
| 438 |
-
def set_active_tab(selected_tab):
|
| 439 |
visibles = [gr.update(visible=(tab == selected_tab)) for tab in TAB_NAMES]
|
| 440 |
elem_classes = [
|
| 441 |
gr.update(variant="primary" if tab == selected_tab else "secondary")
|
|
@@ -444,6 +468,37 @@ def set_active_tab(selected_tab):
|
|
| 444 |
return visibles + elem_classes
|
| 445 |
|
| 446 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
DOWNLOAD_CASES_JS = f"""
|
| 448 |
function download_cases(filter_str) {{
|
| 449 |
const params = new URLSearchParams();
|
|
@@ -486,7 +541,7 @@ function download_cases(filter_str) {{
|
|
| 486 |
"""
|
| 487 |
|
| 488 |
with gr.Blocks(css_paths="style.css") as demo:
|
| 489 |
-
gr.Markdown("# GDC Cohort Copilot
|
| 490 |
|
| 491 |
with gr.Row(equal_height=True):
|
| 492 |
with gr.Column(scale=7):
|
|
@@ -517,21 +572,46 @@ with gr.Blocks(css_paths="style.css") as demo:
|
|
| 517 |
)
|
| 518 |
|
| 519 |
with gr.Row(equal_height=True):
|
| 520 |
-
with gr.Column(scale=
|
| 521 |
gr.Examples(
|
| 522 |
examples=EXAMPLE_INPUTS,
|
| 523 |
inputs=text_input,
|
| 524 |
)
|
| 525 |
-
with gr.Column(scale=
|
| 526 |
json_output = gr.Code(
|
| 527 |
label="Cohort Filter JSON",
|
| 528 |
-
# value=json.dumps({"op": "and", "content": []}, indent=4),
|
| 529 |
language="json",
|
| 530 |
interactive=False,
|
| 531 |
show_label=True,
|
| 532 |
container=True,
|
| 533 |
elem_id="json-output",
|
| 534 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
|
| 536 |
with gr.Row(equal_height=True):
|
| 537 |
with gr.Column(scale=1, min_width=250):
|
|
@@ -544,14 +624,6 @@ with gr.Blocks(css_paths="style.css") as demo:
|
|
| 544 |
elem_id="active-selections",
|
| 545 |
)
|
| 546 |
|
| 547 |
-
with gr.Row():
|
| 548 |
-
gr.Markdown(
|
| 549 |
-
"The generated cohort filter will autopopulate into the filter cards below. "
|
| 550 |
-
"**GDC Cohort Copilot can make mistakes!** "
|
| 551 |
-
"Refine your search using the interactive checkboxes. "
|
| 552 |
-
"Note that many other options can be found by selecting the different tabs on the left."
|
| 553 |
-
)
|
| 554 |
-
|
| 555 |
with gr.Row():
|
| 556 |
# Tab selectors
|
| 557 |
tab_buttons = []
|
|
@@ -613,7 +685,8 @@ with gr.Blocks(css_paths="style.css") as demo:
|
|
| 613 |
fn=set_active_tab,
|
| 614 |
inputs=gr.State(name),
|
| 615 |
outputs=tab_containers + tab_buttons,
|
| 616 |
-
api_name=False,
|
|
|
|
| 617 |
)
|
| 618 |
|
| 619 |
# Callback for case download button
|
|
@@ -621,11 +694,29 @@ with gr.Blocks(css_paths="style.css") as demo:
|
|
| 621 |
fn=None, # apparently this isn't the same as not specifying it, even though the default is None?
|
| 622 |
js=DOWNLOAD_CASES_JS, # need custom JSON to execute browser side download
|
| 623 |
inputs=json_output,
|
| 624 |
-
api_name=False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 625 |
)
|
| 626 |
|
| 627 |
# Model generation should change the JSON filter
|
| 628 |
# All other element updates cascade
|
|
|
|
| 629 |
text_input.submit(
|
| 630 |
fn=generate_filter,
|
| 631 |
inputs=text_input,
|
|
@@ -640,16 +731,16 @@ with gr.Blocks(css_paths="style.css") as demo:
|
|
| 640 |
fn=update_json_from_cards,
|
| 641 |
inputs=filter_cards,
|
| 642 |
outputs=json_output,
|
| 643 |
-
# api_name=False,
|
| 644 |
-
show_api=False,
|
| 645 |
)
|
| 646 |
else:
|
| 647 |
filter_card.input(
|
| 648 |
fn=update_json_from_cards,
|
| 649 |
inputs=filter_cards,
|
| 650 |
outputs=json_output,
|
| 651 |
-
# api_name=False,
|
| 652 |
-
show_api=False,
|
| 653 |
)
|
| 654 |
|
| 655 |
# Changing the active selections should change the JSON filter
|
|
@@ -658,17 +749,17 @@ with gr.Blocks(css_paths="style.css") as demo:
|
|
| 658 |
fn=update_json_from_active,
|
| 659 |
inputs=active_selections,
|
| 660 |
outputs=json_output,
|
| 661 |
-
# api_name=False,
|
| 662 |
-
show_api=False,
|
| 663 |
)
|
| 664 |
|
| 665 |
# JSON filter change executes API call and updates all elements
|
| 666 |
json_output.change(
|
| 667 |
fn=update_elements_from_filtered_api_call,
|
| 668 |
inputs=json_output,
|
| 669 |
-
outputs=[case_counter, active_selections] + filter_cards,
|
| 670 |
-
# api_name=False,
|
| 671 |
-
show_api=False,
|
| 672 |
)
|
| 673 |
|
| 674 |
# Trigger initial update
|
|
@@ -678,6 +769,7 @@ with gr.Blocks(css_paths="style.css") as demo:
|
|
| 678 |
outputs=json_output,
|
| 679 |
# api_name=False, # this breaks the API functionality, not sure why
|
| 680 |
show_api=False, # so just hide the API endpoints instead, not ideal
|
|
|
|
| 681 |
)
|
| 682 |
|
| 683 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
from collections import defaultdict
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
from pathlib import Path
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
import requests
|
|
|
|
| 15 |
from transformers import AutoTokenizer, GPT2LMHeadModel, set_seed
|
| 16 |
|
| 17 |
from schema import GDCCohortSchema # isort: skip
|
| 18 |
+
from scheduler import ParquetScheduler # isort: skip
|
| 19 |
|
| 20 |
EXAMPLE_INPUTS = [
|
| 21 |
"bam files for TCGA-BRCA",
|
|
|
|
| 26 |
GDC_CASES_API_ENDPOINT = "https://api.gdc.cancer.gov/cases"
|
| 27 |
MODEL_NAME = "uc-ctds/gdc-cohort-llm-gpt2-s1M"
|
| 28 |
TOKENIZER_NAME = MODEL_NAME
|
| 29 |
+
MODEL_READ_TOKEN = os.environ.get("MODEL_READ_TOKEN", None)
|
| 30 |
+
DATASET_WRITE_TOKEN = os.environ.get("DATASET_WRITE_TOKEN", None)
|
| 31 |
|
| 32 |
with open("config.yaml", "r") as f:
|
| 33 |
CONFIG = yaml.safe_load(f)
|
|
|
|
| 49 |
]
|
| 50 |
)
|
| 51 |
|
| 52 |
+
PREF_DS = os.environ.get("PREF_DS", False)
|
| 53 |
+
if PREF_DS:
|
| 54 |
+
assert DATASET_WRITE_TOKEN is not None
|
| 55 |
+
scheduler = ParquetScheduler(
|
| 56 |
+
repo_id=PREF_DS,
|
| 57 |
+
token=DATASET_WRITE_TOKEN,
|
| 58 |
+
schema={
|
| 59 |
+
"prompt": {"_type": "Value", "dtype": "string"},
|
| 60 |
+
"cohort_filter": {"_type": "Value", "dtype": "string"},
|
| 61 |
+
"preference": {"_type": "Value", "dtype": "bool"},
|
| 62 |
+
"timestamp": {"_type": "Value", "dtype": "string"},
|
| 63 |
+
},
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
|
| 67 |
+
tok = AutoTokenizer.from_pretrained(TOKENIZER_NAME, token=MODEL_READ_TOKEN)
|
| 68 |
+
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, token=MODEL_READ_TOKEN)
|
| 69 |
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
|
| 70 |
model = model.eval()
|
| 71 |
|
|
|
|
| 99 |
return cohort_filter
|
| 100 |
|
| 101 |
|
| 102 |
+
def _prepare_value_count(value: str, count: int) -> str:
|
| 103 |
return f"{value} [{count}]"
|
| 104 |
|
| 105 |
|
| 106 |
+
def _get_base_value(value_count: str) -> str:
|
| 107 |
value = value_count
|
| 108 |
if " [" in value:
|
| 109 |
value = value[: value.rfind(" [")]
|
|
|
|
| 201 |
active_choices.append(f"{card_name.upper()}: {value}")
|
| 202 |
elif isinstance(default_values, dict):
|
| 203 |
# range-slider, maybe other options in the future?
|
| 204 |
+
assert default_values["type"] == "range", f"Expected range slider for card {card_name}" # fmt: skip
|
| 205 |
assert isinstance(values, int), "values should be integer for range op"
|
| 206 |
if ">=" in field:
|
| 207 |
if values != default_values["min"]:
|
|
|
|
| 299 |
return card_updates
|
| 300 |
|
| 301 |
|
| 302 |
+
def update_elements_from_filtered_api_call(cohort_filter: str) -> list[dict]:
|
| 303 |
# return updates for:
|
| 304 |
+
# - counter (text)
|
| 305 |
+
# - active selections (checkbox group)
|
| 306 |
+
# - upvote (enable button, reset text)
|
| 307 |
+
# - downvote (enable button, reset text)
|
| 308 |
+
# - cards (list of checkbox group)
|
| 309 |
|
| 310 |
# --- Execute API Call ---
|
| 311 |
patched_cohort_filter = _patch_range_filters_for_facet_endpoint(cohort_filter)
|
|
|
|
| 331 |
return [
|
| 332 |
gr.update(value=f"{case_count} Cases"), # case counter
|
| 333 |
gr.update(choices=active_choices, value=active_choices), # actives
|
| 334 |
+
gr.update(interactive=True, value="⬆"),
|
| 335 |
+
gr.update(interactive=True, value="⬇"),
|
| 336 |
] + card_updates
|
| 337 |
|
| 338 |
|
| 339 |
+
def update_json_from_cards(*selected_filters_per_card: tuple[str]) -> str:
|
| 340 |
ops = []
|
| 341 |
for card_name, selected_filters in zip(CARD_NAMES, selected_filters_per_card):
|
| 342 |
# use the default values to determine card type (checkbox, range, etc)
|
|
|
|
| 392 |
return gr.update(value=filter_json)
|
| 393 |
|
| 394 |
|
| 395 |
+
def update_json_from_active(active_selections: list[str]) -> str:
|
| 396 |
grouped_selections = defaultdict(list)
|
| 397 |
for k_v in active_selections:
|
| 398 |
idx = k_v.find(": ")
|
|
|
|
| 455 |
return update_json_from_cards(*selected_filters_per_card)
|
| 456 |
|
| 457 |
|
| 458 |
+
def get_default_filter() -> str:
    """Return the empty cohort filter (an AND of no clauses) as pretty-printed JSON."""
    empty_filter = {"op": "and", "content": []}
    return json.dumps(empty_filter, indent=4)
|
| 460 |
|
| 461 |
|
| 462 |
+
def set_active_tab(selected_tab: str) -> list[dict]:
|
| 463 |
visibles = [gr.update(visible=(tab == selected_tab)) for tab in TAB_NAMES]
|
| 464 |
elem_classes = [
|
| 465 |
gr.update(variant="primary" if tab == selected_tab else "secondary")
|
|
|
|
| 468 |
return visibles + elem_classes
|
| 469 |
|
| 470 |
|
| 471 |
+
def save_user_preference(cohort_query: str, cohort_filter: str, preference: bool) -> list[dict]:  # fmt: skip
    """Persist a single up/down vote and return updates disabling both vote buttons.

    The record (prompt, compacted filter JSON, vote, UTC timestamp) is appended to
    the parquet scheduler when PREF_DS is configured; otherwise it is only printed.
    Returns gradio updates for the upvote and downvote buttons (in that order).
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    record = {
        "prompt": cohort_query,
        # round-trip through json to strip pretty-printing whitespace
        "cohort_filter": json.dumps(json.loads(cohort_filter)),
        "preference": preference,
        "timestamp": timestamp,
    }
    if PREF_DS:
        scheduler.append(record)
        print(f"Logged user preference data at {timestamp}")
    else:
        print(
            "No preference dataset configured, "
            "set PREF_DS env var to point to a HuggingFace Dataset Repo. "
            f"Would have logged {record}"
        )

    # Disable both buttons and mark which one was clicked; "--" is used for the
    # unclicked one because plain whitespace seems to be escaped by gradio.
    up_label, down_label = ("✓", "--") if preference else ("--", "✗")
    return [
        gr.update(interactive=False, value=up_label),
        gr.update(interactive=False, value=down_label),
    ]
|
| 500 |
+
|
| 501 |
+
|
| 502 |
DOWNLOAD_CASES_JS = f"""
|
| 503 |
function download_cases(filter_str) {{
|
| 504 |
const params = new URLSearchParams();
|
|
|
|
| 541 |
"""
|
| 542 |
|
| 543 |
with gr.Blocks(css_paths="style.css") as demo:
|
| 544 |
+
gr.Markdown("# GDC Cohort Copilot")
|
| 545 |
|
| 546 |
with gr.Row(equal_height=True):
|
| 547 |
with gr.Column(scale=7):
|
|
|
|
| 572 |
)
|
| 573 |
|
| 574 |
with gr.Row(equal_height=True):
|
| 575 |
+
with gr.Column(scale=2, min_width=250):
|
| 576 |
gr.Examples(
|
| 577 |
examples=EXAMPLE_INPUTS,
|
| 578 |
inputs=text_input,
|
| 579 |
)
|
| 580 |
+
with gr.Column(scale=7):
|
| 581 |
json_output = gr.Code(
|
| 582 |
label="Cohort Filter JSON",
|
|
|
|
| 583 |
language="json",
|
| 584 |
interactive=False,
|
| 585 |
show_label=True,
|
| 586 |
container=True,
|
| 587 |
elem_id="json-output",
|
| 588 |
)
|
| 589 |
+
with gr.Column(scale=1, min_width=50):
|
| 590 |
+
gr.Markdown(
|
| 591 |
+
"Is this correct?",
|
| 592 |
+
elem_id="vote-label",
|
| 593 |
+
)
|
| 594 |
+
upvote = gr.Button(
|
| 595 |
+
value="⬆",
|
| 596 |
+
min_width=50,
|
| 597 |
+
elem_id="upvote-btn",
|
| 598 |
+
)
|
| 599 |
+
downvote = gr.Button(
|
| 600 |
+
value="⬇",
|
| 601 |
+
min_width=50,
|
| 602 |
+
elem_id="download-btn",
|
| 603 |
+
)
|
| 604 |
+
|
| 605 |
+
with gr.Row():
|
| 606 |
+
gr.Markdown(
|
| 607 |
+
"The generated cohort filter will autopopulate into the filter cards below. "
|
| 608 |
+
"**<u>GDC Cohort Copilot can make mistakes!</u>** "
|
| 609 |
+
"Refine your search using the interactive checkboxes. "
|
| 610 |
+
"Note that many other options can be found by selecting the different tabs. "
|
| 611 |
+
"**<u>If you'd like to help us improve our model</u>**, you can use the up or down vote button to send us feedback. "
|
| 612 |
+
"We'll only save the current free text description, the cohort filter JSON, and your vote. "
|
| 613 |
+
"You can also show us what the right filter should have been by manually refining it using the checkboxes, before up voting."
|
| 614 |
+
)
|
| 615 |
|
| 616 |
with gr.Row(equal_height=True):
|
| 617 |
with gr.Column(scale=1, min_width=250):
|
|
|
|
| 624 |
elem_id="active-selections",
|
| 625 |
)
|
| 626 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 627 |
with gr.Row():
|
| 628 |
# Tab selectors
|
| 629 |
tab_buttons = []
|
|
|
|
| 685 |
fn=set_active_tab,
|
| 686 |
inputs=gr.State(name),
|
| 687 |
outputs=tab_containers + tab_buttons,
|
| 688 |
+
# api_name=False,
|
| 689 |
+
show_api=False,
|
| 690 |
)
|
| 691 |
|
| 692 |
# Callback for case download button
|
|
|
|
| 694 |
fn=None, # apparently this isn't the same as not specifying it, even though the default is None?
|
| 695 |
js=DOWNLOAD_CASES_JS, # need custom JSON to execute browser side download
|
| 696 |
inputs=json_output,
|
| 697 |
+
# api_name=False,
|
| 698 |
+
show_api=False,
|
| 699 |
+
)
|
| 700 |
+
|
| 701 |
+
# Enable user preference logging
|
| 702 |
+
upvote.click(
|
| 703 |
+
fn=save_user_preference,
|
| 704 |
+
inputs=[text_input, json_output, gr.State(True)],
|
| 705 |
+
outputs=[upvote, downvote],
|
| 706 |
+
# api_name=False,
|
| 707 |
+
show_api=False,
|
| 708 |
+
)
|
| 709 |
+
downvote.click(
|
| 710 |
+
fn=save_user_preference,
|
| 711 |
+
inputs=[text_input, json_output, gr.State(False)],
|
| 712 |
+
outputs=[upvote, downvote],
|
| 713 |
+
# api_name=False,
|
| 714 |
+
show_api=False,
|
| 715 |
)
|
| 716 |
|
| 717 |
# Model generation should change the JSON filter
|
| 718 |
# All other element updates cascade
|
| 719 |
+
# This is the only API that should be exposed
|
| 720 |
text_input.submit(
|
| 721 |
fn=generate_filter,
|
| 722 |
inputs=text_input,
|
|
|
|
| 731 |
fn=update_json_from_cards,
|
| 732 |
inputs=filter_cards,
|
| 733 |
outputs=json_output,
|
| 734 |
+
# api_name=False,
|
| 735 |
+
show_api=False,
|
| 736 |
)
|
| 737 |
else:
|
| 738 |
filter_card.input(
|
| 739 |
fn=update_json_from_cards,
|
| 740 |
inputs=filter_cards,
|
| 741 |
outputs=json_output,
|
| 742 |
+
# api_name=False,
|
| 743 |
+
show_api=False,
|
| 744 |
)
|
| 745 |
|
| 746 |
# Changing the active selections should change the JSON filter
|
|
|
|
| 749 |
fn=update_json_from_active,
|
| 750 |
inputs=active_selections,
|
| 751 |
outputs=json_output,
|
| 752 |
+
# api_name=False,
|
| 753 |
+
show_api=False,
|
| 754 |
)
|
| 755 |
|
| 756 |
# JSON filter change executes API call and updates all elements
|
| 757 |
json_output.change(
|
| 758 |
fn=update_elements_from_filtered_api_call,
|
| 759 |
inputs=json_output,
|
| 760 |
+
outputs=[case_counter, active_selections, upvote, downvote] + filter_cards,
|
| 761 |
+
# api_name=False,
|
| 762 |
+
show_api=False,
|
| 763 |
)
|
| 764 |
|
| 765 |
# Trigger initial update
|
|
|
|
| 769 |
outputs=json_output,
|
| 770 |
# api_name=False, # this breaks the API functionality, not sure why
|
| 771 |
show_api=False, # so just hide the API endpoints instead, not ideal
|
| 772 |
+
# the weirdness with the API toggle seems true for all disabled API endpoints
|
| 773 |
)
|
| 774 |
|
| 775 |
if __name__ == "__main__":
|
scheduler.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Taken from https://huggingface.co/spaces/Wauplin/space_to_dataset_saver
|
| 2 |
+
# which was from https://huggingface.co/spaces/hysts-samples/save-user-preferences
|
| 3 |
+
# Credits to @@hysts and @@Wauplin
|
| 4 |
+
import json
|
| 5 |
+
import tempfile
|
| 6 |
+
import uuid
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any, Dict, List, Optional, Union
|
| 9 |
+
|
| 10 |
+
import pyarrow as pa
|
| 11 |
+
import pyarrow.parquet as pq
|
| 12 |
+
from huggingface_hub import CommitScheduler
|
| 13 |
+
from huggingface_hub.hf_api import HfApi
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ParquetScheduler(CommitScheduler):
    """Commit scheduler that buffers rows and uploads them to the Hub as parquet files.

    Taken from https://huggingface.co/spaces/Wauplin/space_to_dataset_saver,
    which was from https://huggingface.co/spaces/hysts-samples/save-user-preferences.
    Credits to @hysts and @Wauplin.

    Usage: configure the scheduler with a repo id.
    Once started, you can add data to be uploaded to the Hub.
    Each `.append` call will result in a new row in your final dataset.
    The scheduler requires you manually set the schema (read [the docs](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value) for the list of possible values):

    ```py
    # Start scheduler
    >>> scheduler = ParquetScheduler(
    ...     repo_id="my-org/my-dataset",
    ...     schema={
    ...         "prompt": {"_type": "Value", "dtype": "string"},
    ...         "cohort_filter": {"_type": "Value", "dtype": "string"},
    ...         "preference": {"_type": "Value", "dtype": "bool"},
    ...         "timestamp": {"_type": "Value", "dtype": "string"},
    ...     },
    ... )

    # Append some data to be uploaded
    >>> scheduler.append({...})
    >>> scheduler.append({...})
    >>> scheduler.append({...})
    ```
    """

    def __init__(
        self,
        *,
        repo_id: str,
        schema: Optional[Dict[str, Dict[str, str]]] = None,
        every: Union[int, float] = 5,
        path_in_repo: Optional[str] = "data",
        repo_type: Optional[str] = "dataset",
        revision: Optional[str] = None,
        private: bool = False,
        token: Optional[str] = None,
        allow_patterns: Union[List[str], str, None] = None,
        ignore_patterns: Union[List[str], str, None] = None,
        hf_api: Optional[HfApi] = None,
    ) -> None:
        super().__init__(
            repo_id=repo_id,
            folder_path="dummy",  # not used by the scheduler
            every=every,
            path_in_repo=path_in_repo,
            repo_type=repo_type,
            revision=revision,
            private=private,
            token=token,
            allow_patterns=allow_patterns,
            ignore_patterns=ignore_patterns,
            hf_api=hf_api,
        )

        # Rows buffered between two scheduled commits; guarded by `self.lock`.
        self._rows: List[Dict[str, Any]] = []
        self._schema = schema

    def append(self, row: Dict[str, Any]) -> None:
        """Add a new item to be uploaded."""
        with self.lock:
            self._rows.append(row)

    def push_to_hub(self):
        """Flush all buffered rows to the Hub as a single new parquet file.

        Called periodically by the CommitScheduler background thread; may also
        be invoked manually to flush pending rows immediately. No-op when no
        rows are pending.
        """
        # Atomically take ownership of the pending rows so `append` can keep
        # running while we upload.
        with self.lock:
            rows = self._rows
            self._rows = []
        if not rows:
            return
        print(f"Got {len(rows)} item(s) to commit.")

        # Load images + create 'features' config for datasets library
        schema: Dict[str, Dict] = self._schema or {}
        path_to_cleanup: List[Path] = []
        for row in rows:
            for key, value in row.items():
                # Load binary files if necessary. Use .get() so a row key that
                # has no schema entry (or a missing schema) does not raise a
                # KeyError here.
                if schema.get(key, {}).get("_type") in ("Image", "Audio"):
                    # It's an image or audio: we load the bytes and remember to cleanup the file
                    file_path = Path(value)
                    if file_path.is_file():
                        row[key] = {
                            "path": file_path.name,
                            "bytes": file_path.read_bytes(),
                        }
                        path_to_cleanup.append(file_path)

        # Complete rows if needed
        for row in rows:
            for feature in schema:
                if feature not in row:
                    row[feature] = None

        # Export items to Arrow format
        table = pa.Table.from_pylist(rows)

        # Add metadata (used by datasets library)
        table = table.replace_schema_metadata(
            {"huggingface": json.dumps({"info": {"features": schema}})}
        )

        # Write to parquet file
        archive_file = tempfile.NamedTemporaryFile()
        pq.write_table(table, archive_file.name)

        # Upload under the configured folder (`path_in_repo`, default "data")
        # rather than the repo root, so files land where the scheduler was told
        # to put them.
        filename = f"{uuid.uuid4()}.parquet"
        self.api.upload_file(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            revision=self.revision,
            path_in_repo=f"{self.path_in_repo}/{filename}" if self.path_in_repo else filename,
            path_or_fileobj=archive_file.name,
        )
        print("Commit completed.")

        # Cleanup
        archive_file.close()
        for path in path_to_cleanup:
            path.unlink(missing_ok=True)
|
style.css
CHANGED
|
@@ -19,6 +19,14 @@
|
|
| 19 |
font-size: calc(var(--block-title-text-size) + 2px);
|
| 20 |
}
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
#json-output {
|
| 23 |
height: 96px !important;
|
| 24 |
}
|
|
|
|
| 19 |
font-size: calc(var(--block-title-text-size) + 2px);
|
| 20 |
}
|
| 21 |
|
| 22 |
+
#vote-label {
|
| 23 |
+
text-align: center;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
#upvote-btn {
|
| 27 |
+
color: var(--button-primary-background-fill);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
#json-output {
|
| 31 |
height: 96px !important;
|
| 32 |
}
|