Spaces:

uc-ctds
/

GDC-Cohort-Copilot

Running on Zero

songs1 committed on Jul 10, 2025

Commit

62c31e6

1 Parent(s): ab14f8c

correct schema

Files changed (2) hide show

app.py CHANGED Viewed

@@ -10,6 +10,12 @@ from guidance.models import Transformers
 from schema import GDCCohortSchema
 from transformers import AutoTokenizer, GPT2LMHeadModel, set_seed
 GDC_CASES_API_ENDPOINT = "https://api.gdc.cancer.gov/cases"
 MODEL_NAME = "labadvcomp/gdc-cohort-llm-gpt2-1m-dev"
 # TOKENIZER_NAME = MODEL_NAME # TODO figure out why tokenizer breaks model??
@@ -270,11 +276,7 @@ with gr.Blocks(css_paths="style.css") as demo:
     with gr.Row(equal_height=True):
         with gr.Column(scale=1, min_width=250):
             gr.Examples(
-                examples=[
-                    "bam files for TCGA-BRCA",
-                    "kidney or adrenal gland cancers with alcohol history",
-                    "tumor samples from male patients acute myeloid lymphoma",
-                ],
                 inputs=text_input,
             )
         with gr.Column(scale=4):
@@ -290,6 +292,13 @@ with gr.Blocks(css_paths="style.css") as demo:
                 elem_id="json-output",
             )
     with gr.Row():
         # Tab selectors
         tab_buttons = []

 from schema import GDCCohortSchema
 from transformers import AutoTokenizer, GPT2LMHeadModel, set_seed
+EXAMPLE_INPUTS = [
+    "bam files for TCGA-BRCA",
+    "kidney or adrenal gland cancers with alcohol history",
+    "tumor samples from male patients with acute myeloid lymphoma",
+]
 GDC_CASES_API_ENDPOINT = "https://api.gdc.cancer.gov/cases"
 MODEL_NAME = "labadvcomp/gdc-cohort-llm-gpt2-1m-dev"
 # TOKENIZER_NAME = MODEL_NAME # TODO figure out why tokenizer breaks model??
     with gr.Row(equal_height=True):
         with gr.Column(scale=1, min_width=250):
             gr.Examples(
+                examples=EXAMPLE_INPUTS,
                 inputs=text_input,
             )
         with gr.Column(scale=4):
                 elem_id="json-output",
             )
+    with gr.Row():
+        gr.Markdown(
+            "The generated cohort filter will autopopulate into the filter cards below. "
+            "Refine your search using the interactive checkboxes. "
+            "Note that many other options can be found by selecting the different tabs on the left."
+        )
     with gr.Row():
         # Tab selectors
         tab_buttons = []

schema.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from enum import Enum
-from typing import Annotated, Literal
 from pydantic import BaseModel, Field, StringConstraints
@@ -21,11 +22,6 @@ class OuterOp(Enum):
     AND = "and"
-class InnerStrContent(BaseModel):
-    field: Annotated[str, StringConstraints(min_length=1, max_length=64)]
-    value: list[Annotated[str, StringConstraints(min_length=1, max_length=128)]]
 class InnerAgeDxContent(BaseModel):
     field: Literal["cases.diagnoses.age_at_diagnosis"]
     value: Annotated[int, Field(ge=0, le=32872)]
@@ -51,6 +47,34 @@ class InnerCigStrtContent(BaseModel):
     value: Annotated[int, Field(ge=1900, le=2050)]
 class Inner(BaseModel):
     op: InnerOp
     content: (

 from enum import Enum
+from typing import Annotated, Literal, get_args, get_type_hints
+import yaml
 from pydantic import BaseModel, Field, StringConstraints
     AND = "and"
 class InnerAgeDxContent(BaseModel):
     field: Literal["cases.diagnoses.age_at_diagnosis"]
     value: Annotated[int, Field(ge=0, le=32872)]
     value: Annotated[int, Field(ge=1900, le=2050)]
+excluded_values = set()
+for alt_inner in [
+    InnerYearDxContent,
+    InnerCigDayContent,
+    InnerPackYrContent,
+    InnerCigStrtContent,
+]:
+    field_type = get_type_hints(alt_inner)["field"]
+    excluded_values |= set(get_args(field_type))
+with open("config.yaml", "r") as f:
+    CONFIG = yaml.safe_load(f)
+all_other_fields = tuple(
+    [
+        card["field"]
+        for tab in CONFIG["tabs"]
+        for card in tab["cards"]
+        if card["field"] not in excluded_values
+    ]
+)
+class InnerStrContent(BaseModel):
+    field: Literal[all_other_fields]  # type: ignore - runtime literal
+    value: list[Annotated[str, StringConstraints(min_length=1, max_length=128)]]
 class Inner(BaseModel):
     op: InnerOp
     content: (