diff --git a/website/static/img/CDD.png b/website/static/img/CDD.png deleted file mode 100644 index da6a0d98..00000000 Binary files a/website/static/img/CDD.png and /dev/null differ diff --git a/website/static/img/arch.png b/website/static/img/arch.png deleted file mode 100644 index 67924b12..00000000 Binary files a/website/static/img/arch.png and /dev/null differ diff --git a/website/static/img/basicPortal.png b/website/static/img/basicPortal.png deleted file mode 100644 index e22d0fbc..00000000 Binary files a/website/static/img/basicPortal.png and /dev/null differ diff --git a/website/static/img/build.png b/website/static/img/build.png deleted file mode 100644 index 2c2d5b73..00000000 Binary files a/website/static/img/build.png and /dev/null differ diff --git a/website/static/img/conversational-data-discovery-mockup.png b/website/static/img/conversational-data-discovery-mockup.png deleted file mode 100644 index da6a0d98..00000000 Binary files a/website/static/img/conversational-data-discovery-mockup.png and /dev/null differ diff --git a/website/static/img/demo-portal-cross-table.png b/website/static/img/demo-portal-cross-table.png deleted file mode 100644 index abea4e16..00000000 Binary files a/website/static/img/demo-portal-cross-table.png and /dev/null differ diff --git a/website/static/img/demo-portal-documentation-page.png b/website/static/img/demo-portal-documentation-page.png deleted file mode 100644 index ad161057..00000000 Binary files a/website/static/img/demo-portal-documentation-page.png and /dev/null differ diff --git a/website/static/img/demo-portal-documentation-page.webp b/website/static/img/demo-portal-documentation-page.webp deleted file mode 100644 index 7d6f1055..00000000 Binary files a/website/static/img/demo-portal-documentation-page.webp and /dev/null differ diff --git a/website/static/img/demo-portal-exploration-page.png b/website/static/img/demo-portal-exploration-page.png deleted file mode 100644 index e22d0fbc..00000000 Binary files a/website/static/img/demo-portal-exploration-page.png and /dev/null differ diff --git a/website/static/img/demo-portal-exploration-page.webp b/website/static/img/demo-portal-exploration-page.webp deleted file mode 100644 index 4f8b2926..00000000 Binary files a/website/static/img/demo-portal-exploration-page.webp and /dev/null differ diff --git a/website/static/img/demo-portal-homepage.png b/website/static/img/demo-portal-homepage.png deleted file mode 100644 index 9b8e3a8b..00000000 Binary files a/website/static/img/demo-portal-homepage.png and /dev/null differ diff --git a/website/static/img/demo-portal-homepage.webp b/website/static/img/demo-portal-homepage.webp deleted file mode 100644 index 79764419..00000000 Binary files a/website/static/img/demo-portal-homepage.webp and /dev/null differ diff --git a/website/static/img/demo-portal-search-and-aggregation.gif b/website/static/img/demo-portal-search-and-aggregation.gif deleted file mode 100644 index 6a1bdd1c..00000000 Binary files a/website/static/img/demo-portal-search-and-aggregation.gif and /dev/null differ diff --git a/website/static/img/demo-portal-search-and-aggregation.webm b/website/static/img/demo-portal-search-and-aggregation.webm deleted file mode 100644 index 8a91b222..00000000 Binary files a/website/static/img/demo-portal-search-and-aggregation.webm and /dev/null differ diff --git a/website/static/img/demo-search-and-aggregation.gif b/website/static/img/demo-search-and-aggregation.gif deleted file mode 100644 index 6a1bdd1c..00000000 Binary files 
a/website/static/img/demo-search-and-aggregation.gif and /dev/null differ diff --git a/website/static/img/documentation.png b/website/static/img/documentation.png deleted file mode 100644 index ad161057..00000000 Binary files a/website/static/img/documentation.png and /dev/null differ diff --git a/website/static/img/homepage.png b/website/static/img/homepage.png deleted file mode 100644 index 9b8e3a8b..00000000 Binary files a/website/static/img/homepage.png and /dev/null differ diff --git a/website/static/img/overture-platform-overview.png b/website/static/img/overture-platform-overview.png deleted file mode 100644 index 289dae03..00000000 Binary files a/website/static/img/overture-platform-overview.png and /dev/null differ diff --git a/website/static/img/workshop-architecture-diagram.png b/website/static/img/workshop-architecture-diagram.png deleted file mode 100644 index a1c7c9db..00000000 Binary files a/website/static/img/workshop-architecture-diagram.png and /dev/null differ diff --git a/website/static/img/workshop-architecture-diagram.webp b/website/static/img/workshop-architecture-diagram.webp deleted file mode 100644 index a4204dff..00000000 Binary files a/website/static/img/workshop-architecture-diagram.webp and /dev/null differ diff --git a/website/static/img/workshop-portal-preview.png b/website/static/img/workshop-portal-preview.png deleted file mode 100644 index 2c2d5b73..00000000 Binary files a/website/static/img/workshop-portal-preview.png and /dev/null differ diff --git a/website/static/img/workshop-portal-preview.webp b/website/static/img/workshop-portal-preview.webp deleted file mode 100644 index bbb85165..00000000 Binary files a/website/static/img/workshop-portal-preview.webp and /dev/null differ diff --git a/website/workshop/00-Intro.md b/website/workshop/00-Intro.md index 0e38a212..27bbcb4c 100644 --- a/website/workshop/00-Intro.md +++ b/website/workshop/00-Intro.md @@ -5,15 +5,15 @@ sidebar_position: 0 description: Build a searchable, FAIR-compliant data discovery portal from tabular CSV data using Elasticsearch, Arranger, and Stage. --- -:::caution Please complete the prerequisites below before arriving -Most importantly downloading the Docker images; the conference venue's Wi-Fi may be slow and unreliable. Thank you and looking forward to meeting you - **Mitchell Shiell, Ontario Institute for Cancer Research, [mshiell@oicr.on.ca](mailto:mshiell@oicr.on.ca)** +:::caution Please complete the prerequisites before arriving +Most importantly, download the Docker images in advance; the conference venue's Wi-Fi may be slow and unreliable. Looking forward to meeting you — **Mitchell Shiell, Ontario Institute for Cancer Research, [mshiell@oicr.on.ca](mailto:mshiell@oicr.on.ca)** ::: # IBC Workshop Prerequisites This workshop has been developed as part of the 19th Annual International Biocuration Conference, it will guide you through building a foundational data discovery portal for tabular CSV data using Elasticsearch, Arranger, and Stage.
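One low-effort way to handle the image download ahead of time is to clone the workshop branch and pull everything while you still have a fast connection. A minimal sketch, assuming the compose file at the root of the `prelude` checkout references every image used in the workshop:

```bash
# Clone the workshop branch and enter the repo.
git clone -b IBCworkshop https://github.com/overture-stack/prelude.git
cd prelude

# Pull all images referenced by docker-compose.yml so nothing is fetched at the venue.
docker compose pull
```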
-![Demo search and aggregation](/img/workshop-portal-preview.webp) +![Demo search and aggregation](./images/workshop-portal-preview.webp) :::info 👋 Say hello If you're attending, feel free to [**drop a quick introduction**](https://github.com/overture-stack/docs/discussions/new?category=new-deployments&title=%5BIBC+Workshop%5D+Hello+from+%5BName%2C+Institution%5D&body=%2A%2AName+%26+affiliation%3A%2A%2A+%0A%0A%2A%2AType+of+data+I+work+with%3A%2A%2A+%0A%0A%2A%2AWhat+I%27m+hoping+to+get+out+of+the+session%3A%2A%2A+%0A%0A%2A%2AData+management+challenges+%28optional%29%3A%2A%2A+) before the day, this helps tailor the session to the room. Entirely optional. @@ -103,7 +103,7 @@ git clone -b IBCworkshop https://github.com/overture-stack/prelude.git These are not required but will make the workshop easier to follow:
-6. (Optional) Elasticvue:browser-based Elasticsearch GUI +6. (Optional) Elasticvue: browser-based Elasticsearch GUI [Elasticvue](https://elasticvue.com/installation) is a browser-based Elasticsearch GUI useful for inspecting indices, browsing documents, and troubleshooting. It is not required but helpful for understanding what's happening inside Elasticsearch during the workshop. diff --git a/website/workshop/01-Running-the-Demo.md b/website/workshop/01-Running-the-Demo.md index a5eb3b33..9c21bcca 100644 --- a/website/workshop/01-Running-the-Demo.md +++ b/website/workshop/01-Running-the-Demo.md @@ -3,21 +3,23 @@ id: running-the-demo title: Running the Demo sidebar_position: 1 description: Deploy the pre-configured demo portal to see the finished result before building from scratch. -draft: true --- +import demoVideo from './images/demo-search-and-aggregation.webm'; + # Running the Demo Before building anything from scratch, let's deploy the pre-configured demo portal and see what the end result looks like. This gives you a mental model of what each component does before we dive into configuration details. -If you have not done so yet clone the following repository. +If you have not done so yet, clone the following repository. ``` -git clone https://github.com/overture-stack/prelude.git +git clone -b IBCworkshop https://github.com/overture-stack/prelude.git cd prelude ``` @@ -30,12 +32,13 @@ make demo
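Once `make demo` is underway, a quick way to confirm the stack came up. A sketch, reusing the default Elasticsearch credentials shown in the troubleshooting section later in this guide:

```bash
# List the demo's containers and their state.
docker compose ps

# Ask Elasticsearch for cluster health; a green or yellow status means it's up.
curl -u elastic:myelasticpassword http://localhost:9200/_cluster/health?pretty
```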
Running on Windows? -| Platform | Command | -|---|---| | Platform | Command | | ------------------ | ----------------------------------- | | WSL2 (recommended) | `make demo` (in an Ubuntu terminal) | -| Native PowerShell | `.\run.ps1 demo` | +| Native PowerShell | `.\run.ps1 demo` | **One-time setup for native PowerShell:** allow local scripts to run by executing this once: + ```powershell Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser ``` @@ -70,13 +73,13 @@ Once the portal loads, take a few minutes to explore: The landing page provides an overview and navigation to available data tables. Note the navigation bar, branding, and layout, all of which are configurable. -![Portal home page](/img/homepage.webp) +![Portal home page](./images/homepage.webp) #### Data Exploration Page Navigate to the data exploration page from the top navigation. This is where Arranger's components are at work: -![exploration page](/img/basicPortal.webp) +![exploration page](./images/basicPortal.webp) - **Facet Panel (left sidebar):** Filter data by clicking on field values. Each facet corresponds to a field in the Elasticsearch index. The fields shown, their order, and their display names are all controlled by Arranger configuration files. @@ -90,7 +93,7 @@ Navigate to the data exploration page from the top navigation. This is where Arr The portal includes built-in documentation pages rendered from markdown files in the `docs/` directory. The content you are reading right now may be served through this same mechanism. -![documentation page](/img/documentation.webp) +![documentation page](./images/documentation.webp) ### What's Running @@ -176,16 +179,18 @@ Before moving on, confirm: ### Stopping the Demo -We'll keep the demo running as a reference while we walk through the architecture. When your ready to remove it run: +We'll keep the demo running as a reference while we walk through the architecture. When you're ready to remove it, run: ```bash make reset ``` :::tip Windows (PowerShell) + ```powershell .\run.ps1 reset ``` + ::: **Next:** Now that you've seen the working portal, let's understand how the pieces fit together. diff --git a/website/workshop/02-Architecture.md b/website/workshop/02-Architecture.md index 0be3623d..81131343 100644 --- a/website/workshop/02-Architecture.md +++ b/website/workshop/02-Architecture.md @@ -3,14 +3,13 @@ id: architecture title: Architecture sidebar_position: 2 description: How data flows from a CSV file through PostgreSQL and Elasticsearch to the browser-based search portal. -draft: true --- # Architecture Now that you've seen the running portal, let's walk through how data flows from a CSV file to the search interface. -![Architecture Diagram](/img/workshop-architecture-diagram.webp "Architecture Diagram") +![Architecture Diagram](./images/workshop-architecture-diagram.webp "Architecture Diagram") | Component | Type | Description | | ---------------------------------------------------------------------------------------------------------- | --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
The search and exploration stack we're using can be extended with additional services: -![Platform Integration](/img/overture-platform-overview.webp) +![Platform Integration](./images/overture-platform-overview.webp) - **Lectern:** Data dictionary management (define and enforce data schemas) - **Lyric:** Tabular data submission with validation @@ -104,7 +103,7 @@ Structuring data through a search API like Arranger makes it **machine-accessibl The platform connects to Arranger via the [Model Context Protocol (MCP)](https://modelcontextprotocol.io/) and is designed around four core principles: data minimisation by default, no action without explicit researcher consent, sandboxed code execution, and fully reproducible sessions. Because research data is often sensitive, the platform runs on sovereign infrastructure rather than routing queries through commercial AI providers. -![CDD Conceptual Mock](/img/conversational-data-discovery-mockup.webp) +![CDD Conceptual Mock](./images/conversational-data-discovery-mockup.webp) :::info **The interface shown above is a conceptual mock-up**. CDD is under active development and is not covered in this workshop. ::: diff --git a/website/workshop/03-Data-Preparation.md b/website/workshop/03-Data-Preparation.md index b2a0001a..0eec6e54 100644 --- a/website/workshop/03-Data-Preparation.md +++ b/website/workshop/03-Data-Preparation.md @@ -3,7 +3,6 @@ id: data-preparation title: Data Preparation sidebar_position: 3 description: How to structure and format your CSV data to meet the requirements for loading into the portal. -draft: true --- # Data Preparation @@ -33,7 +32,7 @@ Your CSV column headers become field names in PostgreSQL, Elasticsearch, and Gra | **Format** | CSV (comma-separated); other delimiters supported via `--delimiter` but for simplicity we recommend using comma-separated files. | | **Header row** | Required as the first line | | **Prohibited characters** | `: > < . [space] , / \ ? # [ ] { } " * \| + @ & ( ) ! ^` | -| **Max length** | A maximum 63 characters per header name, PostgreSQL silently truncates longer identifiers, which can cause mismatches between your schema and index | +| **Max length** | A maximum of 63 characters per header name; PostgreSQL silently truncates longer identifiers, which can cause mismatches between your schema and index | | **Reserved words** | These are internal field names used by Elasticsearch and GraphQL. Using them will conflict with system internals and cause indexing or query errors: `_type` `_id` `_source` `_all` `_parent` `_field_names` `_routing` `_index` `_size` `_timestamp` `_ttl` `_meta` `_doc` `__typename` `__schema` `__type` | | **Best practices** | Use `snake_case` or `camelCase`, lowercase, descriptive but concise, no special characters or spaces | @@ -103,7 +102,7 @@ If you're working with data that has any access restrictions, use anonymized or #### Recommended Data Size -There are no strict size limits beyond Docker and Elasticsearch resource constraints. In fact we've scaled this resource to hundreds millions of records. However, for development and testing, a representative sample of approximately **500 records** works well. You can start small and load larger datasets once your configuration is working. +There are no strict size limits beyond Docker and Elasticsearch resource constraints. In fact, we've scaled this resource to hundreds of millions of records. However, for development and testing, a representative sample of approximately **500 records** works well.
You can start small and load larger datasets once your configuration is working. ### Checkpoint diff --git a/website/workshop/04-Generating-Configurations.md b/website/workshop/04-Generating-Configurations.md index 324fb94e..d668c214 100644 --- a/website/workshop/04-Generating-Configurations.md +++ b/website/workshop/04-Generating-Configurations.md @@ -3,7 +3,6 @@ id: generating-configurations title: Generating Configurations sidebar_position: 4 description: Auto-generate PostgreSQL schemas, Elasticsearch mappings, and Arranger configuration files from your CSV data. -draft: true --- # Generating Configurations @@ -12,13 +11,13 @@ Instead of writing PostgreSQL schemas, Elasticsearch mappings, and Arranger conf Navigate to **Config Generator** in the Stage portal navigation bar (visible once Stage is running). - +![Config Generator page in the Stage portal navigation](./images/config-generator-page.webp) ### Step 1: Provide CSV Data Upload a `.csv` file using the **Upload .csv file** button, or paste CSV content directly into the text area. Once loaded, a preview of the first five rows is shown so you can confirm the correct file was used. - +![CSV upload area and preview table](./images/csv-upload-area.webp) ### Step 2: Configure Options @@ -27,7 +26,7 @@ Upload a `.csv` file using the **Upload .csv file** button, or paste CSV content | **Index name** | The name of the Elasticsearch index. Auto-populated from the CSV filename; edit if needed (e.g. `datatable1`). | | **Table name** | The name of the PostgreSQL table. Defaults to the same value as the index name. | - + ### Step 3: Generate and Copy @@ -45,7 +44,7 @@ Click **Generate Configs**. Once complete, the output panel shows a tabbed view Use the **Copy** button on each tab to copy the content, then paste it into the corresponding file in your project. - +![Generated output panel with tabs and Copy button](./images/generated-output-panel.webp) ### Reviewing the Output @@ -264,7 +263,7 @@ Display names are auto-generated by converting `snake_case` to Title Case. Revie
:::tip -For a full reference on `extended.json`, see the [Arranger extended configuration docs](https://docs.overture.bio/docs/core-software/Arranger/usage/arranger-components). +For a full reference on `extended.json`, see the [Arranger extended configuration docs](https://docs.overture.bio/docs/core-software/arranger/usage/arranger-components). ::: #### arranger/table.json diff --git a/website/workshop/05-Docker-Configuration.md b/website/workshop/05-Docker-Configuration.md index 62fb6d7f..b6329d0d 100644 --- a/website/workshop/05-Docker-Configuration.md +++ b/website/workshop/05-Docker-Configuration.md @@ -3,7 +3,6 @@ id: docker-configuration title: Docker Configuration sidebar_position: 5 description: Walk through the docker-compose.yml service definitions and environment variables to wire configuration files into each container. -draft: true --- # Docker Configuration @@ -190,7 +189,7 @@ make platform ::: :::info -For future configuration changes (once your own data is loaded), `make restart` is sufficient, it reloads configs without wiping data. If you wish to wipe the data as-well run `make reset` +For future configuration changes (once your own data is loaded), `make restart` is sufficient; it reloads configs without wiping data. If you wish to wipe the data as well, run `make reset`. ::: #### Troubleshooting @@ -214,7 +213,7 @@ curl -u elastic:myelasticpassword http://localhost:9200/_cluster/health?pretty make reset ``` -:::tip Windows (PowerShell) — full reset +:::tip Windows (PowerShell) - full reset ```powershell .\run.ps1 reset diff --git a/website/workshop/06-Loading-Data.md b/website/workshop/06-Loading-Data.md index 6146c4d0..a54819ba 100644 --- a/website/workshop/06-Loading-Data.md +++ b/website/workshop/06-Loading-Data.md @@ -3,14 +3,13 @@ id: loading-data title: Loading Data sidebar_position: 6 description: Use the Conductor CLI to load CSV data into PostgreSQL and index it into Elasticsearch for search. -draft: true --- # Loading Data With the infrastructure configured, it's time to load data into the portal. Conductor is a CLI tool that reads CSV files, loads each row into PostgreSQL (persistent storage), then indexes them into Elasticsearch as structured documents for search. -Conductor runs as a Docker container — no Node.js installation required. A wrapper script at the root of the repository handles the Docker details for you. +Conductor runs as a Docker container; no Node.js installation is required. A wrapper script at the root of the repository handles the Docker details for you. :::info Run all `./conductor` commands from the **root of the `prelude` repository** (i.e. where `docker-compose.yml` lives). ::: @@ -31,7 +30,7 @@ Add that line to your `~/.zshrc` (Zsh) or `~/.bashrc` (Bash) and reload: source ~/.zshrc # or source ~/.bashrc ``` -You can then run `conductor upload ...` from any directory. The script resolves the `data/` folder relative to its own location in the repo, so paths like `./data/datatable1.csv` still refer to the repo's `data/` directory — run the command from the repo root or use an absolute path to a file outside it. +You can then run `conductor upload ...` from any directory. The script resolves the `data/` folder relative to its own location in the repo, so paths like `./data/datatable1.csv` still refer to the repo's `data/` directory; run the command from the repo root or use an absolute path to a file outside it.
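As a concrete sketch of that setup: the exact PATH line is the one shown in the repo's instructions, so the path below is a placeholder, and the upload arguments are hypothetical rather than the wrapper's real flag syntax (check its usage text).

```bash
# Placeholder path: point PATH at wherever you cloned prelude.
echo 'export PATH="$PATH:$HOME/prelude"' >> ~/.zshrc
source ~/.zshrc

# Run uploads from the repo root so ./data/... paths resolve as expected.
cd ~/prelude
conductor upload ./data/datatable1.csv   # hypothetical arguments; see the wrapper's usage text
```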
diff --git a/website/workshop/07-Troubleshooting.md b/website/workshop/07-Troubleshooting.md index e6c94872..ba348811 100644 --- a/website/workshop/07-Troubleshooting.md +++ b/website/workshop/07-Troubleshooting.md @@ -3,7 +3,6 @@ id: troubleshooting title: Troubleshooting sidebar_position: 7 description: A layered approach to diagnosing issues in the portal stack, from Docker and databases through to the browser. -draft: true --- # Troubleshooting @@ -115,7 +114,7 @@ If Arranger responds to the GraphQL query above but the data table or facet pane | `extended.json` | Dot notation | `data.field_name` | | `base.json` | Alias name (`esIndex`) | `datatable1_centric` | -Another common `table.json` issue is the `query` field, which must use the correct GraphQL traversal path (`hits`, `edges`, `nodes`) to reach the field value. An incorrect path here will cause columns to render empty even when data is present. See the [Arranger table configuration docs](https://docs.overture.bio/docs/core-software/Arranger/usage/arranger-components#table-configuration-tablejson) for the expected structure. +Another common `table.json` issue is the `query` field, which must use the correct GraphQL traversal path (`hits`, `edges`, `nodes`) to reach the field value. An incorrect path here will cause columns to render empty even when data is present. See the [Arranger table configuration docs](https://docs.overture.bio/docs/core-software/arranger/usage/arranger-components#table-configuration-tablejson) for the expected structure. ### Step 5: Check Stage and the Browser diff --git a/website/workshop/08-Portal-Customization.md b/website/workshop/08-Portal-Customization.md index 2ed24b98..13cd5f7f 100644 --- a/website/workshop/08-Portal-Customization.md +++ b/website/workshop/08-Portal-Customization.md @@ -3,7 +3,6 @@ id: portal-customization title: Portal Customization sidebar_position: 8 description: Customize the portal's branding, color theme, and navigation, and add multiple data exploration pages. -draft: true --- # Portal Customization @@ -215,7 +214,7 @@ The platform is considerably more flexible than what this workshop covers. Two c **QuickSearch (purple box)** adds a type-ahead search input to a data table that lets users find records by typing a field value directly, rather than browsing facet filters. It works by adding edge n-gram tokenization to the Elasticsearch mapping and enabling the feature in Arranger's `extended.json` and `facets.json`. It's well-suited to datasets where users already know the identifier they're looking for (a gene name, sample ID, etc.). -![](/img/demo-portal-cross-table.webp) +![](./images/demo-portal-cross-table.webp) **Cross-table search (green box)** allows a selection made in one data table to propagate as a filter in another, using a shared identifier across heterogeneous datasets. For example, selecting a gene in one table can automatically filter a second table to show only records that share that gene. This is particularly useful for multi-omics or linked clinical and molecular datasets. @@ -230,4 +229,4 @@ At this point you should understand: 3. Where theme colors and fonts are defined (`apps/stage/components/theme/`) 4. 
That adding a new data table requires a component, a page route, environment variables, and an Arranger service -**Extension Task:** If you have time or want to go further, the [Extension Task](./10-Extension-Task.md) introduces schema-validated data submission using Lectern and Lyric — useful if your data is hierarchical or requires enforced data quality before it reaches the portal. +**Extension Task:** If you have time or want to go further, the [Extension Task](./10-Extension-Task.md) introduces schema-validated data submission using Lectern and Lyric; this is useful if your data is hierarchical or requires enforced data quality before it reaches the portal. diff --git a/website/workshop/09-Next-Steps.md b/website/workshop/09-Next-Steps.md index ae1aadc6..2f422878 100644 --- a/website/workshop/09-Next-Steps.md +++ b/website/workshop/09-Next-Steps.md @@ -3,7 +3,6 @@ id: next-steps title: Next Steps sidebar_position: 9 description: Resources, commands reference, and guidance for adapting the portal to your own data and extending the platform. -draft: true --- # Next Steps @@ -17,7 +16,7 @@ You now have a locally running data discovery portal with: - **Arranger:** providing a GraphQL search API and configurable UI components - **Stage:** rendering a browser-based portal with faceted search, sortable tables, and data export -![Architecture Diagram](/img/workshop-architecture-diagram.webp "Phase 1 Architecture Diagram") +![Architecture Diagram](./images/workshop-architecture-diagram.webp "Phase 1 Architecture Diagram") You've seen how to generate configuration files from CSV data, wire services together through Docker Compose, load data with Conductor, customize the portal's search interface and appearance, and understand the deployment architecture for making portals network-accessible. @@ -38,7 +37,7 @@ Setting up and maintaining this infrastructure typically requires a system admin The search and exploration stack used in this workshop is part of the broader Overture platform. Depending on your needs, you can extend it with: -![Platform Integration](/img/overture-platform-overview.webp) +![Platform Integration](./images/overture-platform-overview.webp) | Need | Overture Service | What It Does | | -------------------------- | ---------------- | ------------------------------------------------------------------ | @@ -63,7 +62,7 @@ In addition to this our team is building a **Conversational Data Discovery (CDD) - **Understanding what data is available** without being hardcoded to a specific schema. - **Translating natural language questions into validated queries.** A researcher asks "how many samples have a TP53 mutation?" and the model constructs the correct filter against your specific field names. - **Grounding its responses in your actual data** rather than hallucinating field names or value ranges, it reads the schema before generating any query. -- **Executing analysis code in a sandboxed workspace.** Once a query is confirmed, the model can generate Python to analyse and visualise results, running it in an isolated container where the researcher approves every step. +- **Executing analysis code in a sandboxed workspace.** Once a query is confirmed, the model can generate Python to analyze and visualize results, running it in an isolated container where the researcher approves every step. :::info **Why self-hosted models:** Research data is often sensitive. Routing queries through commercial AI providers is not viable for many research contexts.
The CDD platform can be used to run capable models, adjacent to the data, with full control over the stack. @@ -77,7 +76,7 @@ Whether you're adapting the platform to your own data, running into issues after If you have a few minutes, we'd appreciate your feedback on the workshop. Your responses help us improve the content, pacing, and hands-on exercises for future sessions. -**[Fill in the post-workshop survey →](#)** +**[Fill in the post-workshop survey →](https://docs.google.com/forms/d/1iPKthD-jZRd3CN5ZFV_55mAPhNWSSzjN40qq-9zbnDM/edit)** ### Resources diff --git a/website/workshop/10-Extension-Task.md b/website/workshop/10-Extension-Task.md index a0533c24..2515c200 100644 --- a/website/workshop/10-Extension-Task.md +++ b/website/workshop/10-Extension-Task.md @@ -2,177 +2,170 @@ id: extension-task title: Extension Task sidebar_position: 10 -description: Extend the portal with structured data submission using Lectern data dictionaries and the Lyric tabular submission service. -draft: true +description: Use the Dictionary Playground to design a Lectern data dictionary with typed fields, controlled vocabularies, and multi-schema relationships — and see how it connects to submission validation and search indexing. --- -# Data Submission +# Extension Task -The workshop uses flat tabular data: each CSV row is an independent record. This works well for many datasets, but some research data is inherently hierarchical: a patient has multiple specimens, each specimen has multiple samples, each sample has multiple assay results. Flattening this into a single table means repeating parent fields on every row, which gets unwieldy and makes it harder to enforce consistency across related records. +Before a single record is submitted, someone has to decide: _what fields exist, what types they hold, what values are permitted, and which are required_. That design work is biocuration, and without a formal tool to capture it, those decisions live in informal documents, shared spreadsheets, and institutional memory. When submitters inevitably interpret things differently, the inconsistencies surface only after data has been collected. -If your data fits that pattern, or if you need a defined schema that enforces data quality, a submission workflow that validates records before they reach the database, and an audit trail of what was submitted and when, the right approach is a schema-validated submission workflow. This extension introduces two Overture services that provide exactly that: **Lectern** for data dictionary management and **Lyric** for schema-validated tabular data submission. +Lectern is Overture's data dictionary management service. It gives those design decisions a persistent, versioned, machine-readable home. Controlled vocabularies become enforced constraints. Schema changes are tracked as diffs rather than email threads. And the same dictionary that guides submitters can drive automated validation at ingestion time. + +The portal includes a **Dictionary Playground**: a live editor where you can write a Lectern schema and see it rendered as an interactive table in real time, with no server required. :::info -This is an extension task for participants who have completed the core workshop. It introduces concepts and tooling that build on the Arranger and Stage stack you already have running. +This is an extension task for participants who have completed the core workshop. 
::: -### Why Structured Submission Matters +## Core Concepts + +A Lectern **dictionary** groups one or more **schemas** together. Each schema represents a data entity (a concept like `donor` or `specimen`). Within a schema, each **field** specifies: + +- **Name:** the field identifier (e.g. `specimen_id`) +- **Value type:** `string`, `integer`, `number`, or `boolean` +- **Restrictions:** required status, permitted values (`codeList`), regex patterns, or numeric ranges + +Schemas within a dictionary can reference each other through shared identifier fields, expressing a hierarchy (a donor has specimens) without flattening everything into a single table. + +## Step 1: Open the Playground + +Navigate to **http://localhost:3000/dictionary/playground**. + +The playground opens with a demo dictionary. Clear the editor and replace it with this minimal starting point, a single `donor` entity with one required field: + +```json +{ + "name": "my-dictionary", + "version": "1.0", + "schemas": [ + { + "name": "donor", + "description": "Core donor record", + "fields": [ + { + "name": "donor_id", + "valueType": "string", + "description": "Unique identifier for the donor", + "restrictions": { "required": true } + } + ] + } + ] +} +``` + +The live preview on the right updates as you type. The validation bar below the editor label shows whether your JSON is a valid Lectern dictionary. -Loading CSVs directly is fast, but it puts the burden of data quality on the person preparing the file. There is nothing stopping someone from submitting the wrong field types, missing required values, or using inconsistent terminology. Over time, this erodes the reliability of the data in the portal. +## Step 2: Add Fields with Restrictions -A schema-validated submission workflow inverts this: the schema is defined once, centrally, and every submission is checked against it before a single record reaches the database. Researchers receive immediate, field-level feedback on what is wrong and why, rather than discovering problems later when data appears malformed in the portal. +Restrictions are where the curation work lives. Extend the `fields` array with three more fields, each demonstrating a different restriction type: -### Lectern: Data Dictionaries +```json +"fields": [ + { + "name": "donor_id", + "valueType": "string", + "description": "Unique identifier for the donor", + "restrictions": { + "required": true, + "regex": "^DO-[0-9]{3,}$" + } + }, + { + "name": "sex", + "valueType": "string", + "description": "Biological sex of the donor", + "restrictions": { + "required": true, + "codeList": ["Female", "Male", "Other", "Unknown"] + } + }, + { + "name": "age_at_diagnosis", + "valueType": "integer", + "description": "Age in years at time of primary diagnosis", + "restrictions": { + "range": { "min": 0, "max": 120 } + } + }, + { + "name": "primary_diagnosis", + "valueType": "string", + "description": "Primary disease or condition", + "restrictions": { "required": true } + } +] +``` -Lectern is Overture's data dictionary management service. A **data dictionary** is a formal specification of what your data looks like: the schemas it contains, the fields each schema defines, the type and constraints on each field, and the relationships between schemas. +Watch the preview update. 
Notice how each restriction type renders differently in the table: -#### What a Schema Defines +| Restriction | Field | What it enforces | +| ---------------- | ------------------ | -------------------------------------------------------- | +| `required: true` | `donor_id`, `sex` | Field must be present and non-empty in every record | +| `regex` | `donor_id` | Value must match `DO-` followed by at least three digits | +| `codeList` | `sex` | Value must be one of the listed options | +| `range` | `age_at_diagnosis` | Integer must fall between 0 and 120 inclusive | -Each schema in a Lectern dictionary corresponds to a data entity: a concept like `participant`, `sample`, or `clinical_record`. Within a schema, each field specifies: +The `codeList` restriction is one of the most immediately useful tools in biocuration: it turns a naming convention into an enforced constraint. Every submission is checked against the list; values that don't match are rejected before they enter the system. -- **Name:** the field identifier (e.g. `sample_id`) -- **Value type:** `string`, `integer`, `number`, `boolean` -- **Is array:** whether the field holds a single value or a list -- **Restrictions:** required status, permitted values (`codeList`), regex patterns, numeric ranges, or conditional logic +## Step 3: Add a Second Schema -
-Example: a minimal Lectern schema +Dictionaries model relationships, not just individual entities. Add a `specimen` schema that references the donor it was collected from via a shared `donor_id` field: ```json { - "name": "sample", - "description": "Core sample metadata for submitted records", + "name": "specimen", + "description": "Specimen collected from a donor", "fields": [ { - "name": "sample_id", + "name": "specimen_id", "valueType": "string", - "description": "Unique identifier for the sample", + "description": "Unique identifier for the specimen", "restrictions": { "required": true } }, { - "name": "tissue_type", + "name": "donor_id", "valueType": "string", - "description": "Tissue of origin", + "description": "Identifier of the donor this specimen was collected from", + "restrictions": { "required": true } + }, + { + "name": "specimen_type", + "valueType": "string", + "description": "Classification of the specimen", "restrictions": { "required": true, - "codeList": ["blood", "bone marrow", "lymph node", "tissue"] + "codeList": [ + "Primary tumour", + "Recurrent tumour", + "Metastatic tumour", + "Normal - solid tissue", + "Normal - blood derived" + ] } - }, - { - "name": "age_at_diagnosis", - "valueType": "integer", - "description": "Patient age in years at time of diagnosis", - "restrictions": { "range": { "min": 0, "max": 120 } } } ] } ``` -
- -A dictionary groups one or more schemas together and is versioned; Lectern tracks changes between versions so you can understand what changed between schema releases and communicate updated requirements to data submitters. +Add this object as a second entry in the `schemas` array. The preview now shows two schema tables. The `donor_id` field appears in both; this shared identifier is how hierarchy is expressed: specimens reference donors without collapsing them into a single row. -#### The Lectern UI +This is the difference between a flat CSV and a relational data model. -The **Lectern Viewer** is a web-based interface for exploring and communicating data dictionary requirements. Rather than asking submitters to read raw JSON schema definitions, the viewer renders dictionaries as interactive, searchable tables with expandable validation rules, conditional logic explanations, and downloadable CSV templates pre-formatted for the current schema. - -:::info -The Lectern Viewer is currently in active development. It is being built initially for the Pan-Canadian Genome Library (PCGL) and will be available as a standalone component that any Overture deployment can integrate. -::: - -Key features of the viewer: - -- **Interactive schema tables:** field-by-field documentation with types, restrictions, and examples -- **Version switching:** compare dictionary versions and see what changed between releases -- **Template downloads:** generate CSV templates from the current schema to guide submitters -- **Relationship diagrams:** visualize how schemas connect through shared identifiers - -#### Adding Lectern to Your Stack - -Lectern requires MongoDB to store dictionaries. Add both to your `docker-compose.yml`: - -```yaml showLineNumbers -lectern: - image: ghcr.io/overture-stack/lectern:latest - ports: - - "3000:3000" - environment: - MONGO_HOST: lectern-mongo - MONGO_PORT: 27017 - MONGO_DB: lectern - depends_on: - - lectern-mongo - -lectern-mongo: - image: mongo:latest - ports: - - "27017:27017" - environment: - MONGO_INITDB_ROOT_USERNAME: admin - MONGO_INITDB_ROOT_PASSWORD: password -``` +## What Comes Next -Once running, the Lectern API is available at `http://localhost:3000` and the Swagger UI at `http://localhost:3000/api-docs`. Use the Swagger UI to create your first dictionary by posting a schema definition. +The playground validates and previews dictionaries entirely in the browser. To put a dictionary to work (driving submission validation and appearing in the portal's Dictionary Viewer), you connect it to the rest of the Overture stack. -### Lyric: Tabular Data Submission +### Lyric: Submission and Validation -Lyric is Overture's tabular data submission service. It sits between data submitters and the database, validating every submission against a Lectern dictionary before any data is committed. +[Lyric](https://docs.overture.bio/docs/core-software/lyric/overview) is Overture's data submission service. Once a dictionary is published to a running Lectern server, Lyric uses it to validate incoming records at submission time. Every field restriction you defined (required fields, code lists, regex patterns, numeric ranges) becomes an automated check that runs before a record is accepted. Submitters get immediate, field-level feedback rather than discovering inconsistencies after the fact. -#### How the Submission Workflow Works +### Maestro: Indexing for Search -Lyric uses a **staged submission** model: data is uploaded and validated incrementally before being committed. 
This allows submitters to catch and fix errors across multiple uploads rather than needing a perfectly complete file upfront. +[Maestro](https://docs.overture.bio/docs/core-software/maestro/overview) is Overture's indexing service. Once validated data is in the system, Maestro transforms it into Elasticsearch indices that power the portal's search and exploration features. The schema you defined in Lectern informs how fields are indexed; typed fields, controlled vocabularies, and relationships carry through from the dictionary into the search layer. -1. **Upload:** submit a TSV file against a named schema in a registered Lectern dictionary -2. **Validate:** Lyric checks every row and field against the schema rules: required fields, code lists, ranges, conditional logic -3. **Review:** field-level validation errors are returned immediately; submitters can correct and re-upload -4. **Commit:** once validation passes, data is committed to Lyric's PostgreSQL database -5. **Index:** Maestro picks up the publication event and indexes the new records into Elasticsearch, making them searchable in the portal +Together, Lectern -> Lyric -> Maestro form the data pipeline: you define the schema, submitters upload against it, and the resulting data becomes searchable in the portal. -#### The Submission UI - -:::info -Lyric's submission UI is currently under development. It will provide a browser-based interface for submitting, reviewing, and managing data without needing to interact with the API directly, lowering the barrier for researchers who are not comfortable with command-line tools. +:::tip +The full Lectern schema reference (including all restriction types, versioning, and foreign key syntax) is documented at [docs.overture.bio](https://docs.overture.bio/docs/core-software/Lectern/dictionaryReference). ::: - -Until the UI is available, submissions are made via Lyric's REST API, documented through Swagger at `http://localhost:3232/api-docs` by default. - -#### Adding Lyric to Your Stack - -Lyric uses the same PostgreSQL instance as the rest of the workshop stack. Add it to your `docker-compose.yml`: - -```yaml showLineNumbers -lyric: - image: ghcr.io/overture-stack/lyric:latest - ports: - - "3232:3232" - environment: - LECTERN_URL: http://lectern:3000 - POSTGRES_HOST: postgres - POSTGRES_PORT: 5432 - POSTGRES_DB: lyric - POSTGRES_USER: postgres - POSTGRES_PASSWORD: password - depends_on: - - lectern - - postgres -``` - -### How It Fits Together - -With Lectern and Lyric added, the full submission-to-discovery flow looks like this: - -1. **Define your schema** in Lectern: field names, types, required fields, permitted values -2. **Submitters upload data** through Lyric, which validates each row against the Lectern schema -3. **On commit**, Lyric stores records in PostgreSQL -4. **Maestro indexes** the committed data into Elasticsearch -5. **Arranger and Stage** make the indexed data searchable in the portal, exactly as in the core workshop - -The portal itself does not change. What changes is the path data takes to get there: instead of a CSV loaded directly by Conductor, data arrives through a validated, audited submission workflow with a defined schema enforcing consistency at every step. 
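Tying this back to day-to-day curation: the same code lists that drive Lyric's validation can be checked locally before a file is ever submitted. A sketch, assuming a tab-separated file with a header row and using a placeholder file name:

```bash
FILE=donors.tsv   # placeholder file name
# Resolve the "sex" column position from the header row instead of hardcoding it.
COL=$(head -1 "$FILE" | tr '\t' '\n' | grep -nx 'sex' | cut -d: -f1)
# Print any distinct values outside the schema's code list; no output means all values conform.
tail -n +2 "$FILE" | cut -f"$COL" | sort -u | grep -vxE 'Female|Male|Other|Unknown'
```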
- -### Further Reading - -- [Lectern documentation](https://docs.overture.bio/docs/under-development/lectern/) -- [Building Lectern dictionaries](https://docs.overture.bio/docs/core-software/Lectern/dictionaryReference) -- [Lyric documentation](https://docs.overture.bio/docs/under-development/lyric/) -- [Maestro documentation](https://docs.overture.bio/docs/core-software/Maestro/overview) -- [Overture support forum](https://github.com/overture-stack/roadmap/discussions/categories/support) diff --git a/website/static/img/basicPortal.webp b/website/workshop/images/basicPortal.webp similarity index 100% rename from website/static/img/basicPortal.webp rename to website/workshop/images/basicPortal.webp diff --git a/website/workshop/images/config-generator-page.webp b/website/workshop/images/config-generator-page.webp new file mode 100644 index 00000000..a0676637 Binary files /dev/null and b/website/workshop/images/config-generator-page.webp differ diff --git a/website/static/img/conversational-data-discovery-mockup.webp b/website/workshop/images/conversational-data-discovery-mockup.webp similarity index 100% rename from website/static/img/conversational-data-discovery-mockup.webp rename to website/workshop/images/conversational-data-discovery-mockup.webp diff --git a/website/workshop/images/csv-upload-area.webp b/website/workshop/images/csv-upload-area.webp new file mode 100644 index 00000000..939f74ae Binary files /dev/null and b/website/workshop/images/csv-upload-area.webp differ diff --git a/website/static/img/demo-portal-cross-table.webp b/website/workshop/images/demo-portal-cross-table.webp similarity index 100% rename from website/static/img/demo-portal-cross-table.webp rename to website/workshop/images/demo-portal-cross-table.webp diff --git a/website/static/img/demo-search-and-aggregation.webm b/website/workshop/images/demo-search-and-aggregation.webm similarity index 100% rename from website/static/img/demo-search-and-aggregation.webm rename to website/workshop/images/demo-search-and-aggregation.webm diff --git a/website/static/img/documentation.webp b/website/workshop/images/documentation.webp similarity index 100% rename from website/static/img/documentation.webp rename to website/workshop/images/documentation.webp diff --git a/website/workshop/images/generated-output-panel.webp b/website/workshop/images/generated-output-panel.webp new file mode 100644 index 00000000..76d01112 Binary files /dev/null and b/website/workshop/images/generated-output-panel.webp differ diff --git a/website/static/img/homepage.webp b/website/workshop/images/homepage.webp similarity index 100% rename from website/static/img/homepage.webp rename to website/workshop/images/homepage.webp diff --git a/website/static/img/overture-platform-overview.webp b/website/workshop/images/overture-platform-overview.webp similarity index 100% rename from website/static/img/overture-platform-overview.webp rename to website/workshop/images/overture-platform-overview.webp diff --git a/website/workshop/images/workshop-architecture-diagram.webp b/website/workshop/images/workshop-architecture-diagram.webp new file mode 100644 index 00000000..6a5d6be6 Binary files /dev/null and b/website/workshop/images/workshop-architecture-diagram.webp differ diff --git a/website/workshop/images/workshop-portal-preview.webp b/website/workshop/images/workshop-portal-preview.webp new file mode 100644 index 00000000..6a64b7da Binary files /dev/null and b/website/workshop/images/workshop-portal-preview.webp differ diff --git 
a/website/workshopSidebars.ts b/website/workshopSidebars.ts index 272c5ae2..d0f8a429 100644 --- a/website/workshopSidebars.ts +++ b/website/workshopSidebars.ts @@ -3,56 +3,16 @@ import type { SidebarsConfig } from "@docusaurus/plugin-content-docs"; const sidebars: SidebarsConfig = { workshopSidebar: [ "prerequisites", - { - type: "html", - value: - 'pending Running the Demo', - }, - { - type: "html", - value: - 'pending Architecture', - }, - { - type: "html", - value: - 'pending Data Preparation', - }, - { - type: "html", - value: - 'pending Generating Configurations', - }, - { - type: "html", - value: - 'pending Docker Configuration', - }, - { - type: "html", - value: - 'pending Loading Data', - }, - { - type: "html", - value: - 'pending Troubleshooting', - }, - { - type: "html", - value: - 'pending Portal Customization', - }, - { - type: "html", - value: - 'pending Next Steps', - }, - { - type: "html", - value: - 'pending Extension Task', - }, + "running-the-demo", + "architecture", + "data-preparation", + "generating-configurations", + "docker-configuration", + "loading-data", + "troubleshooting", + "portal-customization", + "next-steps", + "extension-task", ], };