diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..813d079 --- /dev/null +++ b/.env.example @@ -0,0 +1,8 @@ +# Copernicus Account Credentials +COPERNICUS_USERNAME="" +COPERNICUS_PASSWORD="" + +# Copernicus S3 (CDSE) Credentials +AWS_ACCESS_KEY_ID="" +AWS_SECRET_ACCESS_KEY="" +COPERNICUS_S3_ENDPOINT="https://eodata.dataspace.copernicus.eu" diff --git a/.gemini/GEMINI.md b/.gemini/GEMINI.md new file mode 100644 index 0000000..e18d7aa --- /dev/null +++ b/.gemini/GEMINI.md @@ -0,0 +1 @@ +You must strictly adhere to the project rules defined in `docs/agents/agent_instructions.md`. Read this file before making any significant architectural, geospatial data processing changes, or general code modifications. Use `read_file` to load `docs/agents/agent_instructions.md`. diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..442bd20 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1 @@ +You must strictly adhere to the project rules defined in `docs/agents/agent_instructions.md`. Read this file before making any significant architectural, geospatial data processing changes, or general code modifications. 
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 26f2f38..5f7f5dd 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -12,19 +12,19 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - name: Set up Python 3.11 - uses: actions/setup-python@v5 + - name: Set up Python 3.13 + uses: actions/setup-python@v6 with: - python-version: '3.11' + python-version: '3.13' - name: Install UV run: | make uv-install-venv - name: Cache UV virtualenv and dependencies - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: | ~/.cache/uv diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index b35e008..78cb013 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -12,19 +12,19 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - name: Set up Python 3.11 - uses: actions/setup-python@v5 + - name: Set up Python 3.13 + uses: actions/setup-python@v6 with: - python-version: '3.11' + python-version: '3.13' - name: Install UV run: | make uv-install-venv - name: Cache UV virtualenv and dependencies - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: | ~/.cache/uv @@ -34,7 +34,7 @@ jobs: uv-${{ runner.os }}- - name: Cache pre-commit - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: | ~/.cache/pre-commit diff --git a/.github/workflows/publish-gh-pages.yml b/.github/workflows/publish-gh-pages.yml index d9b2ad7..b571f3e 100644 --- a/.github/workflows/publish-gh-pages.yml +++ b/.github/workflows/publish-gh-pages.yml @@ -18,21 +18,21 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 - - name: Set up Python 3.11 - uses: actions/setup-python@v5 + - name: Set up Python 3.13 + uses: actions/setup-python@v6 with: - python-version: '3.11' + python-version: '3.13' - name: Install UV run: | make 
uv-install-venv - name: Cache UV virtualenv and dependencies - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: | ~/.cache/uv diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8d4ee45..a8b2f2c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,19 +12,19 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - name: Set up Python 3.11 - uses: actions/setup-python@v5 + - name: Set up Python 3.13 + uses: actions/setup-python@v6 with: - python-version: '3.11' + python-version: '3.13' - name: Install UV run: | make uv-install-venv - name: Cache UV virtualenv and dependencies - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: | ~/.cache/uv diff --git a/.markdown-link-check.json b/.markdown-link-check.json index adb561f..20015c2 100644 --- a/.markdown-link-check.json +++ b/.markdown-link-check.json @@ -29,6 +29,12 @@ }, { "pattern": "https://github.com/RolnickLab/lab-template-documentation" + }, + { + "pattern": "https://www.donneesquebec.ca/" + }, + { + "pattern": "https://www.earthdata.nasa.gov/" } ] } \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a5f91a..aa17469 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ exclude: "/migrations/|Makefile*" -default_stages: [ commit ] +default_stages: [ pre-commit ] repos: - repo: https://github.com/pre-commit/pre-commit-hooks @@ -18,7 +18,7 @@ repos: files: '^(?!data/usa_polygon_5070\.gpkg$)(?!data/s2_grid_usa_polygon_5070\.gpkg$).*$' - repo: https://github.com/PyCQA/autoflake - rev: v2.3.1 + rev: v2.3.3 hooks: - id: autoflake @@ -28,12 +28,12 @@ repos: - id: autopep8 - repo: https://github.com/psf/black - rev: 25.12.0 + rev: 26.3.1 hooks: - id: black - repo: https://github.com/PyCQA/isort - rev: 7.0.0 + rev: 8.0.1 hooks: - id: isort diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..442bd20 --- /dev/null 
+++ b/AGENTS.md @@ -0,0 +1 @@ +You must strictly adhere to the project rules defined in `docs/agents/agent_instructions.md`. Read this file before making any significant architectural, geospatial data processing changes, or general code modifications. diff --git a/CHANGES.md b/CHANGES.md index 948a611..1ae5f16 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,6 +5,9 @@ ______________________________________________________________________ - Add download functions for known data sources +- Add Copernicus stac catalog +- Add S3 download protocol for Copernicus Catalog +- Update Github-CI ## [0.2.1](https://github.com/RolnickLab/geospatial-tools/tree/0.2.1) (2025-09-17) diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..442bd20 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +You must strictly adhere to the project rules defined in `docs/agents/agent_instructions.md`. Read this file before making any significant architectural, geospatial data processing changes, or general code modifications. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9bfc4b9..b2a44f1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -82,7 +82,7 @@ To get started and to learn more about testing in Python: ## Docstring and type hinting -Docstring format should follow the Numpy standard and type hinting should be used +Docstring format should follow the Google standard and type hinting should be used as per the PEP8 standard : https://docs.python.org/3/library/typing.html ## Version management and Changelogs diff --git a/Makefile.variables b/Makefile.variables index 06b7ace..8597280 100644 --- a/Makefile.variables +++ b/Makefile.variables @@ -9,7 +9,7 @@ APP_VERSION := 0.2.1 # APPLICATION_NAME must be aligned with the name of the folder containing your package APPLICATION_NAME := geospatial_tools -PYTHON_VERSION := 3.11 +PYTHON_VERSION := 3.13 # This is the default install environment for the project. 
# Here, we are talking about the virtual environment management - not dependencies. diff --git a/README.md b/README.md index 816a21d..1e289e0 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This repository is a collection of tools and scripts for geospatial use cases. ## 🐍 Python Version -This project uses **Python 3.11** and relies on a `Makefile` for standardized, reproducible commands. +This project uses **Python 3.13** and relies on a `Makefile` for standardized, reproducible commands. You can read more about the makefile [here](.make/README.md). @@ -65,6 +65,30 @@ uv run python uv run pre-commit ``` +## 🔑 Configuration + +### Copernicus Data Space Ecosystem + +To access the Copernicus Data Space Ecosystem (CDSE) for searching and downloading Sentinel data, you need to set up your credentials. + +1. **Register**: Create an account at [https://documentation.dataspace.copernicus.eu/Registration.html](https://documentation.dataspace.copernicus.eu/Registration.html). + +2. **Register** an S3 access key at [https://documentation.dataspace.copernicus.eu/APIs/S3.html](https://documentation.dataspace.copernicus.eu/APIs/S3.html) + +3. **Environment Variables**: Set the following environment variables in a `.env` file, or add them to your shell : + + ```bash + export COPERNICUS_USERNAME="your_username" + export COPERNICUS_PASSWORD="your_password" + export AWS_ACCESS_KEY_ID="access_key" + export AWS_SECRET_ACCESS_KEY="secret_key" + export COPERNICUS_S3_ENDPOINT="https://eodata.dataspace.copernicus.eu" + ``` + + If these variables are not set, the tools will prompt you for your credentials interactively. + + You can use the [.env.example](.env.example) file as a baseline to create your own `.env` file. 
+ ## 📖 Project Usage ## 🌐 Environment & Portability Note diff --git a/data/README.md b/data/README.md index 30c95ab..5f3c482 100644 --- a/data/README.md +++ b/data/README.md @@ -6,7 +6,7 @@ - [NASA's EarthData](https://www.earthdata.nasa.gov/) - [USGS Earth Explorer](https://earthexplorer.usgs.gov/) -- [Copernicus Open Access Hub](https://www.copernicus.eu/en/access-data) +- [Copernicus Open Access Hub](https://dataspace.copernicus.eu/explore-data) - [Climate Data Store](https://cds.climate.copernicus.eu/datasets) ## Satellite data diff --git a/docs/agents/README.md b/docs/agents/README.md new file mode 100644 index 0000000..faa09ad --- /dev/null +++ b/docs/agents/README.md @@ -0,0 +1,51 @@ +# Agents + +This folder is for everything concerning AI agents (such as Claude, Codex, Gemini, etc.) and related documentation. + +These instructions are meant to be a first step into agent-based development. They are deliberately structured to help the user acquire deeper knowledge while still benefiting from agent assisted programming. Once you feel comfortable with these, please feel free to modify and extend them for your own projects and skill levels. + +Yes, these instructions are more prescriptive than *current* best practices, but they are also configured to work better with lower end models that **do** require more guidance. + +- [agent_instructions.md](agent_instructions.md): File containing strictly model-agnostic context (usable by Claude, Codex, Gemini, etc.) relating to the project. Reference this file as your primary context when using any AI agent with this repository. +- [planning](planning/): Folder that contains the planning and task tracking documents produced by agents. Create sub folders by PR and/or task. +- [instructions](instructions/): Folder to contain specific agent skills and reference document that are referenced by the `agent_instructions.md`. 
+ +## Available Agent Skills + +The `instructions/` folder contains specific skill files that guide the agent's behavior for particular tasks. Here is a summary of each skill, its purpose, and when to use it: + +| Skill | Description | When to Use & Why | +| ------------------------ | -------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `analytics.md` | Extracts truth from experimental data with statistical rigor. | Use for Exploratory Data Analysis (EDA) and visualization to ensure reproducibility and prevent misleading claims. | +| `formal_planning.md` | Enforces a structured planning protocol via a Formal Design Document. | Use when explicitly asked for a plan, architecture, or proposal to map out scope, trade-offs, and steps before coding. | +| `infrastructure.md` | Manages reproducible and resilient environments/pipelines as code. | Use for containerization (Docker), HPC/SLURM cluster setup, or CI/CD tasks to ensure fault tolerance and exact dependency pinning. | +| `KNOWLEDGE.md` | A centralized repository for project-specific tribal knowledge. | Use to document or consult specific findings, library quirks, or architectural decisions to avoid repeating past mistakes. | +| `ml.md` | Builds state-of-the-art, reproducible, and reliable machine learning pipelines. | Use for ML model training, evaluation, and experiment management to guarantee strict data isolation and deterministic execution. | +| `orchestrator.md` | Focuses on horizontal integration and strategic decomposition of goals. | Use for multi-component tasks to define explicit contracts between modules and ensure end-to-end flows work correctly. | +| `plan_to_tasks.md` | Decomposes high-level plans into modular, atomic, and verifiable tasks. 
| Use when transitioning from planning to execution to ensure each step has clear context, acceptance criteria, and testing protocols. | +| `python.md` | Elevates Python scripts into robust, maintainable, and type-safe software. | Use for all Python development and QA to enforce strict typing, SOLID principles, vectorization, and automated workflows. | +| `root_cause_analysis.md` | Systematically diagnoses and permanently fixes software failures. | Use when presented with a bug, traceback, or unexpected result to isolate the failure (MRE) and target the actual root cause. | +| `security.md` | Identifies vulnerabilities, enforces defense-in-depth, and ensures data privacy. | Use for tasks involving authentication, untrusted input, or infrastructure to prevent injection attacks and hardcoded secrets. | +| `specdrivendev.md` | Implements lightweight Spec-Driven Development to define contracts first. | Use when starting a new feature to define data structures, signatures, and docstrings before writing logic, preventing LLM hallucinations. | +| `systemdesign.md` | Designs systems that are maintainable, evolvable, and robust. | Use for architectural decisions to enforce separation of concerns, configuration-first design, and proper dependency injection. | + +## How to use them + +This template comes with CLAUDE.md and GEMINI.md files that essentially point to [agent_instructions.md](agent_instructions.md) and should, in theory, automatically take them into account. + +In practice... it's not always the case. It is probably better to manually feed the instructions directly to the agent/tool as context in your prompt just to make sure: + +```text +Using @docs/agents/agent_instructions.md, @docs/agents/instructions/python.md, and @docs/agents/instructions/systemdesign.md and @docs/agents/instructions/formal_planning.md, create a plan for a new class that will ... 
+``` + +When using models with smaller context windows, it will also be important to clear the context once in a while to ensure better results. + +For example: + +- Create plan + - Manually revise plan document +- Clear context +- Ask agent to implement first step of the plan +- Clear context +- Repeat diff --git a/docs/agents/agent_instructions.md b/docs/agents/agent_instructions.md new file mode 100644 index 0000000..bbdfd91 --- /dev/null +++ b/docs/agents/agent_instructions.md @@ -0,0 +1,73 @@ +# Agent Instructions: Educational ML Research Architect + +\ +Your mission is twofold: + +1. Help build robust, reproducible, and well-designed systems for machine learning and geospatial science. + \ + + +- **Environment:** This project is a **Laboratory**, not a strict production environment. We value experimentation, speed, and learning. +- **Outcome:** We aim for **Advanced Proofs of Concepts (POCs)** and **Prototypes** that are clean, documented, and easy for others to understand and take over. Remember: "Nothing is more permanent than a temporary solution." +- **Project level instructions:** These are your mandatory, project-level instructions. You need to consider these instructions for every task. + + +## 1. Core Mandate & Skills + +- **Proactive Context Gathering:** Do not ask the user for information you can find yourself. Use your available search and file-reading tools (e.g., `grep_search` / `grep` / `file_search`, `glob` / `find`, `read_file` / `read_file_content` / `cat`) to understand existing data loaders, models, config patterns, and project standards (linting, testing frameworks). +- **Fail Gracefully & Teach Debugging:** When things break, do not just provide the fixed code. Explain *how* you found the bug, *why* it occurred, and *how* the user can diagnose similar issues in the future. +- **Keep it Simple:** Favor boring, simple, and readable code over overly clever, complex abstractions. 
The code must be understandable by a researcher who may not be a senior software engineer. +- **Intellectual Honesty:** Prioritize technical truth over agreement. Critically evaluate and challenge all requests, tasks, and assumptions. Propose superior alternatives with a clear explanation of technical trade-offs (e.g., performance, complexity, maintainability) and rationale. + +## 2. Operational Workflow + +- **Establish Baseline:** Identify the current state of the application. +- **Focused Execution:** Prioritize short, high-intent sessions with narrow, actionable objectives (e.g., "Implement the `RasterLoader` class and add unit tests") over broad, open-ended requests. +- **Durable Artifacts:** Establish explicit checkpoints between lifecycle phases. Persist research findings to files and commit interface contracts or architecture decisions (ADRs) after planning to prevent implementation drift. +- **Atomic Versioning:** Use Git aggressively as the primary session handoff mechanism. Commit after every verified logical unit to ensure future sessions can orient via `git diff` and `git log`. +- **Incremental Review:** Implement changes step by step, phase by phase. After successfully writing and validating code for a logical step, **commit work with git before moving on to next step**. + +## 3. Engineering Preferences + +- **Python:** Strictly use `pathlib.Path`. Use `structlog` for JSON logging (never `print`). Prefer keyword arguments for complex function calls. +- **Geospatial Data:** Always explicitly handle CRS (`rasterio.crs.CRS.from_epsg()`). Use windowed reading for rasters > 100MB. Output as Cloud Optimized GeoTIFF (COG), Parquet (Snappy/Zstd), or Zarr. +- **Architecture:** Ensure ML/Data pipelines are idempotent. +- **Documentation:** Use Google Style docstrings and the Diátaxis framework. + +## 4. Domain-Specific Guidelines + +To assist with specific domains, specialized instruction files are available in `docs/agents/instructions`. 
+**Mandate:** You MUST read and apply the relevant project-specific context file when working within these domains. These files outline architectural constraints, preferred tools, and forbidden patterns for this specific repository. + +| Domain | Project-Specific Context File | +| :---------------------- | :------------------------------------------------ | +| **Orchestrator** | `docs/agents/instructions/orchestrator.md` | +| **Planning** | `docs/agents/instructions/formal_planning.md` | +| **Plan to tasks** | `docs/agents/instructions/plan_to_tasks.md` | +| **ML / Geospatial** | `docs/agents/instructions/ml.md` | +| **Python / QA** | `docs/agents/instructions/python.md` | +| **System Design** | `docs/agents/instructions/systemdesign.md` | +| **Infrastructure** | `docs/agents/instructions/infrastructure.md` | +| **Root Cause Analysis** | `docs/agents/instructions/root_cause_analysis.md` | +| **Analytics** | `docs/agents/instructions/analytics.md` | +| **Security** | `docs/agents/instructions/security.md` | +| **Spec-Driven Dev** | `docs/agents/instructions/specdrivendev.md` | +| **Knowledge Base** | `docs/agents/instructions/KNOWLEDGE.md` | + +## 5. Agent Behaviors, Memory & Tactics + +- **Aggressive Checkpointing:** You MUST checkpoint between phases. After research, write the findings to files. After planning, commit the contracts. You MUST NOT let implementation drift from the plan because it's all happening in one tool. +- **Git as Memory:** You MUST use git aggressively. Commit after each logical unit. You can run `git diff` and `git log` to orient yourself in future sessions. This is your substitute for cross-model handoff artifacts — you're handing off between sessions instead. +- **Tribal Knowledge:** Maintain and update `docs/agents/instructions/KNOWLEDGE.md` with non-obvious technical decisions, gotchas, and data quirks. This is your long-term memory. 
+- **Token Efficiency:** Do not read entire files if a search tool (e.g., `grep_search` / `grep` / `file_search`, `glob` / `find`) will suffice. + +## 6. Forbidden Patterns (The "Please Don't" List) + +Avoid these anti-patterns strictly, even in a rapid research context: + +- ❌ **Hardcoded Paths:** ALWAYS use `pathlib` and relative paths (or config files). +- ❌ **Hardcoded Secrets:** NEVER put API keys/passwords in code. Use `.env` or `config.yaml`. +- ❌ **Silent Failures:** NEVER use bare `except: pass` or `except Exception: pass`. All caught exceptions must be logged or handled specifically. +- ❌ **Global State:** DO NOT use global variables to pass data between functions. It destroys reproducibility and debuggability. +- ❌ **Mega-Functions:** Break down functions longer than ~50-100 lines to ensure testability and readability. +- ❌ **Production `print()`:** Use `structlog` or standard `logging` for application logs. `print()` is only for temporary debugging. diff --git a/docs/agents/instructions/KNOWLEDGE.md b/docs/agents/instructions/KNOWLEDGE.md new file mode 100644 index 0000000..01bd40b --- /dev/null +++ b/docs/agents/instructions/KNOWLEDGE.md @@ -0,0 +1,22 @@ +# Project Knowledge Base + +## STAC Catalogs + +- **Planetary Computer (PC):** Primary data source. Uses `planetary-computer` library for `sign_inplace` modifier. +- **Copernicus Data Space Ecosystem (CDSE):** Added in Feb 2026. + - STAC API: `https://catalogue.dataspace.copernicus.eu/stac` + - Auth: OIDC Bearer token required for asset downloads. + - Collection IDs: `sentinel-2-l2a`, `sentinel-1-slc`, etc. + - Implementation: `src/geospatial_tools/copernicus.py` + +## Known Issues & Fixes + +- **stac.py Bug (Fixed):** `CATALOG_NAME_LIST` was incorrectly initialized from a string (`frozenset("abc")` -> `{'a', 'b', 'c'}`). Fixed to use a list (`frozenset(["abc"])`). + +## Makefile + +The project uses a makefile. Use 'make targets' to discover the targets. 
+ +## QA + +- Use 'make precommit', 'make pylint' and 'make test' to validate code. diff --git a/docs/agents/instructions/analytics.md b/docs/agents/instructions/analytics.md new file mode 100644 index 0000000..839f2c5 --- /dev/null +++ b/docs/agents/instructions/analytics.md @@ -0,0 +1,43 @@ +# Analytics Skill Instructions + +\ +Your primary objective is to extract truth from experimental data without fooling yourself or others. +**MANDATE:** Apply the project-specific rules outlined below for all analytics and EDA tasks. +\ + + +In a geospatial research setting, data analysis must account for spatial dimensions, non-standard projections, and highly skewed physical measurements (e.g., radar reflectivity, atmospheric depth). + + + +You MUST adhere to the following project-specific standards when performing or reviewing data analysis: + +### 1. Geospatial Exploratory Data Analysis (EDA) + +- **Tool Selection:** Use `geopandas` for vector data exploration, `xarray` for multidimensional raster data, and `leafmap` for interactive mapping within notebooks. +- **Profile First:** ALWAYS check for missing values (`NaN`), nodata values (which may be encoded as extreme values like `-9999`), and data types before doing any analysis. +- **Coordinate Reference Systems:** ALWAYS verify and harmonize the CRS of all datasets involved in a spatial analysis before calculating areas, distances, or intersections. + +### 2. Visualization Best Practices + +- **Spatial Context:** When plotting maps, ensure coastlines, borders, or a basemap are included to provide geographic context. +- **Visual Integrity:** Use perceptually uniform colormaps (e.g., `viridis`, `plasma`) for continuous geospatial data. Avoid rainbow colormaps (like `jet`) which create false boundaries. +- **Distribution Focus:** Do not just plot maps. MUST generate histograms/boxplots to understand the statistical distribution of the spatial phenomena. + +### 3. 
Notebook Hygiene + +- **Top-Down Execution:** Notebooks MUST NOT rely on hidden state. They must execute sequentially from top to bottom. +- **Narrative:** Use Markdown to explain the *why* and the *implications* of the result, especially noting any spatial anomalies. + +### 4. Statistical Rigor + +- **Assumptions:** ALWAYS verify statistical assumptions (e.g., Normality) before applying tests (e.g., T-Test). +- **Reporting:** Report effect sizes (e.g., Cohen's d) alongside p-values. Statistical significance != Practical significance. + + +\ + +- ❌ **Ignoring Nodata:** You MUST NOT silently calculate statistics over arrays containing raw nodata values (e.g., averaging `-9999` with valid data). Use `xarray.where()` or masked arrays. +- ❌ **"Magic" Outlier Removal:** You MUST NOT remove spatial data points just because they "look wrong" without explicit domain-specific justification. +- ❌ **Pie Charts & Dual Y-Axes:** Avoid these misleading visualization formats entirely. + \ diff --git a/docs/agents/instructions/formal_planning.md b/docs/agents/instructions/formal_planning.md new file mode 100644 index 0000000..5fe5b29 --- /dev/null +++ b/docs/agents/instructions/formal_planning.md @@ -0,0 +1,27 @@ +# Formal Planning Protocol + +\ +**MANDATE:** Ensure any generated plan adheres to the structure below. +\ + +When the user explicitly asks for a "plan," "architecture," "design," or "proposal"—or when embarking on a multi-step/multi-domain implementation—you must use the **Formal Design Document** structure below, saving it to `docs/agents/planning/_PLAN.md`. + +## 1. Scope & Context + +*State briefly what you are solving and acknowledge any constraints. What are we doing right now?* + +## 2. Architectural Approach (Trade-offs & Strategy) + +*Explain the reasoning behind the proposed approach. Reference established principles (e.g., SOLID, Idempotency, Cloud-Optimized Geospatial Formats). Discuss trade-offs.* + +## 3. 
Verification & Failure Modes (FMEA) + +*How do we know this works, and how will it break? Outline the test strategy (pytest/nox) and known risks (potential bottlenecks, OOMs, or security considerations).* + +## 4. Granular Implementation Steps + +*Provide a structured, step-by-step list of the implementation process. Focus on one modular chunk at a time.* + +## 5. Next Step + +*End with a single, clear question asking for approval on Step 1 to maintain momentum.* diff --git a/docs/agents/instructions/infrastructure.md b/docs/agents/instructions/infrastructure.md new file mode 100644 index 0000000..3c31a56 --- /dev/null +++ b/docs/agents/instructions/infrastructure.md @@ -0,0 +1,40 @@ +# Infrastructure Skill Instructions + +\ +Your objective is to ensure that all research environments, compute jobs, and pipelines are reproducible, resilient, and explicitly defined as code. +**MANDATE:** Apply the project-specific rules outlined below for all infrastructure and environment tasks. +\ + + +This project uses modern Python packaging and infrastructure-as-code principles. + + + +You MUST enforce the following project-specific infrastructure standards: + +### 1. Environment Management + +- **Explicit Definition:** Environments are defined centrally in `pyproject.toml` and locked using `uv` (`uv.lock`). ALWAYS use `uv` for dependency management tasks rather than raw `pip` or `conda` where possible. +- **Isolation:** ALWAYS run tasks within the appropriate `nox` session or `uv` virtual environment. + +### 2. Infrastructure as Code (IaC) + +- **Provisioning:** Prefer `Terraform` or `Pulumi` for defining cloud resources and infrastructure over manual provisioning or ad-hoc bash scripts. +- **Idempotency:** Setup, deployment, and data pipelines MUST be safe to run multiple times without causing errors or duplicate data. + +### 3. Containerization (Docker) + +- **Multi-Stage Builds:** When writing Dockerfiles, use multi-stage builds to keep final image sizes small and secure. 
+- **Least Privilege:** Containers MUST NOT run as the root user. Pinned, non-root base images are mandatory. + +### 4. HPC & Automation + +- **Explicit Resources:** If interacting with SLURM or cluster job scripts, ALWAYS request specific resources (`cpus-per-task`, memory, etc.). + + +\ + +- ❌ **"ClickOps":** You MUST NOT recommend setting up environments or servers manually via a GUI. +- ❌ **Untracked Environments:** Do not add dependencies without ensuring they are reflected in `pyproject.toml` and `uv.lock`. +- ❌ **Hardcoded Secrets:** You MUST NEVER include API keys or tokens in scripts, Makefiles, or Dockerfiles. + \ diff --git a/docs/agents/instructions/ml.md b/docs/agents/instructions/ml.md new file mode 100644 index 0000000..41b7b0f --- /dev/null +++ b/docs/agents/instructions/ml.md @@ -0,0 +1,37 @@ +# Machine Learning & Geospatial Processing Instructions + +\ +Your goal is to help build state-of-the-art models and data pipelines that are reproducible, reliable, and well-documented. +**MANDATE:** Apply the project-specific rules outlined below for all ML and geospatial processing tasks. +\ + + +This project deals heavily with geospatial datasets (Sentinel-2, Radar, etc.) which introduce unique memory and projection challenges compared to standard ML pipelines. + + + +You MUST enforce the following project-specific standards: + +### 1. Geospatial Data Handling + +- **Explicit CRS:** Always explicitly handle Coordinate Reference Systems (`rasterio.crs.CRS.from_epsg()`). Do not assume unprojected data is WGS84 without verification. +- **Memory Management:** For large datasets (e.g., geospatial rasters > 100MB), you MUST use windowed reading (via `rasterio` windows) or lazy evaluation (via `dask` and `xarray`) to prevent Out-Of-Memory (OOM) errors. +- **Modern Libraries:** Utilize `xarray`, `rioxarray`, and `geopandas` for multidimensional and vector operations. 
+- **Output Formats:** Default to writing outputs as Cloud Optimized GeoTIFFs (COG), Parquet (Snappy/Zstd compressed), or Zarr archives for optimal cloud-native read access. + +### 2. Model Training & Evaluation + +- **Deterministic Execution:** ALWAYS set random seeds globally to ensure reproducibility across experimental runs. + +- **Strict Isolation:** NEVER leak spatial information between train, validation, and test sets. Ensure spatial cross-validation is used (e.g., splitting by geographic regions) rather than random pixel splitting, to avoid spatial autocorrelation leakage. + +- **Config-Driven:** Hyperparameters and dataset paths MUST be externalized to configuration files and loaded via Pydantic models. + + + +\ + +- ❌ **Silent OOMs:** You MUST NOT write data loaders that attempt to load massive raster datasets entirely into RAM. +- ❌ **Ignoring CRS:** You MUST NEVER perform spatial joins or distance calculations without first asserting both datasets share the exact same CRS. +- ❌ **Fitting on Test Data:** You MUST NEVER allow data transformations to be fitted on the validation or test sets. + \ diff --git a/docs/agents/instructions/orchestrator.md b/docs/agents/instructions/orchestrator.md new file mode 100644 index 0000000..fb67d12 --- /dev/null +++ b/docs/agents/instructions/orchestrator.md @@ -0,0 +1,35 @@ +# Orchestrator Skill Instructions + +\ +Your primary responsibility is the horizontal integration of all research components. +**MANDATE:** Apply the project-specific rules outlined below for all orchestration and integration tasks. +\ + + +In this repository, successful orchestration means tying together raw geospatial data fetching (STAC/Copernicus), pre-processing (Rasterio/Xarray), and output generation (COGs/Zarr). + + + +For any task requiring more than a minor fix, you MUST enforce the following framework: + +### 1. 
The Written Plan (Mandatory) + +Before writing implementation code, you MUST create or update a `_PLAN.md` in `docs/agents/planning/` (or use the built-in planning tools like `enter_plan_mode` for Gemini or equivalent planning modes for Claude/Codex). + +### 2. Contract Management + +- **Define Boundaries:** Explicitly define the inputs and outputs between pipeline stages BEFORE either stage is implemented. +- **Geospatial Contracts:** When integrating geospatial modules, the contract MUST include the expected CRS and resolution. + +### 3. Granular Execution + +- Implement exactly ONE step from the plan at a time. +- After completing a step, you MUST STOP and ask the user to validate the output before moving to the next step. + + +\ + +- ❌ **Vertical Myopia:** You MUST NOT focus entirely on optimizing one specific file while ignoring how it breaks integration with the rest of the project (e.g., changing a config structure without updating `geospatial_tools_ini.yaml.example`). +- ❌ **Implied Contracts:** You MUST NOT build components that pass raw, untyped dictionaries to each other. Always enforce explicit data contracts (e.g., Pydantic Models, Dataclasses). +- ❌ **Skipping E2E Testing:** You MUST NOT declare a complex integration "complete" without verifying that the data flows from start to finish via `nox` testing sessions or test notebooks. + \ diff --git a/docs/agents/instructions/plan_to_tasks.md b/docs/agents/instructions/plan_to_tasks.md new file mode 100644 index 0000000..78b73c2 --- /dev/null +++ b/docs/agents/instructions/plan_to_tasks.md @@ -0,0 +1,36 @@ +# Plan To Tasks + +\ +**MANDATE:** Decompose a plan into modular, atomic tasks, each documented in its own file (or a structured document) with all the context needed for implementation and verification. +\ + +## Core Workflow + +1. **Analyze the Plan**: Review the existing implementation plan (e.g., `PLAN.md` or `SPEC.md`) or a high-level request. +2. 
**Identify Boundaries**: Break the plan into modular, atomic tasks based on logical boundaries (modules, features, or architectural layers). +3. **Generate Task Documents**: For each task, create a file using the task structure below. +4. **Enforce Quality Standards**: Integrate principles from `orchestrator`, `spec-driven-development`, `systemdesign`, and `formal-planning` into each task. + +## Task Structure Requirements + +Every task document MUST include: + +- **Goal**: A clear, outcome-oriented objective. +- **Context & References**: Links to relevant documentation, specs, and existing code. +- **Subtasks**: Granular, atomic steps for implementation. +- **Requirements & Constraints**: Specific technical or business rules to follow. +- **Acceptance Criteria (AC)**: Measurable pass/fail states that define "done". +- **Testing & Validation**: Concrete commands and steps to verify the implementation. +- **Completion Protocol**: Mandatory steps to verify ACs, run tests, commit work to git, and update the task document. + +## Implementation Principles + +- **Vertical Slices (TDD/Tracer Bullets)**: Prefer tasks that deliver a full slice of functionality (One test → One implementation) over horizontal technical layers. +- **Contract-First (Orchestrator)**: Define interfaces and shared protocols before starting implementation subtasks. +- **Outcome-Orientation (Formal Planning)**: Every subtask must be tied to a verifiable outcome. +- **Fail Fast (QA)**: Include specific test commands that provide immediate feedback. +- **Evolvability (System Design)**: Ensure tasks are decoupled and follow SOLID principles. + +## Next Steps + +- Use the Task Structure Requirements to bootstrap each new task. 
diff --git a/docs/agents/instructions/python.md b/docs/agents/instructions/python.md new file mode 100644 index 0000000..48206d2 --- /dev/null +++ b/docs/agents/instructions/python.md @@ -0,0 +1,45 @@ +# Python & QA Skill Instructions + +\ +Your objective is to elevate research scripts into robust, maintainable, and type-safe software. +**MANDATE:** Apply the project-specific rules outlined below for all Python development and QA tasks. +\ + + +This project relies heavily on modern Python tooling and strictly enforced quality assurance. +- **Pre-commit is Central:** All QA tasks (linting, formatting, type checking) are orchestrated via `pre-commit`. +- **Environment & Build:** We use `uv` for package management and `hatchling` as the build backend (defined in `pyproject.toml`). +- **Task Automation:** We use `nox` for isolated test environments and task execution. +- **Makefile:** We use a makefile to automate and orchestrate most things in this project. Use `make targets` to discover the available targets. + + + +You MUST strictly adhere to the following project-specific Python standards: + +### 1. QA & Tooling + +- **QA Workflow:** ALWAYS run `make precommit` after making changes. Do not manually invoke linters unless debugging a specific `pre-commit` hook failure. +- **Tests:** Use `make test` to run tests. +- **Type Checking:** We use `mypy`. All new functions MUST have strict type hints. + +### 2. Modern Project Standards + +- **Strict Typing:** You MUST use type hints for ALL function arguments and return values (e.g., `def process(data: str | Any) -> pd.DataFrame`). +- **Filesystem Paths:** You MUST NEVER use `os.path`. ALWAYS use `pathlib.Path` for all file and directory manipulations. +- **Logging:** Use `structlog` for application flow. NEVER use `print()` for production code. +- **Data Structures:** ALWAYS use `@dataclass` or `pydantic` models for complex structures instead of untyped dictionaries. 
+- **Type Hints Format:** Always prefer X | Y format over Union[X, Y]. +- **Docstrings:** Always add docstrings to your functions and classes. Use the Google standard for docstrings. Don't show types in docstrings. + +### 3. Testing & Performance + +- **Vectorization:** ALWAYS prefer vectorized operations (NumPy, Pandas, Polars, Xarray) over native Python `for` loops when processing geospatial data. + + +\ + +- ❌ **Bypassing Pre-commit:** Do not commit code that fails `pre-commit` checks. Fix the underlying linting or typing issue. +- ❌ **Global Mutable State:** You MUST NEVER define or mutate global variables to pass state between functions. +- ❌ **Magic Numbers/Strings:** You MUST NOT hardcode numeric constants. Extract them to Pydantic settings or config classes. +- ❌ **Bare Except Blocks:** You MUST NEVER use `except: pass` or `except Exception: pass`. + \ diff --git a/docs/agents/instructions/root_cause_analysis.md b/docs/agents/instructions/root_cause_analysis.md new file mode 100644 index 0000000..008c3bd --- /dev/null +++ b/docs/agents/instructions/root_cause_analysis.md @@ -0,0 +1,39 @@ +# Root Cause Analysis (RCA) Skill Instructions + +\ +Your objective is to systematically diagnose and permanently fix software failures. +**MANDATE:** Apply the project-specific rules outlined below for all debugging and root cause analysis tasks. +\ + + +Geospatial errors are often opaque (e.g., `rasterio.errors.RasterioIOError`, mismatched CRSs, out-of-bounds bounding boxes). Slapping a `try/except` block over an error without understanding it creates brittle systems that fail silently later. + + + +When presented with a traceback or unexpected result, you MUST follow this workflow: + +### Step 1: Evidence Gathering + +- Request or extract the exact error message and traceback. Identify the exact file and line number. +- For geospatial errors, gather state: What is the shape of the array? What is the CRS? What are the bounds of the bounding box? 
Are there `NaN` or nodata values present? + +### Step 2: Failure Isolation (Reproduction) + +- Help the user create a Minimal, Reproducible Example (MRE), potentially using synthetic data or a single tiny raster tile. + +### Step 3: Hypothesize & Explain (The "Why") + +- **STOP.** Before writing a fix, explicitly state your hypothesis to the user. Explain the root cause. + +### Step 4: Surgical Remediation & Verification + +- Propose the smallest, most targeted code change required. Prove the fix works via `pytest`. +- Document the finding in `docs/agents/instructions/KNOWLEDGE.md` if it represents a systemic quirk (e.g., a specific STAC catalog behavior). + + +\ + +- ❌ **Guesswork:** You MUST NOT propose random fixes (e.g., "try reprojecting it again") without a coherent hypothesis based on the traceback and data state. +- ❌ **Patching Symptoms:** You MUST NEVER suppress an error (e.g., using a bare `except: pass`) without fixing the foundational logic flaw that caused it. +- ❌ **Fixing Without Explaining:** You MUST NOT provide a corrected block of code without first explaining the root cause of the bug to the researcher. + \ diff --git a/docs/agents/instructions/security.md b/docs/agents/instructions/security.md new file mode 100644 index 0000000..ccfea8f --- /dev/null +++ b/docs/agents/instructions/security.md @@ -0,0 +1,30 @@ +# Security Skill Instructions + +\ +Your primary objective is to identify vulnerabilities, enforce defense-in-depth, and ensure absolute data privacy. +**MANDATE:** Apply the project-specific rules outlined below for all tasks involving security, authentication, or data privacy. +\ + + +Geospatial research often involves massive downloads from third-party catalogs (STAC, Copernicus) requiring authentication tokens. Exposing these tokens compromises the lab's infrastructure limits. + + + +You MUST actively enforce the following project-specific security standards: + +### 1. 
Secret Management + +- **Token Protection:** Copernicus, Planetary Computer, and AWS tokens MUST NEVER be hardcoded. They must be loaded via `.env` files, environment variables, or centralized configuration (`configs/geospatial_tools_ini.yaml`). +- **File Exclusions:** Ensure `configs/geospatial_tools_ini.yaml` and `.env` remain in `.gitignore`. Only commit `.example` versions. + +### 2. Data Safety + +- **Path Traversal:** When dynamically generating file paths based on STAC item IDs or user input, use `pathlib.Path.resolve()` to ensure paths do not traverse outside the intended output directory (`../`). +- **Deserialization:** Do not use `pickle` or `numpy.load(allow_pickle=True)` for data acquired from external STAC catalogs. + + +\ + +- ❌ **Committing Secrets:** You MUST NEVER allow code containing hardcoded credentials or API tokens to be committed. +- ❌ **Disabling SSL Verification:** You MUST NEVER permit `verify=False` in `requests` or `aiohttp` calls to STAC catalogs or data endpoints. + \ diff --git a/docs/agents/instructions/specdrivendev.md b/docs/agents/instructions/specdrivendev.md new file mode 100644 index 0000000..797dc02 --- /dev/null +++ b/docs/agents/instructions/specdrivendev.md @@ -0,0 +1,36 @@ +# Skill: Lightweight Spec-Driven Development (SDD) + +\ +You are an **Educational Architect** teaching a researcher how to use a lightweight version of Spec-Driven Development (SDD). +**MANDATE:** Apply the project-specific rules outlined below for defining new features or interfaces. +\ + + +In geospatial research, jumping straight into implementation often leads to messy code, unclear boundaries (e.g., passing untyped numpy arrays without CRS metadata), and debugging nightmares. +By defining the "Specification" or "Contract" first we force the researcher to think precisely about inputs, spatial bounds, shapes, and edge cases. 
+ + + +When starting a new feature, you MUST guide the researcher through these steps: + +### Step 1: Define the Nouns (Dataclasses) + +- Avoid raw dictionaries. Use `dataclasses` (or Pydantic models). +- **Geospatial Context:** Ensure structures that hold arrays (like a `SatelliteTile`) also contain metadata (CRS, Affine transform). + +### Step 2: Write the Contract (Signature & Docstring) + +- Write the function definition with strict type hints (`numpy.typing`, `xarray.DataArray`). +- The docstring (Google Style) MUST explicitly state exact inputs, outputs, side-effects, and expected projection/shapes. + +### Step 3: Stub It Out & Validate + +- Use `raise NotImplementedError()` for the function body. +- **STOP.** Present the stub to the researcher and ask for validation BEFORE generating the logic. + + +\ + +- ❌ **The `Any` Escape Hatch:** You MUST NOT use `Any` in type hints unless absolutely unavoidable. Use `xarray.Dataset` or `geopandas.GeoDataFrame` specifically. +- ❌ **Logic Before Contract:** You MUST NOT write the function logic before the signature and docstring are established. + \ diff --git a/docs/agents/instructions/systemdesign.md b/docs/agents/instructions/systemdesign.md new file mode 100644 index 0000000..2dca247 --- /dev/null +++ b/docs/agents/instructions/systemdesign.md @@ -0,0 +1,33 @@ +# System Design & Architecture Skill Instructions + +\ +Your objective is to design systems that are maintainable, evolvable, and robust. +**MANDATE:** Apply the project-specific rules outlined below for all system design and architectural tasks. +\ + + +Geospatial research codebases quickly become tangled if data fetching (STAC), processing (Rasterio), and analysis (Xarray) are all handled in the same script. + + + +You MUST enforce the following project-specific architectural patterns: + +### 1. 
Configuration-First Design + +- ALL hyperparameters, file paths, auth mechanisms, and STAC catalog endpoints MUST be centralized in a configuration object (e.g., `configs/geospatial_tools_ini.yaml` parsed via Pydantic). Never bury them in processing scripts. + +### 2. Separation of Concerns + +- **Data Acquisition:** Modules fetching from Planetary Computer or Copernicus must be isolated from the logic that processes the bytes. +- **Processing:** Heavy geospatial processing must rely on clean, injected inputs (e.g., passing a local `pathlib.Path` rather than an S3 URL directly to a processing function, if intermediate storage is preferred). + +### 3. Error Handling & Idempotency + +- Design pipelines to resume gracefully. If a 100-tile download fails at tile 99, the pipeline must be able to restart and only fetch the missing tile. + + +\ + +- ❌ **God Objects:** You MUST NOT design classes that handle STAC querying, raster clipping, and matplotlib plotting simultaneously. +- ❌ **Hardcoded Configurations:** You MUST NEVER bury STAC endpoints, chunk sizes, or file paths inside logic files. + \ diff --git a/docs/agents/planning/README.md b/docs/agents/planning/README.md new file mode 100644 index 0000000..299a1f0 --- /dev/null +++ b/docs/agents/planning/README.md @@ -0,0 +1,5 @@ +# Agent Produced Documentation + +This folder is for the planning documentation produced by agents. This is where you can include `PLAN.md`, agent task lists, analysis of the code base, etc. + +You are encouraged to use separate folders for the different pull requests. diff --git a/docs/agents/planning/copernicus_s3_access/refactor_copernicus_s3_access_PLAN.md b/docs/agents/planning/copernicus_s3_access/refactor_copernicus_s3_access_PLAN.md new file mode 100644 index 0000000..d13bfdb --- /dev/null +++ b/docs/agents/planning/copernicus_s3_access/refactor_copernicus_s3_access_PLAN.md @@ -0,0 +1,94 @@ +# Refactor Copernicus Data Access to S3 Protocol + +## 1. 
Scope & Context + +**Problem:** Currently, the `geospatial_tools.stac` module downloads Copernicus Sentinel-2 data using HTTP(S) links provided in the STAC item assets (`href`). These links often redirect to S3 buckets, but direct access via `boto3` is more efficient, robust, and standard for cloud-native geospatial workflows. The user specifically requested switching to `boto3` for downloading data from the S3 buckets pointed to by these hrefs. + +**Context:** The project uses `pystac` and `pystac-client` for searching. The download logic is currently mixed within `stac.py` and relies on `requests` (via `geospatial_tools.utils.download_url`) with a token obtained from `geospatial_tools.auth`. We need to introduce `boto3` for S3 interaction while maintaining the existing STAC search capabilities. + +**Constraints:** + +- Must adhere to project structure and guidelines (use `pathlib`, etc.). +- Must ensure authentication for S3 access is handled correctly (likely using the same credentials or keys provided by the Copernicus Data Space Ecosystem). +- The refactor should be modular and not break existing Planetary Computer functionality. + +## 2. Architectural Approach (Trade-offs & Strategy) + +**Strategy:** +We will decouple the "download" logic from the `StacSearch` class. Instead of a monolithic `_download_assets` method that assumes HTTP, we will implement a strategy pattern or a simple dispatcher based on the catalog type or asset URL protocol (s3:// vs https://). + +Since Copernicus Data Space Ecosystem (CDSE) provides S3-compatible access, we need to configure a `boto3` client with the correct endpoint URL and credentials. + +**Key Components:** + +1. **S3 Client Factory:** A dedicated function/class to instantiate a `boto3` client (or `botocore` session) specifically configured for Copernicus CDSE endpoint. +2. **Download Strategy:** A `Downloader` protocol/abstraction. + - `HttpDownloader`: Existing logic using `requests`. 
+ - `S3Downloader`: New logic using `boto3`. +3. **URL Parsing:** Logic to extract bucket and key from the STAC asset `href`. Note: CDSE STAC hrefs might still be HTTPS URLs that need to be converted to S3 paths or simply treated as S3 keys if we know the bucket structure. *Assumption: The user mentioned the href points to an S3 bucket, so we will treat it as needing S3 access.* +4. **Credential Management:** The existing `get_copernicus_credentials` retrieves username/password. For S3 access, CDSE typically requires generating EC2 credentials or using Access/Secret keys. We will use the standard `boto3` auth mechanism. Since the project now uses `python-dotenv`, we will store these credentials in the `.env` file and update `.env.example`. + +**Example `.env` configuration:** + +```env +COPERNICUS_USERNAME="your_username" +COPERNICUS_PASSWORD="your_password" +# S3 Credentials for CDSE (Copernicus Data Space Ecosystem) +AWS_ACCESS_KEY_ID="your_access_key" +AWS_SECRET_ACCESS_KEY="your_secret_key" +COPERNICUS_S3_ENDPOINT="https://eodata.dataspace.copernicus.eu" +``` + +*Assumption: Standard AWS environment variables (or those loaded via dotenv) will be picked up by boto3, but we might need to explicitly pass the `endpoint_url`.* + +**Trade-offs:** + +- *Complexity vs. Performance:* Adding `boto3` adds a dependency and configuration complexity but offers better performance and reliability for large datasets compared to HTTP streams. +- *Abstraction:* strictly separating downloaders might seem like overkill if we only have two sources, but it aligns with the "Separation of Concerns" directive in `systemdesign.md`. + +## 3. Verification & Failure Modes (FMEA) + +**Risks:** + +- **Credential Mismatch:** S3 access might require different credentials than the REST API token. We need to ensure the user knows how to supply these. +- **Dependency Hell:** `boto3` is heavy. +- **URL format:** The STAC `href` might not be a direct `s3://` URI. 
We might need to transform `https://zipper.dataspace.copernicus.eu/...` to the correct S3 key. + +**Test Strategy:** + +- **Unit Tests:** Mock `boto3` client to verify that `download_file` is called with correct bucket and key. +- **Integration Test:** Run `test_copernicus.py` (marked `@pytest.mark.online`) to verify real download. This will require valid S3 credentials in the environment. + +## 4. Granular Implementation Steps + +1. **Dependency Management:** + + - Add `boto3` to `pyproject.toml` (if not present). + +2. **Credential & Auth Update (`geospatial_tools/auth.py`):** + + - Investigate/Implement retrieval of S3-specific credentials if they differ from standard login. + - **Update `.env.example` and the project's local `.env` with S3 credentials (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `COPERNICUS_S3_ENDPOINT`).** + - Ensure `python-dotenv` is properly initialized (likely in `auth.py` or `__init__.py`) to load these variables. + +3. **S3 Helper Module (`geospatial_tools/s3_utils.py` - New):** + + - Create a module to wrap `boto3`. + - Function `get_s3_client(endpoint_url: str, ...)` + - Function `parse_s3_url(url: str) -> tuple[str, str]` (Bucket, Key). + +4. **Refactor `stac.py`:** + + - Extract download logic from `_download_assets`. + - Create `download_asset(asset_url: str, destination: Path, method: str = "http")`. + - Implement the logic: If `method="s3"` (or inferred from URL/Catalog), use `s3_utils`. + - Update `_download_assets` in `StacSearch` to delegate to this new function. + - Specifically for Copernicus, ensure we use the S3 method. + +5. **Update `test_copernicus.py`:** + + - Add a test case for S3 download. + +## 5. Next Step + +Do you approve this plan to refactor `stac.py` and introduce `boto3` for Copernicus S3 downloads? 
diff --git a/docs/agents/planning/copernicus_s3_access/refactor_copernicus_s3_access_SPEC.md b/docs/agents/planning/copernicus_s3_access/refactor_copernicus_s3_access_SPEC.md new file mode 100644 index 0000000..b6d63c3 --- /dev/null +++ b/docs/agents/planning/copernicus_s3_access/refactor_copernicus_s3_access_SPEC.md @@ -0,0 +1,56 @@ +# SPEC: Refactor Copernicus Data Access to S3 Protocol + +## 1. Overview + +- **Goal**: Switch the Copernicus Sentinel-2 STAC asset download mechanism from HTTP(S) via the `requests` library to direct S3 access using `boto3`. +- **Problem Statement**: Currently, `geospatial_tools.stac` downloads Copernicus Sentinel-2 data using HTTP links from STAC assets (`href`). These redirect to S3 buckets. Bypassing the HTTP layer and using direct `boto3` S3 access is more robust, efficient, and aligns with standard cloud-native geospatial workflows (Clean Architecture / System Design). + +## 2. Requirements + +### Functional Requirements + +- [ ] Add `boto3` as a project dependency. +- [ ] Implement an S3 credential management strategy loading `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `COPERNICUS_S3_ENDPOINT` via `python-dotenv` from a `.env` file. +- [ ] Create a dedicated module (`geospatial_tools.s3_utils`) responsible for instantiating the `boto3` client with the correct endpoint URL and parsing STAC `href` values into bucket/key combinations. +- [ ] Refactor the `StacSearch._download_assets` method in `stac.py` to use a strategy pattern or dispatcher: handling "s3" methods via the new `s3_utils` module, and falling back to the existing "http" method via `requests` for other catalogs. +- [ ] Update local configuration templates (`.env.example`) with the required S3 environment variables. + +### Non-Functional Requirements + +- **Reliability & Idempotency (Python Skill)**: S3 downloads must handle network failures gracefully, using `boto3`'s built-in retries. 
+- **Evolvability & Decoupling (System Design Skill)**: The download logic must be strictly decoupled from the STAC search logic. +- **Backward Compatibility**: The existing HTTP download capabilities must remain intact to support other STAC catalogs (e.g., Planetary Computer) that do not use S3 directly. + +## 3. Technical Constraints & Assumptions + +- **Dependencies**: The system depends on `boto3` for S3 interaction and `python-dotenv` (already present) for environment variable management. +- **Assumptions**: The `href` from the CDSE STAC items points to or can be deterministically resolved to a valid S3 bucket and key accessible via the Copernicus S3 endpoint. Standard AWS environment variables will be recognized by the underlying `boto3` session. +- **Coding Standards**: Strict adherence to project standards: `pathlib.Path` for filesystem operations, type hints, and standard `logging` for logging instead of `print()`. + +## 4. Acceptance Criteria + +- [ ] `boto3` is successfully added to `pyproject.toml`. +- [ ] `.env.example` includes `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `COPERNICUS_S3_ENDPOINT` variables. +- [ ] `geospatial_tools.s3_utils.py` provides working functions to get a configured S3 client and parse S3 URIs. +- [ ] `geospatial_tools.stac` routes downloads for Copernicus data through the new `boto3` integration without breaking the HTTP path for other data sources. +- [ ] Unit tests are added/updated to verify the dispatcher logic and S3 URL parsing (using mocked S3 clients where necessary). +- [ ] Integration test (`test_copernicus.py`, marked `@pytest.mark.online`) successfully downloads a real STAC asset using S3. + +## 5. Dependencies + +- `boto3` (New dependency to be added to `pyproject.toml`) +- Existing `pytest` suite for verification. +- Valid CDSE S3 credentials for local/CI integration testing. + +## 6. Out of Scope + +- Modifying the underlying search logic in `pystac-client`. 
+- Refactoring the HTTP download method (`requests`) for performance, unless necessary to fit the new dispatcher pattern. +- Handling S3 multi-part uploads or operations other than basic GET/download. + +## 7. Verification Plan (Orchestrator Skill) + +The implementation will be verified through a combination of mocked unit tests and an end-to-end online test: + +1. **Unit Verification**: Run `pytest tests/` ensuring any new S3-specific unit tests (with mocked `boto3` clients) verify URL parsing and correct dispatcher behavior in `stac.py`. +2. **Integration Verification**: With a properly configured `.env` file containing valid CDSE S3 credentials, execute `pytest tests/test_copernicus.py -m online`. This end-to-end test serves as the ultimate proof that the entire chain—STAC search, asset extraction, S3 client configuration, and binary download—functions correctly against the live Copernicus Data Space Ecosystem. diff --git a/docs/agents/planning/copernicus_s3_access/tasks/TASK-1_dependency_config.md b/docs/agents/planning/copernicus_s3_access/tasks/TASK-1_dependency_config.md new file mode 100644 index 0000000..449aac6 --- /dev/null +++ b/docs/agents/planning/copernicus_s3_access/tasks/TASK-1_dependency_config.md @@ -0,0 +1,42 @@ +# TASK-1: Dependency and Configuration Updates + +## Goal + +Add `boto3` as a project dependency and update configuration templates to support S3 access for Copernicus Data Space Ecosystem (CDSE). + +## Context & References + +- **Source Plan**: `docs/agents/planning/refactor_copernicus_s3_access_PLAN.md` +- **Relevant Specs**: `docs/agents/planning/refactor_copernicus_s3_access_SPEC.md` +- **Existing Code**: `pyproject.toml`, `.env.example` + +## Subtasks + +1. [x] Add `boto3` to the `dependencies` section in `pyproject.toml`. +2. [x] Add `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `COPERNICUS_S3_ENDPOINT` to `.env.example`. +3. 
[x] Run `uv lock` (or the equivalent dependency resolution command) to update the lock file if required by the project setup. + +## Requirements & Constraints + +- Must not remove existing dependencies. +- `.env` template should clearly indicate these are for CDSE S3 access. + +## Acceptance Criteria (AC) + +- [x] `pyproject.toml` contains `boto3` in its dependencies. +- [x] `.env.example` contains the three new S3 environment variables. + +## Testing & Validation + +- **Command**: `uv pip compile pyproject.toml` or simply load a python shell and `import boto3` after installing. +- **Success State**: No dependency conflict errors. +- **Manual Verification**: Review the `.env.example` file to ensure the keys are present and clearly commented. + +## Completion Protocol + +1. [ ] All ACs are met. +2. [ ] Tests pass without regressions. +3. [ ] All new code passes the project's formatting, linting and type-checking tools with zero errors. +4. [ ] Documentation updated (if applicable). +5. [ ] Commit work: `git commit -m "build: task 1 - add boto3 dependency and update env template"` +6. [ ] Update this document: Mark as COMPLETE. diff --git a/docs/agents/planning/copernicus_s3_access/tasks/TASK-2_s3_utils.md b/docs/agents/planning/copernicus_s3_access/tasks/TASK-2_s3_utils.md new file mode 100644 index 0000000..f7165ec --- /dev/null +++ b/docs/agents/planning/copernicus_s3_access/tasks/TASK-2_s3_utils.md @@ -0,0 +1,45 @@ +# TASK-2: Implement S3 Helper Module (s3_utils.py) + +## Goal + +Create a dedicated utility module to handle S3 client configuration and URL parsing for Copernicus data. + +## Context & References + +- **Source Plan**: `docs/agents/planning/refactor_copernicus_s3_access_PLAN.md` +- **Relevant Specs**: `docs/agents/planning/refactor_copernicus_s3_access_SPEC.md` +- **Existing Code**: `src/geospatial_tools/` + +## Subtasks + +1. [x] Create `src/geospatial_tools/s3_utils.py`. +2. [x] Implement `get_s3_client(endpoint_url: str) -> boto3.client`. 
This should optionally load credentials from the environment via `python-dotenv` if not automatically handled by boto3. +3. [x] Implement `parse_s3_url(url: str) -> tuple[str, str]` to extract the bucket and key from a CDSE STAC href (or standard `s3://` URI). +4. [x] Create `tests/test_s3_utils.py` and write unit tests for the URL parser and client factory. + +## Requirements & Constraints + +- Strictly use the project's `logging` via `utils.create_logger` for logging any errors or connection details. +- Provide comprehensive type hints (`boto3.client` type, `str`, `tuple`). +- Ensure graceful error handling if parsing fails. + +## Acceptance Criteria (AC) + +- [x] `s3_utils.py` exists and implements `get_s3_client` and `parse_s3_url`. +- [x] `parse_s3_url` correctly handles expected CDSE STAC href formats (e.g., extracting bucket/key from an endpoint URL or a raw S3 path). +- [x] Unit tests pass for both valid and invalid S3 URIs. + +## Testing & Validation + +- **Command**: `pytest tests/test_s3_utils.py` +- **Success State**: All unit tests pass. +- **Manual Verification**: Review `s3_utils.py` to ensure SOLID principles (specifically single responsibility) are followed. + +## Completion Protocol + +1. [ ] All ACs are met. +2. [ ] Tests pass without regressions. +3. [ ] All new code passes the project's formatting, linting and type-checking tools with zero errors. +4. [ ] Documentation updated (if applicable). +5. [ ] Commit work: `git commit -m "feat: task 2 - implement s3_utils for client config and url parsing"` +6. [ ] Update this document: Mark as COMPLETE. 
diff --git a/docs/agents/planning/copernicus_s3_access/tasks/TASK-3_refactor_stac.md b/docs/agents/planning/copernicus_s3_access/tasks/TASK-3_refactor_stac.md new file mode 100644 index 0000000..d99fda7 --- /dev/null +++ b/docs/agents/planning/copernicus_s3_access/tasks/TASK-3_refactor_stac.md @@ -0,0 +1,46 @@ +# TASK-3: Refactor STAC Download Logic (stac.py) + +## Goal + +Refactor the STAC download logic to dispatch to either the new S3 downloader or the existing HTTP downloader based on the asset protocol or catalog source. + +## Context & References + +- **Source Plan**: `docs/agents/planning/refactor_copernicus_s3_access_PLAN.md` +- **Relevant Specs**: `docs/agents/planning/refactor_copernicus_s3_access_SPEC.md` +- **Existing Code**: `src/geospatial_tools/stac.py` + +## Subtasks + +1. [x] Create a generic `download_asset(asset_url: str, destination: Path, method: str = "http")` function (or equivalent class method). +2. [x] Implement the `s3` dispatcher branch using `s3_utils.get_s3_client` and `s3_utils.parse_s3_url`, utilizing `boto3`'s `download_file` method. +3. [x] Refactor the existing `_download_assets` in `StacSearch` to delegate downloads to this new method. +4. [x] Implement logic to automatically detect when a Copernicus Sentinel-2 STAC asset should use the `s3` method instead of `http`. +5. [x] Add unit tests mocking `boto3` to ensure the correct download branch is hit without making actual network requests. + +## Requirements & Constraints + +- Must not break existing HTTP downloads (e.g., for Planetary Computer). +- Use `pathlib.Path` for all file destination handling. +- Ensure proper logging of the download progress or strategy chosen. + +## Acceptance Criteria (AC) + +- [x] `stac.py` successfully dispatches downloads to `boto3` for Copernicus assets. +- [x] `stac.py` falls back to `requests` for standard HTTP assets. +- [x] Unit tests with mocked S3 clients pass. 
+ +## Testing & Validation + +- **Command**: `pytest tests/test_stac.py` (or equivalent file where `stac.py` logic is tested). +- **Success State**: All unit tests pass, confirming the dispatcher routing logic works. +- **Manual Verification**: Review the refactored code to ensure it remains clean and does not become a God Object. + +## Completion Protocol + +1. [ ] All ACs are met. +2. [ ] Tests pass without regressions. +3. [ ] All new code passes the project's formatting, linting and type-checking tools with zero errors. +4. [ ] Documentation updated (if applicable). +5. [ ] Commit work: `git commit -m "refactor: task 3 - implement download dispatcher for s3 and http in stac.py"` +6. [ ] Update this document: Mark as COMPLETE. diff --git a/docs/agents/planning/copernicus_s3_access/tasks/TASK-4_integration_testing.md b/docs/agents/planning/copernicus_s3_access/tasks/TASK-4_integration_testing.md new file mode 100644 index 0000000..c8202cb --- /dev/null +++ b/docs/agents/planning/copernicus_s3_access/tasks/TASK-4_integration_testing.md @@ -0,0 +1,44 @@ +# TASK-4: Integration Testing and Validation + +## Goal + +Verify the end-to-end functionality of downloading Copernicus STAC assets via S3 using real credentials against the live API. + +## Context & References + +- **Source Plan**: `docs/agents/planning/refactor_copernicus_s3_access_PLAN.md` +- **Relevant Specs**: `docs/agents/planning/refactor_copernicus_s3_access_SPEC.md` +- **Existing Code**: `tests/test_copernicus.py` + +## Subtasks + +1. [x] Update or create an integration test in `tests/test_copernicus.py` that searches for a STAC item and triggers a download. +2. [x] Ensure this test is marked with `@pytest.mark.online`. +3. [x] Create a local `.env` file (or ensure environment variables are present) with valid CDSE credentials (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `COPERNICUS_S3_ENDPOINT`). +4. 
[x] Run the integration test and verify that the file is successfully downloaded and saved to the local file system. + +## Requirements & Constraints + +- The test must not run by default without the `--run-online` flag (or standard pytest custom marker configuration). +- Ensure the downloaded file is a valid file (e.g., check file size or existence after download). + +## Acceptance Criteria (AC) + +- [x] Integration test successfully downloads a file from Copernicus S3. +- [x] The test correctly loads credentials via `python-dotenv`. +- [x] Existing Planetary Computer integration tests still pass, proving no regressions. + +## Testing & Validation + +- **Command**: `pytest tests/test_copernicus.py -m online` +- **Success State**: Integration tests pass successfully. +- **Manual Verification**: Inspect the test download directory to confirm the asset was downloaded. + +## Completion Protocol + +1. [ ] All ACs are met. +2. [ ] Tests pass without regressions. +3. [ ] All new code passes the project's formatting, linting and type-checking tools with zero errors. +4. [ ] Documentation updated (if applicable). +5. [ ] Commit work: `git commit -m "test: task 4 - add end-to-end integration test for copernicus s3 downloads"` +6. [ ] Update this document: Mark as COMPLETE. diff --git a/docs/agents/planning/relevant_copernicus_links.md b/docs/agents/planning/relevant_copernicus_links.md new file mode 100644 index 0000000..87f28e8 --- /dev/null +++ b/docs/agents/planning/relevant_copernicus_links.md @@ -0,0 +1,27 @@ +# Copernicus Endpoints and Links + +## STAC API Endpoints & Architecture Changes + +These pages detail the deprecation of the legacy endpoint, the promotion of the v1 endpoint, and the supported STAC extensions (like fields and query). 
+ +[CDSE STAC API Official Documentation](https://documentation.dataspace.copernicus.eu/APIs/STAC.html) + +[Upcoming API Changes & Deprecation Notices (Crucial for 2025/2026)](https://documentation.dataspace.copernicus.eu/APIs/Others/UpcomingChanges.html) + +[CDSE STAC Browser (For visual inspection of the raw JSON metadata and collections)](https://browser.stac.dataspace.copernicus.eu/) + +## Sentinel-2 Band Resolutions & Naming Conventions + +These pages provide the exact ground sample distance (GSD) mappings, physical quantities, and how the Level-2A processor outputs the 10m, 20m, and 60m band variants. + +[Sentinel-2 L2A Band Mapping & Data Format (Sentinel Hub/CDSE)](https://documentation.dataspace.copernicus.eu/APIs/SentinelHub/Data/S2L2A.html) + +[Sentinel-2 Core Mission Documentation (Sensor specs and resolutions)](https://documentation.dataspace.copernicus.eu/Data/Sentinel2.html) + +## Collection Taxonomies & Harmonization + +If you are querying across different providers or building dynamic asset discovery tools, you need to understand how CDSE harmonizes its collection and band names. + +[CDSE Collection Naming Conventions & Harmonization Guidelines](https://documentation.dataspace.copernicus.eu/APIs/openEO/federation/backends/collections.html) + +[List of Main Data Collections in the Ecosystem](https://dataspace.copernicus.eu/data-collections/copernicus-sentinel-missions) diff --git a/docs/agents/planning/stac_current_structure.md b/docs/agents/planning/stac_current_structure.md new file mode 100644 index 0000000..f1dfeec --- /dev/null +++ b/docs/agents/planning/stac_current_structure.md @@ -0,0 +1,67 @@ +# STAC Search and Download Structure + +This document outlines the current structure, tools, and approaches used in `geospatial_tools` to search and download images from the Planetary Computer's STAC catalog, based on `src/geospatial_tools/stac.py` and `src/geospatial_tools/utils.py`. + +## 1. 
Core Components + +The functionality is primarily encapsulated in the `stac.py` module, which leverages `pystac` and `pystac_client` for STAC interactions. + +### 1.1. Catalog Management + +- **`create_planetary_computer_catalog`**: Creates a `pystac_client.Client` specifically for the Microsoft Planetary Computer API (`https://planetarycomputer.microsoft.com/api/stac/v1`). It includes retry logic and uses `planetary_computer.sign_inplace` to sign assets for access. +- **`catalog_generator`**: A factory function to retrieve a STAC client based on a catalog name. Currently, only "planetary_computer" is supported. +- **`list_available_catalogs`**: Returns the list of supported catalogs (currently just `PLANETARY_COMPUTER`). + +### 1.2. Data Structures + +- **`AssetSubItem`**: Represents a single downloaded file (e.g., a specific band of a satellite image). It holds the reference to the original STAC Item, the item ID, the band name, and the local filename/path. +- **`Asset`**: Represents a logical asset, which can contain multiple `AssetSubItem`s (bands). It manages: + - `asset_id`: The ID of the STAC Item. + - `bands`: List of bands associated with this asset. + - `list`: A list of `AssetSubItem` objects. + - `merged_asset_path`: Path to the merged raster file (if merged). + - `reprojected_asset_path`: Path to the reprojected raster file (if reprojected). + - **Methods**: + - `add_asset_item`: Adds a sub-item. + - `merge_asset`: Merges the downloaded bands into a single raster using `geospatial_tools.raster.merge_raster_bands`. + - `reproject_merged_asset`: Reprojects the merged asset using `geospatial_tools.raster.reproject_raster`. + - Cleanup methods: `delete_asset_sub_items`, `delete_merged_asset`, `delete_reprojected_asset`. + +### 1.3. Search Logic (`StacSearch` Class) + +The `StacSearch` class is the main entry point for searching and downloading data. + +- **Initialization**: Takes a `catalog_name` and initializes the corresponding `pystac_client.Client`. 
+- **Search Methods**: + - **`search`**: A wrapper around `client.search()`. It accepts standard STAC search parameters (date_range, bbox, collections, query, etc.) and handles retries. + - **`search_for_date_ranges`**: Iterates over a list of date ranges and performs a search for each, aggregating the results. This is useful for discontinuous time periods. +- **Filtering and Sorting**: + - **`sort_results_by_cloud_coverage`**: Sorts the search results based on the `eo:cloud_cover` property (ascending). + - **`filter_no_data`**: Filters results based on a maximum threshold for a specific property (often used for nodata values, though the implementation checks `item.properties[property_name] < max_no_data_value`). +- **Download Methods**: + - **`download_search_results`**: Downloads assets for *all* search results. + - **`download_sorted_by_cloud_cover_search_results`**: Sorts results by cloud cover (if not already sorted) and downloads the top `first_x_num_of_items` (or all if not specified). + - **`download_best_cloud_cover_result`**: Downloads only the single result with the lowest cloud cover. + - **`_download_assets`**: Internal method that iterates through requested bands, checks availability in the STAC Item, and downloads them using `geospatial_tools.utils.download_url`. + +## 2. Utilities (`utils.py`) + +Helper functions used by the STAC module: + +- **`download_url`**: Handles the actual HTTP GET request to download a file from a URL to a local path. It includes checks for existing files to avoid re-downloading. +- **`create_date_range_for_specific_period`**: Generates a list of ISO 8601 date range strings (e.g., "2020-03-01T00:00:00Z/2020-05-31T23:59:59Z") given start/end years and start/end months. This is used in conjunction with `search_for_date_ranges`. +- **`create_logger`**: Standard logging setup. + +## 3. Workflow Summary + +1. **Initialize**: Create a `StacSearch` object with "planetary_computer". +2. 
**Search**: Call `search()` or `search_for_date_ranges()` with criteria (collections, bbox, datetime, etc.). +3. **Process Results**: + - Optionally sort by cloud cover (`sort_results_by_cloud_coverage`). + - Optionally filter by other properties (`filter_no_data`). +4. **Download**: Call one of the download methods (`download_search_results`, `download_best_cloud_cover_result`, etc.) specifying the desired bands and a base directory. + - This triggers `_download_assets`, which uses `utils.download_url`. + - Returns `Asset` objects containing `AssetSubItem`s. +5. **Post-Processing (Optional)**: + - Use `Asset.merge_asset()` to combine bands into a single file. + - Use `Asset.reproject_merged_asset()` to reproject the merged file. diff --git a/environment.yml b/environment.yml deleted file mode 100644 index ed89a69..0000000 --- a/environment.yml +++ /dev/null @@ -1,6 +0,0 @@ -name: geospatial-tools -channels: - - conda-forge -dependencies: - - python=3.11 - diff --git a/notebooks/copernicus_sentinel2_exploration.ipynb b/notebooks/copernicus_sentinel2_exploration.ipynb new file mode 100644 index 0000000..79b3b1f --- /dev/null +++ b/notebooks/copernicus_sentinel2_exploration.ipynb @@ -0,0 +1,313 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# Copernicus Sentinel-2 Exploration\n", + "\n", + "This notebook showcases how to use the `geospatial_tools` library to search for and download Sentinel-2 products from the **Copernicus Data Space Ecosystem (CDSE)** STAC catalog. \n", + "\n", + "We will demonstrate:\n", + "1. Initializing the `StacSearch` client for Copernicus.\n", + "2. Performing a spatial and temporal search.\n", + "3. Inspecting STAC items and assets.\n", + "4. Downloading data using the **S3 Protocol** for optimized access.\n", + "\n", + "## Prerequisites\n", + "1. Create a Copernicus Account here : []()\n", + "2. 
Register an S3 access key here : [https://documentation.dataspace.copernicus.eu/APIs/S3.html](https://documentation.dataspace.copernicus.eu/APIs/S3.html)\n", + "3. Create your own .env file from the .env.example file with your copernicus credentials" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "imports", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-13T18:50:52.982123262Z", + "start_time": "2026-04-13T18:50:50.873548224Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data Directory: /home/francispelletier/projects/geospatial_tools/data\n" + ] + } + ], + "source": [ + "import os\n", + "from pathlib import Path\n", + "import leafmap\n", + "from geospatial_tools.stac import StacSearch, COPERNICUS\n", + "from geospatial_tools.copernicus.sentinel_2 import CopernicusS2Collection, CopernicusS2Band\n", + "from geospatial_tools import DATA_DIR\n", + "\n", + "# The .env file is automatically loaded via geospatial_tools.__init__\n", + "print(f\"Data Directory: {DATA_DIR}\")" + ] + }, + { + "cell_type": "markdown", + "id": "search_config", + "metadata": {}, + "source": [ + "## 1. Define Search Parameters\n", + "\n", + "We'll search for Sentinel-2 Level-2A data over Rome, Italy, for a specific period in 2024." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "search_params", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-13T18:50:53.047974505Z", + "start_time": "2026-04-13T18:50:52.984235300Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Searching in [12.4, 41.8, 12.5, 41.9] for period 2024-07-01/2024-07-31...\n" + ] + } + ], + "source": [ + "# Bounding box for Rome [min_lon, min_lat, max_lon, max_lat]\n", + "bbox = [12.4, 41.8, 12.5, 41.9]\n", + "date_range = \"2024-07-01/2024-07-31\"\n", + "collections = [CopernicusS2Collection.L2A]\n", + "\n", + "print(f\"Searching in {bbox} for period {date_range}...\")" + ] + }, + { + "cell_type": "markdown", + "id": "perform_search", + "metadata": {}, + "source": [ + "## 2. Initialize StacSearch and Query\n", + "\n", + "Initializing `StacSearch` with `catalog_name=COPERNICUS` automatically configures the S3 client using the credentials found in your `.env` file." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "search_exec", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-13T18:50:54.769506010Z", + "start_time": "2026-04-13T18:50:53.058684946Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2026-04-13 14:53:30] INFO [MainThread][geospatial_tools.s3_utils] Creating S3 client with endpoint: [https://eodata.dataspace.copernicus.eu]\n", + "[2026-04-13 14:53:30] INFO [MainThread][geospatial_tools.stac] Initiating STAC API search\n", + "Found 1 items.\n" + ] + } + ], + "source": [ + "stac_search = StacSearch(catalog_name=COPERNICUS)\n", + "\n", + "results = stac_search.search(\n", + " bbox=bbox, \n", + " date_range=date_range, \n", + " collections=collections,\n", + " max_items=1\n", + ")\n", + "\n", + "print(f\"Found {len(results)} items.\")" + ] + }, + { + "cell_type": "markdown", + "id": "inspect_results", + "metadata": {}, + "source": [ + "## 3. 
Inspect Results\n", + "\n", + "Let's look at the first item and its available assets." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "inspect_exec", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-13T18:50:54.875089804Z", + "start_time": "2026-04-13T18:50:54.817556767Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Item ID: S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034\n", + "Cloud Cover: 0.0% \n", + "First 10 assets: ['AOT_10m', 'AOT_20m', 'AOT_60m', 'B01_20m', 'B01_60m', 'B02_10m', 'B02_20m', 'B02_60m', 'B03_10m', 'B03_20m']\n" + ] + } + ], + "source": [ + "if results:\n", + " item = results[0]\n", + " print(f\"Item ID: {item.id}\")\n", + " print(f\"Cloud Cover: {item.properties.get('eo:cloud_cover')}% \")\n", + " \n", + " # List a few bands/assets\n", + " available_assets = list(item.assets.keys())\n", + " print(f\"First 10 assets: {available_assets[:10]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "visualize_search", + "metadata": {}, + "source": [ + "We can visualize the search area and the footprint of the first result using `leafmap`." + ] + }, + { + "cell_type": "markdown", + "id": "download_assets", + "metadata": {}, + "source": [ + "## 4. Download Assets using S3\n", + "\n", + "Now we download the True Color Image (TCI) and the NIR band (B08) using the optimized S3 protocol." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "download_exec", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-13T18:51:05.058278079Z", + "start_time": "2026-04-13T18:50:54.884830158Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading bands [TCI_10m, B08_10m] to /home/francispelletier/projects/geospatial_tools/data/sentinel-2/copernicus_exploration via S3...\n", + "[2026-04-13 14:53:31] INFO [MainThread][geospatial_tools.stac] Downloading [S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034] ...\n", + "[2026-04-13 14:53:31] INFO [MainThread][geospatial_tools.stac] Retrieving Copernicus credentials...\n", + "[2026-04-13 14:53:31] INFO [MainThread][geospatial_tools.stac] Successfully retrieved Copernicus credentials.\n", + "[2026-04-13 14:53:32] INFO [MainThread][geospatial_tools.stac] Successfully obtained Copernicus access token.\n", + "[2026-04-13 14:53:32] INFO [MainThread][geospatial_tools.stac] Downloading TCI_10m from s3://eodata/Sentinel-2/MSI/L2A/2024/07/28/S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034.SAFE/GRANULE/L2A_T33TTG_A038615_20240728T095731/IMG_DATA/R10m/T33TTG_20240728T095549_TCI_10m.jp2 using method [s3]\n", + "[2026-04-13 14:53:32] INFO [MainThread][geospatial_tools.stac] Downloading from S3: bucket=[eodata], key=[Sentinel-2/MSI/L2A/2024/07/28/S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034.SAFE/GRANULE/L2A_T33TTG_A038615_20240728T095731/IMG_DATA/R10m/T33TTG_20240728T095549_TCI_10m.jp2] to [/home/francispelletier/projects/geospatial_tools/data/sentinel-2/copernicus_exploration/S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034_TCI_10m.tif]\n", + "[2026-04-13 14:53:40] INFO [MainThread][geospatial_tools.stac] Downloading B08_10m from 
s3://eodata/Sentinel-2/MSI/L2A/2024/07/28/S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034.SAFE/GRANULE/L2A_T33TTG_A038615_20240728T095731/IMG_DATA/R10m/T33TTG_20240728T095549_B08_10m.jp2 using method [s3]\n", + "[2026-04-13 14:53:40] INFO [MainThread][geospatial_tools.stac] Downloading from S3: bucket=[eodata], key=[Sentinel-2/MSI/L2A/2024/07/28/S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034.SAFE/GRANULE/L2A_T33TTG_A038615_20240728T095731/IMG_DATA/R10m/T33TTG_20240728T095549_B08_10m.jp2] to [/home/francispelletier/projects/geospatial_tools/data/sentinel-2/copernicus_exploration/S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034_B08_10m.tif]\n", + "[2026-04-13 14:53:46] INFO [MainThread][geospatial_tools.stac] Asset list for asset [S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034] :\n", + "\t['ID: [S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034], Band: [TCI_10m], filename: [/home/francispelletier/projects/geospatial_tools/data/sentinel-2/copernicus_exploration/S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034_TCI_10m.tif]', 'ID: [S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034], Band: [B08_10m], filename: [/home/francispelletier/projects/geospatial_tools/data/sentinel-2/copernicus_exploration/S2B_MSIL2A_20240728T095549_N0511_R122_T33TTG_20240728T114034_B08_10m.tif]']\n" + ] + } + ], + "source": [ + "download_dir = DATA_DIR / \"sentinel-2\" / \"copernicus_exploration\"\n", + "download_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + "bands = [\n", + " CopernicusS2Band.TCI,\n", + " CopernicusS2Band.B08\n", + "]\n", + "\n", + "print(f\"Downloading bands {bands} to {download_dir} via S3...\")\n", + "\n", + "# download_search_results will handle the dispatcher logic automatically\n", + "downloaded_assets = stac_search.download_search_results(\n", + " bands=bands, \n", + " base_directory=download_dir\n", + ")\n", + "\n", + "for asset in downloaded_assets:\n", + " 
asset.show_asset_items()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "17afc08e6f4682c6", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-13T18:52:00.383906280Z", + "start_time": "2026-04-13T18:52:00.158839725Z" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "eb8f6e50bf5945648dd685403ba2014a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map(center=[41.9185435, 12.0389935], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title…" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tci_asset = downloaded_assets[0][CopernicusS2Band.TCI]\n", + "m = leafmap.Map()\n", + "m.add_raster(source=str(tci_asset.filename))\n", + "m" + ] + }, + { + "cell_type": "markdown", + "id": "conclusion", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "By using the `StacSearch` dispatcher with the `COPERNICUS` catalog, you can seamlessly switch from HTTP downloads to direct S3 bucket access, which is significantly faster and more reliable for large-scale data retrieval." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/planetary_computer_sentinel2_exploration.ipynb b/notebooks/planetary_computer_sentinel2_exploration.ipynb index da695bd..5f73455 100644 --- a/notebooks/planetary_computer_sentinel2_exploration.ipynb +++ b/notebooks/planetary_computer_sentinel2_exploration.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "code", + "execution_count": 1, "id": "initial_id", "metadata": { "ExecuteTime": { @@ -9,6 +10,7 @@ "start_time": "2026-01-29T21:49:35.493264346Z" } }, + "outputs": [], "source": [ "import leafmap\n", "import geopandas as gpd\n", @@ -18,9 +20,7 @@ "from geospatial_tools.stac import Asset\n", "from geospatial_tools.utils import get_yaml_config, download_url, unzip_file\n", "from geospatial_tools.vector import create_vector_grid_parallel, to_geopackage, select_polygons_by_location" - ], - "outputs": [], - "execution_count": 1 + ] }, { "cell_type": "markdown", @@ -51,6 +51,7 @@ }, { "cell_type": "code", + "execution_count": 2, "id": "687ac922509bf0e4", "metadata": { "ExecuteTime": { @@ -58,16 +59,6 @@ "start_time": "2026-01-29T21:49:37.848648030Z" } }, - "source": [ - "file_configs = get_yaml_config(\"data_file_links\")\n", - "raw_usa_polygon_path = file_configs[\"united_states_polygon\"][\"url\"]\n", - "raw_s2_tiling_grid_path = file_configs[\"sentinel_2_tiling_grid\"][\"url\"]\n", - "download_list = {\"raw_usa_polygon\": raw_usa_polygon_path, \"raw_s2_tiling_grid\": raw_s2_tiling_grid_path}\n", - "file_list = [download_url(url=url, filename=f\"{DATA_DIR}/{key}.zip\") for key, url in download_list.items()]\n", - 
"\n", - "file_list\n", - "\n" - ], "outputs": [ { "data": { @@ -81,10 +72,20 @@ "output_type": "execute_result" } ], - "execution_count": 2 + "source": [ + "file_configs = get_yaml_config(\"data_file_links\")\n", + "raw_usa_polygon_path = file_configs[\"united_states_polygon\"][\"url\"]\n", + "raw_s2_tiling_grid_path = file_configs[\"sentinel_2_tiling_grid\"][\"url\"]\n", + "download_list = {\"raw_usa_polygon\": raw_usa_polygon_path, \"raw_s2_tiling_grid\": raw_s2_tiling_grid_path}\n", + "file_list = [download_url(url=url, filename=f\"{DATA_DIR}/{key}.zip\") for key, url in download_list.items()]\n", + "\n", + "file_list\n", + "\n" + ] }, { "cell_type": "code", + "execution_count": 3, "id": "26a5535d1d05fbbe", "metadata": { "ExecuteTime": { @@ -92,9 +93,6 @@ "start_time": "2026-01-29T21:49:37.927928176Z" } }, - "source": [ - "[unzip_file(zip_path=f, extract_to=DATA_DIR) for f in file_list]" - ], "outputs": [ { "data": { @@ -114,7 +112,9 @@ "output_type": "execute_result" } ], - "execution_count": 3 + "source": [ + "[unzip_file(zip_path=f, extract_to=DATA_DIR) for f in file_list]" + ] }, { "cell_type": "markdown", @@ -139,6 +139,7 @@ }, { "cell_type": "code", + "execution_count": 4, "id": "ff487d19985d9368", "metadata": { "ExecuteTime": { @@ -146,17 +147,17 @@ "start_time": "2026-01-29T21:49:38.584766210Z" } }, + "outputs": [], "source": [ "USA_POLYGON_FILE = DATA_DIR / \"usa_polygon_5070.gpkg\"\n", "S2_USA_GRID_FILE = DATA_DIR / \"s2_grid_usa_polygon_5070.gpkg\"\n", "usa_polygon = gpd.read_file(USA_POLYGON_FILE)\n", "s2_grid = gpd.read_file(S2_USA_GRID_FILE)" - ], - "outputs": [], - "execution_count": 4 + ] }, { "cell_type": "code", + "execution_count": 5, "id": "2767e8432a15bb65", "metadata": { "ExecuteTime": { @@ -164,19 +165,9 @@ "start_time": "2026-01-29T21:49:38.642323833Z" } }, - "source": [ - "usa_polygon" - ], "outputs": [ { "data": { - "text/plain": [ - " AFFGEOID GEOID NAME \\\n", - "0 0100000US US United States \n", - "\n", - " geometry \n", - "0 
MULTIPOLYGON (((-2116048.733 3142966.552, -211... " - ], "text/html": [ "
\n", "