From 07e41606460c0945b5e06dd3a35de61c386434c8 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Mon, 13 Apr 2026 15:04:46 +0800 Subject: [PATCH 01/10] support VARIANT for pypaimon --- docs/content/pypaimon/python-api.md | 115 +- .../java/org/apache/paimon/JavaPyE2ETest.java | 48 + paimon-python/dev/run_mixed_tests.sh | 67 +- .../pypaimon/data/generic_variant.py | 1110 +++++++++++++++++ paimon-python/pypaimon/schema/data_types.py | 38 + .../pypaimon/table/row/generic_row.py | 32 +- .../tests/e2e/java_py_read_write_test.py | 131 ++ .../pypaimon/tests/generic_variant_test.py | 405 ++++++ paimon-python/pypaimon/tests/variant_test.py | 409 ++++++ .../pypaimon/write/writer/data_blob_writer.py | 9 + .../pypaimon/write/writer/data_writer.py | 10 +- 11 files changed, 2354 insertions(+), 20 deletions(-) create mode 100644 paimon-python/pypaimon/data/generic_variant.py create mode 100644 paimon-python/pypaimon/tests/generic_variant_test.py create mode 100644 paimon-python/pypaimon/tests/variant_test.py diff --git a/docs/content/pypaimon/python-api.md b/docs/content/pypaimon/python-api.md index 197a018ef1f7..109d34ea21cc 100644 --- a/docs/content/pypaimon/python-api.md +++ b/docs/content/pypaimon/python-api.md @@ -687,22 +687,105 @@ Row kind values: ## Data Types -| Python Native Type | PyArrow Type | Paimon Type | -|:--------------------|:-------------------------------------------------|:----------------------------------| -| `int` | `pyarrow.int8()` | `TINYINT` | -| `int` | `pyarrow.int16()` | `SMALLINT` | -| `int` | `pyarrow.int32()` | `INT` | -| `int` | `pyarrow.int64()` | `BIGINT` | -| `float` | `pyarrow.float32()` | `FLOAT` | -| `float` | `pyarrow.float64()` | `DOUBLE` | -| `bool` | `pyarrow.bool_()` | `BOOLEAN` | -| `str` | `pyarrow.string()` | `STRING`, `CHAR(n)`, `VARCHAR(n)` | -| `bytes` | `pyarrow.binary()` | `BYTES`, `VARBINARY(n)` | -| `bytes` | `pyarrow.binary(length)` | `BINARY(length)` | -| `decimal.Decimal` | 
`pyarrow.decimal128(precision, scale)` | `DECIMAL(precision, scale)` | -| `datetime.datetime` | `pyarrow.timestamp(unit, tz=None)` | `TIMESTAMP(p)` | -| `datetime.date` | `pyarrow.date32()` | `DATE` | -| `datetime.time` | `pyarrow.time32(unit)` or `pyarrow.time64(unit)` | `TIME(p)` | +### Scalar Types + +| Python Native Type | PyArrow Type | Paimon Type | +|:--------------------|:---------------------------------------|:----------------------------------| +| `int` | `pyarrow.int8()` | `TINYINT` | +| `int` | `pyarrow.int16()` | `SMALLINT` | +| `int` | `pyarrow.int32()` | `INT` | +| `int` | `pyarrow.int64()` | `BIGINT` | +| `float` | `pyarrow.float32()` | `FLOAT` | +| `float` | `pyarrow.float64()` | `DOUBLE` | +| `bool` | `pyarrow.bool_()` | `BOOLEAN` | +| `str` | `pyarrow.string()` | `STRING`, `CHAR(n)`, `VARCHAR(n)` | +| `bytes` | `pyarrow.binary()` | `BYTES`, `VARBINARY(n)` | +| `bytes` | `pyarrow.binary(length)` | `BINARY(length)` | +| `bytes` | `pyarrow.large_binary()` | `BLOB` | +| `decimal.Decimal` | `pyarrow.decimal128(precision, scale)` | `DECIMAL(precision, scale)` | +| `datetime.datetime` | `pyarrow.timestamp('us', tz=None)` | `TIMESTAMP(p)` (p=4..6) | +| `datetime.datetime` | `pyarrow.timestamp('ms', tz=None)` | `TIMESTAMP(p)` (p=1..3) | +| `datetime.datetime` | `pyarrow.timestamp('s', tz=None)` | `TIMESTAMP(p)` (p=0) | +| `datetime.datetime` | `pyarrow.timestamp('ns', tz=None)` | `TIMESTAMP(p)` (p=7..9) | +| `datetime.datetime` | `pyarrow.timestamp('us', tz='UTC')` | `TIMESTAMP_LTZ(p)` (p=4..6) | +| `datetime.date` | `pyarrow.date32()` | `DATE` | +| `datetime.time` | `pyarrow.time32('ms')` | `TIME(p)` | + +### Complex Types + +| Python Native Type | PyArrow Type | Paimon Type | +|:-------------------|:--------------------------------------|:-----------------------| +| `list` | `pyarrow.list_(element_type)` | `ARRAY` | +| `dict` | `pyarrow.map_(key_type, value_type)` | `MAP` | +| `dict` | `pyarrow.struct([field, ...])` | `ROW` | + +### VARIANT Type + 
+`VARIANT` stores semi-structured, schema-flexible data (JSON objects, arrays, and primitives) +in the [Parquet Variant binary encoding](https://github.com/apache/parquet-format/blob/master/VariantEncoding.md). + +pypaimon provides `GenericVariant` for encoding, decoding, and path extraction: + +```python +from pypaimon.data.generic_variant import GenericVariant +``` + +**Reading a VARIANT column:** + +```python +read_builder = table.new_read_builder() +result = read_builder.new_read().to_arrow(read_builder.new_scan().plan().splits()) + +for row in result.column("payload").to_pylist(): + if row is not None: + gv = GenericVariant.from_dict(row) # wrap raw bytes + print(gv.to_python()) # decode to Python object + print(gv.variant_get("$.city", "string")) # path extraction +``` + +**Writing a VARIANT column:** + +```python +import pyarrow as pa +from pypaimon.data.generic_variant import GenericVariant + +gv1 = GenericVariant.from_json('{"city": "Beijing", "age": 30}') +gv2 = GenericVariant.from_json('[1, 2, 3]') +gv3 = GenericVariant.from_json('null') + +data = pa.table({ + "id": pa.array([1, 2, 3], type=pa.int32()), + "payload": GenericVariant.to_arrow_array([gv1, gv2, gv3]), +}) + +write_builder = table.new_batch_write_builder() +table_write = write_builder.new_write() +table_commit = write_builder.new_commit() +table_write.write_arrow(data) +table_commit.commit(table_write.prepare_commit()) +table_write.close() +table_commit.close() +``` + +**`GenericVariant` API:** + +| Method | Description | +|:-------|:------------| +| `GenericVariant.from_json(json_str)` | Build from a JSON string | +| `GenericVariant.from_python(obj)` | Build from a Python object (`dict`, `list`, `int`, `str`, …) | +| `GenericVariant.from_dict({"value": b"...", "metadata": b"..."})` | Wrap raw bytes from an Arrow VARIANT struct row | +| `GenericVariant.to_arrow_array([gv1, gv2, None, ...])` | Convert a list of `GenericVariant` (or `None`) to a `pa.StructArray` for writing | +| `gv.to_python()` 
| Decode to native Python (`dict`, `list`, `int`, `str`, `None`, …) | +| `gv.to_json()` | Decode to a JSON string | +| `gv.variant_get(path, cast_type=None)` | Extract a value by JSONPath (e.g. `"$.address.city"`, `"$.tags[0]"`); optional `cast_type`: `"string"`, `"int"`, `"long"`, `"double"`, `"boolean"`, `"decimal"` | +| `gv.get_type()` | Return the `Type` enum of the root value | + +**Limitations:** + +- `VARIANT` is only supported with Parquet file format. Writing to ORC or Avro raises `NotImplementedError`. +- `VARIANT` cannot be used as a primary key or partition key. +- Shredded VARIANT files (written by Paimon Java with `typed_value` sub-fields) are readable + via the raw `from_dict` path, but the extra fields are not automatically interpreted. ## Predicate diff --git a/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java b/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java index c09bf3466384..66ad1d538d33 100644 --- a/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java +++ b/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java @@ -28,6 +28,7 @@ import org.apache.paimon.data.DataFormatTestUtil; import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.variant.GenericVariant; import org.apache.paimon.disk.IOManager; import org.apache.paimon.fs.FileIOFinder; import org.apache.paimon.fs.Path; @@ -941,6 +942,53 @@ protected GenericRow createRow3ColsWithKind(RowKind rowKind, Object... values) { return GenericRow.ofKind(rowKind, values[0], values[1], values[2]); } + /** Write a VARIANT column table for Python interoperability test. 
*/ + @Test + @EnabledIfSystemProperty(named = "run.e2e.tests", matches = "true") + public void testVariantWrite() throws Exception { + Identifier identifier = identifier("variant_test"); + catalog.dropTable(identifier, true); + Schema schema = + Schema.newBuilder() + .column("id", DataTypes.INT()) + .column("name", DataTypes.STRING()) + .column("payload", DataTypes.VARIANT()) + .option("bucket", "-1") + .build(); + catalog.createTable(identifier, schema, false); + + FileStoreTable table = (FileStoreTable) catalog.getTable(identifier); + BatchWriteBuilder writeBuilder = table.newBatchWriteBuilder(); + try (BatchTableWrite write = writeBuilder.newWrite(); + BatchTableCommit commit = writeBuilder.newCommit()) { + write.write( + GenericRow.of( + 1, + BinaryString.fromString("Alice"), + GenericVariant.fromJson("{\"age\":30,\"city\":\"Beijing\"}"))); + write.write( + GenericRow.of( + 2, + BinaryString.fromString("Bob"), + GenericVariant.fromJson("{\"age\":25,\"city\":\"Shanghai\"}"))); + write.write( + GenericRow.of( + 3, + BinaryString.fromString("Carol"), + GenericVariant.fromJson("[1,2,3]"))); + commit.commit(write.prepareCommit()); + } + + // Verify Java can read back what it wrote + FileStoreTable readTable = (FileStoreTable) catalog.getTable(identifier); + List splits = new ArrayList<>(readTable.newSnapshotReader().read().dataSplits()); + TableRead read = readTable.newRead(); + List res = + getResult(read, splits, row -> internalRowToString(row, readTable.rowType())); + assertThat(res).hasSize(3); + LOG.info("testVariantWrite: wrote and read back {} VARIANT rows", res.size()); + } + /** Step 1: Write 5 base files for compact conflict test. 
*/ @Test @EnabledIfSystemProperty(named = "run.e2e.tests", matches = "true") diff --git a/paimon-python/dev/run_mixed_tests.sh b/paimon-python/dev/run_mixed_tests.sh index 077b5af27664..f277ca79e8c7 100755 --- a/paimon-python/dev/run_mixed_tests.sh +++ b/paimon-python/dev/run_mixed_tests.sh @@ -339,6 +339,43 @@ run_blob_alter_compact_test() { fi } +run_variant_test() { + echo -e "${YELLOW}=== Running VARIANT Type Test (Java Write, Python Read) ===${NC}" + + cd "$PROJECT_ROOT" + + echo "Running Maven test for JavaPyE2ETest.testVariantWrite..." + if mvn test -Dtest=org.apache.paimon.JavaPyE2ETest#testVariantWrite -pl paimon-core -q -Drun.e2e.tests=true; then + echo -e "${GREEN}✓ Java VARIANT write test completed successfully${NC}" + else + echo -e "${RED}✗ Java VARIANT write test failed${NC}" + return 1 + fi + cd "$PAIMON_PYTHON_DIR" + echo "Running Python test for JavaPyReadWriteTest.test_read_variant_table..." + if python -m pytest java_py_read_write_test.py::JavaPyReadWriteTest::test_read_variant_table -v; then + echo -e "${GREEN}✓ Python VARIANT read test completed successfully${NC}" + return 0 + else + echo -e "${RED}✗ Python VARIANT read test failed${NC}" + return 1 + fi +} + +run_py_variant_write_test() { + echo -e "${YELLOW}=== Running Python VARIANT Write+Read Test ===${NC}" + + cd "$PAIMON_PYTHON_DIR" + echo "Running Python test for JavaPyReadWriteTest.test_py_write_read_variant_table..." 
+ if python -m pytest java_py_read_write_test.py::JavaPyReadWriteTest::test_py_write_read_variant_table -v; then + echo -e "${GREEN}✓ Python VARIANT write+read test completed successfully${NC}" + return 0 + else + echo -e "${RED}✗ Python VARIANT write+read test failed${NC}" + return 1 + fi +} + # Main execution main() { local java_write_result=0 @@ -352,6 +389,8 @@ main() { local lumina_vector_result=0 local compact_conflict_result=0 local blob_alter_compact_result=0 + local variant_result=0 + local py_variant_write_result=0 # Detect Python version PYTHON_VERSION=$(python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "unknown") @@ -448,6 +487,20 @@ main() { echo "" + # Run VARIANT type test (Java write, Python read) + if ! run_variant_test; then + variant_result=1 + fi + + echo "" + + # Run Python VARIANT write+read test (Python only, no Java needed) + if ! run_py_variant_write_test; then + py_variant_write_result=1 + fi + + echo "" + echo -e "${YELLOW}=== Test Results Summary ===${NC}" if [[ $java_write_result -eq 0 ]]; then @@ -516,12 +569,24 @@ main() { echo -e "${RED}✗ Blob Alter+Compact Test (Java Write+Alter+Compact, Python Read): FAILED${NC}" fi + if [[ $variant_result -eq 0 ]]; then + echo -e "${GREEN}✓ VARIANT Type Test (Java Write, Python Read): PASSED${NC}" + else + echo -e "${RED}✗ VARIANT Type Test (Java Write, Python Read): FAILED${NC}" + fi + + if [[ $py_variant_write_result -eq 0 ]]; then + echo -e "${GREEN}✓ Python VARIANT Write+Read Test: PASSED${NC}" + else + echo -e "${RED}✗ Python VARIANT Write+Read Test: FAILED${NC}" + fi + echo "" # Clean up warehouse directory after all tests cleanup_warehouse - if [[ $java_write_result -eq 0 && $python_read_result -eq 0 && $python_write_result -eq 0 && $java_read_result -eq 0 && $pk_dv_result -eq 0 && $btree_index_result -eq 0 && $compressed_text_result -eq 0 && $tantivy_fulltext_result -eq 0 && $lumina_vector_result -eq 0 && $compact_conflict_result -eq 0 
&& $blob_alter_compact_result -eq 0 ]]; then + if [[ $java_write_result -eq 0 && $python_read_result -eq 0 && $python_write_result -eq 0 && $java_read_result -eq 0 && $pk_dv_result -eq 0 && $btree_index_result -eq 0 && $compressed_text_result -eq 0 && $tantivy_fulltext_result -eq 0 && $lumina_vector_result -eq 0 && $compact_conflict_result -eq 0 && $blob_alter_compact_result -eq 0 && $variant_result -eq 0 && $py_variant_write_result -eq 0 ]]; then echo -e "${GREEN}🎉 All tests passed! Java-Python interoperability verified.${NC}" return 0 else diff --git a/paimon-python/pypaimon/data/generic_variant.py b/paimon-python/pypaimon/data/generic_variant.py new file mode 100644 index 000000000000..0f269842c605 --- /dev/null +++ b/paimon-python/pypaimon/data/generic_variant.py @@ -0,0 +1,1110 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +################################################################################ + +"""Python implementation of Paimon GenericVariant. 
+ +Mirrors the binary encoding defined by Paimon Java's GenericVariant / +GenericVariantUtil / GenericVariantBuilder, which itself is based on the +Parquet Variant spec (https://github.com/apache/parquet-format/blob/main/VariantEncoding.md). + +Primary entry points: + GenericVariant.from_json(json_str) – build from a JSON string + GenericVariant(value, metadata) – wrap raw bytes from a Parquet/Paimon VARIANT column + v.to_json() – decode back to a JSON string + v.variant_get('$.field', 'int') – JSONPath extraction with optional cast + v.to_python() – decode to native Python objects +""" + +import datetime +import decimal as _decimal +import enum +import json as _json +import re +import struct +import uuid as _uuid + +# --------------------------------------------------------------------------- +# Constants (matching GenericVariantUtil.java) +# --------------------------------------------------------------------------- + +_PRIMITIVE = 0 +_SHORT_STR = 1 +_OBJECT = 2 +_ARRAY = 3 + +_NULL = 0 +_TRUE = 1 +_FALSE = 2 +_INT1 = 3 +_INT2 = 4 +_INT4 = 5 +_INT8 = 6 +_DOUBLE = 7 +_DECIMAL4 = 8 +_DECIMAL8 = 9 +_DECIMAL16 = 10 +_DATE = 11 +_TIMESTAMP = 12 +_TIMESTAMP_NTZ = 13 +_FLOAT = 14 +_BINARY = 15 +_LONG_STR = 16 +_UUID = 20 + +_VERSION = 1 +_VERSION_MASK = 0x0F +_BINARY_SEARCH_THRESHOLD = 32 +_SIZE_LIMIT = 128 * 1024 * 1024 +_MAX_SHORT_STR_SIZE = 0x3F # 63 +_U8_MAX = 255 +_U16_MAX = 65535 +_U24_MAX = 16777215 +_U32_SIZE = 4 +_MAX_DECIMAL4_PRECISION = 9 +_MAX_DECIMAL8_PRECISION = 18 +_MAX_DECIMAL16_PRECISION = 38 + +# Epoch for date/timestamp conversions +_EPOCH_DATE = datetime.date(1970, 1, 1) +_EPOCH_DT_UTC = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc) +_EPOCH_DT_NTZ = datetime.datetime(1970, 1, 1) + + +class Type(enum.Enum): + """High-level variant value types (many-to-one from wire types).""" + OBJECT = 'OBJECT' + ARRAY = 'ARRAY' + NULL = 'NULL' + BOOLEAN = 'BOOLEAN' + LONG = 'LONG' + STRING = 'STRING' + DOUBLE = 'DOUBLE' + DECIMAL = 'DECIMAL' + DATE = 
'DATE' + TIMESTAMP = 'TIMESTAMP' + TIMESTAMP_NTZ = 'TIMESTAMP_NTZ' + FLOAT = 'FLOAT' + BINARY = 'BINARY' + UUID = 'UUID' + + +# --------------------------------------------------------------------------- +# Low-level binary utilities +# --------------------------------------------------------------------------- + +def _read_unsigned(data, pos, n): + """Read a little-endian unsigned integer of n bytes.""" + return int.from_bytes(data[pos:pos + n], 'little', signed=False) + + +def _read_signed(data, pos, n): + """Read a little-endian signed integer of n bytes.""" + return int.from_bytes(data[pos:pos + n], 'little', signed=True) + + +def _write_le(buf, pos, value, n): + """Write value as n-byte little-endian into bytearray buf at pos.""" + for i in range(n): + buf[pos + i] = (value >> (8 * i)) & 0xFF + + +def _get_int_size(value): + """Return the minimum number of bytes (1-4) needed for an unsigned int.""" + if value <= _U8_MAX: + return 1 + if value <= _U16_MAX: + return 2 + if value <= _U24_MAX: + return 3 + return 4 + + +def _primitive_header(type_id): + return (type_id << 2) | _PRIMITIVE + + +def _short_str_header(size): + return (size << 2) | _SHORT_STR + + +def _object_header(large_size, id_size, offset_size): + return ( + ((1 if large_size else 0) << 6) + | ((id_size - 1) << 4) + | ((offset_size - 1) << 2) + | _OBJECT + ) + + +def _array_header(large_size, offset_size): + return ( + ((1 if large_size else 0) << 4) + | ((offset_size - 1) << 2) + | _ARRAY + ) + + +def _get_type(value, pos): + b = value[pos] + basic_type = b & 0x3 + type_info = (b >> 2) & 0x3F + if basic_type == _SHORT_STR: + return Type.STRING + if basic_type == _OBJECT: + return Type.OBJECT + if basic_type == _ARRAY: + return Type.ARRAY + # PRIMITIVE + _MAP = { + _NULL: Type.NULL, + _TRUE: Type.BOOLEAN, _FALSE: Type.BOOLEAN, + _INT1: Type.LONG, _INT2: Type.LONG, _INT4: Type.LONG, _INT8: Type.LONG, + _DOUBLE: Type.DOUBLE, + _DECIMAL4: Type.DECIMAL, _DECIMAL8: Type.DECIMAL, _DECIMAL16: 
Type.DECIMAL, + _DATE: Type.DATE, + _TIMESTAMP: Type.TIMESTAMP, + _TIMESTAMP_NTZ: Type.TIMESTAMP_NTZ, + _FLOAT: Type.FLOAT, + _BINARY: Type.BINARY, + _LONG_STR: Type.STRING, + _UUID: Type.UUID, + } + t = _MAP.get(type_info) + if t is None: + raise ValueError(f'Unknown primitive variant type id: {type_info}') + return t + + +def _value_size(value, pos): + """Return the byte size of the variant value starting at pos.""" + b = value[pos] + basic_type = b & 0x3 + type_info = (b >> 2) & 0x3F + if basic_type == _SHORT_STR: + return 1 + type_info + if basic_type == _OBJECT: + return _handle_object( + value, pos, + lambda size, id_size, offset_size, id_start, offset_start, data_start: ( + data_start - pos + _read_unsigned( + value, offset_start + size * offset_size, offset_size) + ) + ) + if basic_type == _ARRAY: + return _handle_array( + value, pos, + lambda size, offset_size, offset_start, data_start: ( + data_start - pos + _read_unsigned( + value, offset_start + size * offset_size, offset_size) + ) + ) + # PRIMITIVE + _FIXED = { + _NULL: 1, _TRUE: 1, _FALSE: 1, + _INT1: 2, _INT2: 3, _INT4: 5, _INT8: 9, + _DOUBLE: 9, _FLOAT: 5, _DATE: 5, + _TIMESTAMP: 9, _TIMESTAMP_NTZ: 9, + _DECIMAL4: 6, _DECIMAL8: 10, _DECIMAL16: 18, + _UUID: 17, + } + if type_info in _FIXED: + return _FIXED[type_info] + if type_info in (_BINARY, _LONG_STR): + return 1 + _U32_SIZE + _read_unsigned(value, pos + 1, _U32_SIZE) + raise ValueError(f'Unknown primitive type id: {type_info}') + + +def _handle_object(value, pos, handler): + b = value[pos] + type_info = (b >> 2) & 0x3F + large_size = bool((type_info >> 4) & 0x1) + size_bytes = _U32_SIZE if large_size else 1 + size = _read_unsigned(value, pos + 1, size_bytes) + id_size = ((type_info >> 2) & 0x3) + 1 + offset_size = (type_info & 0x3) + 1 + id_start = pos + 1 + size_bytes + offset_start = id_start + size * id_size + data_start = offset_start + (size + 1) * offset_size + return handler(size, id_size, offset_size, id_start, offset_start, data_start) 
+ + +def _handle_array(value, pos, handler): + b = value[pos] + type_info = (b >> 2) & 0x3F + large_size = bool((type_info >> 2) & 0x1) + size_bytes = _U32_SIZE if large_size else 1 + size = _read_unsigned(value, pos + 1, size_bytes) + offset_size = (type_info & 0x3) + 1 + offset_start = pos + 1 + size_bytes + data_start = offset_start + (size + 1) * offset_size + return handler(size, offset_size, offset_start, data_start) + + +def _get_metadata_key(metadata, key_id): + offset_size = ((metadata[0] >> 6) & 0x3) + 1 + dict_size = _read_unsigned(metadata, 1, offset_size) + if key_id >= dict_size: + raise ValueError('MALFORMED_VARIANT: key id out of range') + string_start = 1 + (dict_size + 2) * offset_size + offset = _read_unsigned(metadata, 1 + (key_id + 1) * offset_size, offset_size) + next_offset = _read_unsigned(metadata, 1 + (key_id + 2) * offset_size, offset_size) + return metadata[string_start + offset:string_start + next_offset].decode('utf-8') + + +# --------------------------------------------------------------------------- +# Path parsing (VariantPathSegment equivalent) +# --------------------------------------------------------------------------- + +_PATH_INDEX = re.compile(r'\[(\d+)\]') +_PATH_KEY = re.compile(r'\.([^\.\[\'\"]+)|\[\'([^\']+)\'\]|\["([^"]+)"\]') + + +def _parse_path(path): + """Parse a JSONPath string like '$.a[0].b' into a list of str/int segments.""" + if not path or path[0] != '$': + raise ValueError(f'Invalid variant path (must start with $): {path!r}') + segments = [] + remaining = path[1:] + while remaining: + m = _PATH_INDEX.match(remaining) + if m: + segments.append(int(m.group(1))) + remaining = remaining[m.end():] + continue + m = _PATH_KEY.match(remaining) + if m: + key = m.group(1) or m.group(2) or m.group(3) + segments.append(key) + remaining = remaining[m.end():] + continue + raise ValueError(f'Invalid variant path segment in {path!r} near {remaining!r}') + return segments + + +# 
--------------------------------------------------------------------------- +# Cast helpers +# --------------------------------------------------------------------------- + +def _cast(v, cast_type): + """Cast a GenericVariant to a Python type specified by cast_type string. + + Supported cast_type values (case-insensitive): + boolean, int / tinyint / smallint / bigint / long, + float / double, string / varchar / char, + date, timestamp, timestamp_ntz, decimal, binary + """ + ct = cast_type.lower() + vtype = v.get_type() + + if vtype == Type.NULL: + return None + + if ct == 'boolean': + if vtype == Type.BOOLEAN: + return v.get_boolean() + if vtype == Type.STRING: + return v.get_string().lower() == 'true' + if vtype == Type.LONG: + return v.get_long() != 0 + return None + + if ct in ('int', 'tinyint', 'smallint', 'bigint', 'long'): + if vtype == Type.LONG: + return v.get_long() + if vtype == Type.DOUBLE: + return int(v.get_double()) + if vtype == Type.FLOAT: + return int(v.get_float()) + if vtype == Type.DECIMAL: + return int(v.get_decimal()) + if vtype == Type.BOOLEAN: + return 1 if v.get_boolean() else 0 + if vtype == Type.STRING: + return int(v.get_string()) + return None + + if ct in ('float', 'double'): + if vtype == Type.DOUBLE: + return v.get_double() + if vtype == Type.FLOAT: + return float(v.get_float()) + if vtype == Type.LONG: + return float(v.get_long()) + if vtype == Type.DECIMAL: + return float(v.get_decimal()) + if vtype == Type.STRING: + return float(v.get_string()) + return None + + if ct in ('string', 'varchar', 'char'): + if vtype == Type.STRING: + return v.get_string() + return v.to_json() + + if ct == 'date': + if vtype == Type.DATE: + return _EPOCH_DATE + datetime.timedelta(days=int(v.get_long())) + if vtype == Type.STRING: + return datetime.date.fromisoformat(v.get_string()) + return None + + if ct == 'timestamp': + if vtype == Type.TIMESTAMP: + micros = v.get_long() + return _EPOCH_DT_UTC + datetime.timedelta(microseconds=micros) + return None 
+ + if ct == 'timestamp_ntz': + if vtype == Type.TIMESTAMP_NTZ: + micros = v.get_long() + return _EPOCH_DT_NTZ + datetime.timedelta(microseconds=micros) + return None + + if ct == 'decimal': + if vtype == Type.DECIMAL: + return v.get_decimal() + if vtype == Type.LONG: + return _decimal.Decimal(v.get_long()) + if vtype == Type.DOUBLE: + return _decimal.Decimal(str(v.get_double())) + if vtype == Type.STRING: + return _decimal.Decimal(v.get_string()) + return None + + if ct == 'binary': + if vtype == Type.BINARY: + return v.get_binary() + return None + + raise ValueError(f'Unsupported cast_type: {cast_type!r}') + + +# --------------------------------------------------------------------------- +# GenericVariantBuilder +# --------------------------------------------------------------------------- + +class _GenericVariantBuilder: + """Builds a GenericVariant from Python values or JSON strings. + + Mirrors GenericVariantBuilder.java. + """ + + def __init__(self): + self._buf = bytearray(128) + self._pos = 0 + self._dict = {} # key str -> id int + self._keys = [] # id -> key bytes + + # -- dict management -- + + def _get_or_add_key(self, key): + if key not in self._dict: + kid = len(self._keys) + self._dict[key] = kid + self._keys.append(key.encode('utf-8')) + return self._dict[key] + + # -- buffer management -- + + def _ensure(self, n): + needed = self._pos + n + if needed > len(self._buf): + new_cap = max(needed, len(self._buf) * 2) + new_buf = bytearray(new_cap) + new_buf[:self._pos] = self._buf[:self._pos] + self._buf = new_buf + + def _write_byte(self, b): + self._ensure(1) + self._buf[self._pos] = b & 0xFF + self._pos += 1 + + def _write_le(self, value, n): + self._ensure(n) + _write_le(self._buf, self._pos, value, n) + self._pos += n + + # -- primitives -- + + def append_null(self): + self._write_byte(_primitive_header(_NULL)) + + def append_boolean(self, b): + self._write_byte(_primitive_header(_TRUE if b else _FALSE)) + + def append_long(self, n): + if -(1 << 7) 
<= n < (1 << 7): + self._write_byte(_primitive_header(_INT1)) + self._write_le(n & 0xFF, 1) + elif -(1 << 15) <= n < (1 << 15): + self._write_byte(_primitive_header(_INT2)) + self._write_le(n & 0xFFFF, 2) + elif -(1 << 31) <= n < (1 << 31): + self._write_byte(_primitive_header(_INT4)) + self._write_le(n & 0xFFFFFFFF, 4) + else: + self._write_byte(_primitive_header(_INT8)) + self._write_le(n & 0xFFFFFFFFFFFFFFFF, 8) + + def append_double(self, d): + self._write_byte(_primitive_header(_DOUBLE)) + self._ensure(8) + struct.pack_into(' _U8_MAX + size_bytes = _U32_SIZE if large_size else 1 + max_id = max((f[1] for f in fields), default=0) + id_size = _get_int_size(max_id) + offset_size = _get_int_size(data_size) + header_size = 1 + size_bytes + size * id_size + (size + 1) * offset_size + + self._ensure(header_size) + # Shift field data right to make room for header. + dst = start + header_size + src = start + self._buf[dst:dst + data_size] = self._buf[src:src + data_size] + self._pos += header_size + + self._buf[start] = _object_header(large_size, id_size, offset_size) + _write_le(self._buf, start + 1, size, size_bytes) + id_start = start + 1 + size_bytes + offset_start = id_start + size * id_size + for i, (_, fid, offset) in enumerate(fields): + _write_le(self._buf, id_start + i * id_size, fid, id_size) + _write_le(self._buf, offset_start + i * offset_size, offset, offset_size) + _write_le(self._buf, offset_start + size * offset_size, data_size, offset_size) + + def _finish_writing_array(self, start, offsets): + size = len(offsets) + data_size = self._pos - start + large_size = size > _U8_MAX + size_bytes = _U32_SIZE if large_size else 1 + offset_size = _get_int_size(data_size) + header_size = 1 + size_bytes + (size + 1) * offset_size + + self._ensure(header_size) + dst = start + header_size + self._buf[dst:dst + data_size] = self._buf[start:start + data_size] + self._pos += header_size + + self._buf[start] = _array_header(large_size, offset_size) + _write_le(self._buf, 
start + 1, size, size_bytes) + offset_start = start + 1 + size_bytes + for i, off in enumerate(offsets): + _write_le(self._buf, offset_start + i * offset_size, off, offset_size) + _write_le(self._buf, offset_start + size * offset_size, data_size, offset_size) + + # -- build from Python value -- + + def build_python(self, obj): + """Recursively encode a Python value into the variant binary buffer.""" + if obj is None: + self.append_null() + elif isinstance(obj, bool): # must be before int check + self.append_boolean(obj) + elif isinstance(obj, int): + self.append_long(obj) + elif isinstance(obj, float): + self.append_double(obj) + elif isinstance(obj, _decimal.Decimal): + self._try_decimal_or_double(obj) + elif isinstance(obj, str): + self.append_string(obj) + elif isinstance(obj, dict): + fields = [] + start = self._pos + for key, val in obj.items(): + fid = self._get_or_add_key(key) + offset = self._pos - start + fields.append((key, fid, offset)) + self.build_python(val) + self._finish_writing_object(start, fields) + elif isinstance(obj, (list, tuple)): + elem_offsets = [] + start = self._pos + for val in obj: + elem_offsets.append(self._pos - start) + self.build_python(val) + self._finish_writing_array(start, elem_offsets) + elif isinstance(obj, bytes): + self.append_binary(obj) + else: + raise TypeError(f'Unsupported Python type for variant encoding: {type(obj).__name__}') + + def _try_decimal_or_double(self, d): + """Encode as DECIMAL if precision/scale fit, otherwise as DOUBLE.""" + try: + sign, digits, exponent = d.as_tuple() + # Positive exponent means scientific notation (e.g. 
1.5e10) → use DOUBLE + if exponent > 0: + self.append_double(float(d)) + return + scale = -exponent if exponent < 0 else 0 + precision = len(digits) + if scale <= _MAX_DECIMAL16_PRECISION and precision <= _MAX_DECIMAL16_PRECISION: + self.append_decimal(d) + return + except Exception: + pass + self.append_double(float(d)) + + # -- result -- + + def result(self): + """Build metadata and return the completed GenericVariant.""" + n_keys = len(self._keys) + total_str_size = sum(len(k) for k in self._keys) + max_size = max(total_str_size, n_keys, 0) + offset_size = _get_int_size(max_size) if max_size > 0 else 1 + + # metadata layout: + # [0] : version byte | ((offset_size-1) << 6) + # [1..offset_size] : dictSize (n_keys) + # [(offset_size+1)..(offset_size+1+(n_keys+1)*offset_size-1)] : offsets + # remaining : UTF-8 key strings + offset_start = 1 + offset_size + string_start = offset_start + (n_keys + 1) * offset_size + metadata_size = string_start + total_str_size + + metadata = bytearray(metadata_size) + metadata[0] = _VERSION | ((offset_size - 1) << 6) + _write_le(metadata, 1, n_keys, offset_size) + + current_offset = 0 + for i, key_bytes in enumerate(self._keys): + _write_le(metadata, offset_start + i * offset_size, current_offset, offset_size) + klen = len(key_bytes) + metadata[string_start + current_offset:string_start + current_offset + klen] = key_bytes + current_offset += klen + _write_le(metadata, offset_start + n_keys * offset_size, current_offset, offset_size) + + return GenericVariant(bytes(self._buf[:self._pos]), bytes(metadata)) + + +# --------------------------------------------------------------------------- +# GenericVariant +# --------------------------------------------------------------------------- + +class GenericVariant: + """Python representation of a Paimon/Parquet VARIANT value. 
+ + A VARIANT value is stored as two byte arrays: + value – encoded payload (Parquet Variant binary spec) + metadata – key dictionary for object field names + + Typical usage:: + + # Construct from a JSON string + v = GenericVariant.from_json('{"age": 30, "city": "Beijing"}') + print(v.to_json()) # '{"age":30,"city":"Beijing"}' + print(v.variant_get('$.age', 'int')) # 30 + print(v.variant_get('$.city', 'string')) # 'Beijing' + + # Construct from raw bytes (e.g. what to_arrow() returns for a VARIANT column) + row = result.column('payload')[0].as_py() # {'value': bytes, 'metadata': bytes} + v = GenericVariant.from_dict(row) + print(v.to_python()) # {'age': 30, 'city': 'Beijing'} + """ + + __slots__ = ('_value', '_metadata', '_pos') + + def __init__(self, value: bytes, metadata: bytes, _pos: int = 0): + self._value = bytes(value) + self._metadata = bytes(metadata) + self._pos = _pos + if len(metadata) < 1 or (metadata[0] & _VERSION_MASK) != _VERSION: + raise ValueError('MALFORMED_VARIANT: invalid metadata version') + + # -- constructors -- + + @classmethod + def from_json(cls, json_str: str) -> 'GenericVariant': + """Parse a JSON string and encode it as a VARIANT binary.""" + # parse_float=_decimal.Decimal preserves decimal precision as in Java's tryParseDecimal + obj = _json.loads(json_str, parse_float=_decimal.Decimal) + builder = _GenericVariantBuilder() + builder.build_python(obj) + return builder.result() + + @classmethod + def from_python(cls, obj) -> 'GenericVariant': + """Encode a Python object (dict / list / str / int / float / bool / None) as VARIANT.""" + builder = _GenericVariantBuilder() + builder.build_python(obj) + return builder.result() + + @classmethod + def from_dict(cls, d: dict) -> 'GenericVariant': + """Wrap raw bytes from a PyArrow VARIANT struct: {'value': bytes, 'metadata': bytes}.""" + return cls(bytes(d['value']), bytes(d['metadata'])) + + @classmethod + def to_arrow_array(cls, variants): + """Convert a list of GenericVariant (or None) to a 
PyArrow StructArray. + + The returned array has the canonical VARIANT layout:: + + struct + + Example:: + + gv1 = GenericVariant.from_json('{"age":30}') + gv2 = GenericVariant.from_json('[1,2,3]') + col = GenericVariant.to_arrow_array([gv1, gv2]) + table = pa.table({'id': [1, 2], 'payload': col}) + """ + import pyarrow as _pa + + values = [] + metadatas = [] + mask = [] + for v in variants: + if v is None: + values.append(b'') + metadatas.append(b'') + mask.append(True) + else: + values.append(v.value()) + metadatas.append(v.metadata()) + mask.append(False) + + variant_type = _pa.struct([ + _pa.field('value', _pa.binary(), nullable=False), + _pa.field('metadata', _pa.binary(), nullable=False), + ]) + return _pa.StructArray.from_arrays( + [_pa.array(values, type=_pa.binary()), + _pa.array(metadatas, type=_pa.binary())], + fields=[variant_type.field(0), variant_type.field(1)], + mask=_pa.array(mask, type=_pa.bool_()), + ) + + # -- raw bytes -- + + def value(self) -> bytes: + """Return the value payload bytes (sliced to the exact variant extent).""" + if self._pos == 0: + return self._value + size = _value_size(self._value, self._pos) + return self._value[self._pos:self._pos + size] + + def metadata(self) -> bytes: + """Return the metadata (key-dictionary) bytes.""" + return self._metadata + + # -- type introspection -- + + def get_type(self) -> Type: + return _get_type(self._value, self._pos) + + # -- primitive getters -- + + def get_boolean(self) -> bool: + b = self._value[self._pos] + type_info = (b >> 2) & 0x3F + if (b & 0x3) != _PRIMITIVE or type_info not in (_TRUE, _FALSE): + raise TypeError('Expected BOOLEAN variant') + return type_info == _TRUE + + def get_long(self) -> int: + b = self._value[self._pos] + type_info = (b >> 2) & 0x3F + if (b & 0x3) != _PRIMITIVE: + raise TypeError('Expected integer/date/timestamp variant') + sizes = {_INT1: 1, _INT2: 2, _INT4: 4, _INT8: 8, + _DATE: 4, _TIMESTAMP: 8, _TIMESTAMP_NTZ: 8} + n = sizes.get(type_info) + if n is None: 
+ raise TypeError(f'Expected LONG-family variant, got type_info={type_info}') + return _read_signed(self._value, self._pos + 1, n) + + def get_double(self) -> float: + b = self._value[self._pos] + if (b & 0x3) != _PRIMITIVE or (b >> 2) & 0x3F != _DOUBLE: + raise TypeError('Expected DOUBLE variant') + return struct.unpack_from(' float: + b = self._value[self._pos] + if (b & 0x3) != _PRIMITIVE or (b >> 2) & 0x3F != _FLOAT: + raise TypeError('Expected FLOAT variant') + return struct.unpack_from(' _decimal.Decimal: + b = self._value[self._pos] + type_info = (b >> 2) & 0x3F + if (b & 0x3) != _PRIMITIVE or type_info not in (_DECIMAL4, _DECIMAL8, _DECIMAL16): + raise TypeError('Expected DECIMAL variant') + scale = self._value[self._pos + 1] & 0xFF + if type_info == _DECIMAL4: + unscaled = _read_signed(self._value, self._pos + 2, 4) + elif type_info == _DECIMAL8: + unscaled = _read_signed(self._value, self._pos + 2, 8) + else: + raw = bytes(self._value[self._pos + 2:self._pos + 18]) + unscaled = int.from_bytes(raw, 'little', signed=True) + return _decimal.Decimal(unscaled) / (_decimal.Decimal(10) ** scale) + + def get_string(self) -> str: + b = self._value[self._pos] + basic_type = b & 0x3 + type_info = (b >> 2) & 0x3F + if basic_type == _SHORT_STR: + start = self._pos + 1 + return self._value[start:start + type_info].decode('utf-8') + if basic_type == _PRIMITIVE and type_info == _LONG_STR: + length = _read_unsigned(self._value, self._pos + 1, _U32_SIZE) + start = self._pos + 1 + _U32_SIZE + return self._value[start:start + length].decode('utf-8') + raise TypeError('Expected STRING variant') + + def get_binary(self) -> bytes: + b = self._value[self._pos] + if (b & 0x3) != _PRIMITIVE or (b >> 2) & 0x3F != _BINARY: + raise TypeError('Expected BINARY variant') + length = _read_unsigned(self._value, self._pos + 1, _U32_SIZE) + start = self._pos + 1 + _U32_SIZE + return bytes(self._value[start:start + length]) + + def get_uuid(self) -> _uuid.UUID: + b = self._value[self._pos] + 
if (b & 0x3) != _PRIMITIVE or (b >> 2) & 0x3F != _UUID: + raise TypeError('Expected UUID variant') + raw = bytes(self._value[self._pos + 1:self._pos + 17]) + return _uuid.UUID(bytes=raw) + + # -- object navigation -- + + def object_size(self) -> int: + """Number of fields in an OBJECT variant.""" + return _handle_object( + self._value, self._pos, + lambda size, *_: size, + ) + + def get_field_by_key(self, key: str): + """Return the field GenericVariant for the given key, or None if not found.""" + def _lookup(size, id_size, offset_size, id_start, offset_start, data_start): + # Binary search for large objects, linear for small ones + if size < _BINARY_SEARCH_THRESHOLD: + for i in range(size): + fid = _read_unsigned(self._value, id_start + id_size * i, id_size) + if key == _get_metadata_key(self._metadata, fid): + offset = _read_unsigned( + self._value, offset_start + offset_size * i, offset_size) + return GenericVariant(self._value, self._metadata, + data_start + offset) + else: + lo, hi = 0, size - 1 + while lo <= hi: + mid = (lo + hi) >> 1 + fid = _read_unsigned(self._value, id_start + id_size * mid, id_size) + cmp = _get_metadata_key(self._metadata, fid) + if cmp < key: + lo = mid + 1 + elif cmp > key: + hi = mid - 1 + else: + offset = _read_unsigned( + self._value, offset_start + offset_size * mid, offset_size) + return GenericVariant(self._value, self._metadata, + data_start + offset) + return None + + return _handle_object(self._value, self._pos, _lookup) + + def get_field_at_index(self, index: int): + """Return (key, GenericVariant) for the field at position index, or None.""" + def _get(size, id_size, offset_size, id_start, offset_start, data_start): + if index < 0 or index >= size: + return None + fid = _read_unsigned(self._value, id_start + id_size * index, id_size) + key = _get_metadata_key(self._metadata, fid) + offset = _read_unsigned( + self._value, offset_start + offset_size * index, offset_size) + child = GenericVariant(self._value, self._metadata, 
data_start + offset) + return (key, child) + + return _handle_object(self._value, self._pos, _get) + + # -- array navigation -- + + def array_size(self) -> int: + """Number of elements in an ARRAY variant.""" + return _handle_array(self._value, self._pos, lambda size, *_: size) + + def get_element_at_index(self, index: int): + """Return the element GenericVariant at position index, or None.""" + def _get(size, offset_size, offset_start, data_start): + if index < 0 or index >= size: + return None + offset = _read_unsigned( + self._value, offset_start + offset_size * index, offset_size) + return GenericVariant(self._value, self._metadata, data_start + offset) + + return _handle_array(self._value, self._pos, _get) + + # -- high-level API -- + + def to_json(self) -> str: + """Decode the variant to a JSON string.""" + parts = [] + self._to_json_impl(self._value, self._metadata, self._pos, parts) + return ''.join(parts) + + def _to_json_impl(self, value, metadata, pos, parts): + vtype = _get_type(value, pos) + if vtype == Type.OBJECT: + def _render(size, id_size, offset_size, id_start, offset_start, data_start): + parts.append('{') + for i in range(size): + fid = _read_unsigned(value, id_start + id_size * i, id_size) + key = _get_metadata_key(metadata, fid) + offset = _read_unsigned( + value, offset_start + offset_size * i, offset_size) + if i != 0: + parts.append(',') + parts.append(_json.dumps(key)) + parts.append(':') + self._to_json_impl(value, metadata, data_start + offset, parts) + parts.append('}') + _handle_object(value, pos, _render) + elif vtype == Type.ARRAY: + def _render_arr(size, offset_size, offset_start, data_start): + parts.append('[') + for i in range(size): + offset = _read_unsigned( + value, offset_start + offset_size * i, offset_size) + if i != 0: + parts.append(',') + self._to_json_impl(value, metadata, data_start + offset, parts) + parts.append(']') + _handle_array(value, pos, _render_arr) + else: + sub = GenericVariant(value, metadata, pos) + if 
vtype == Type.NULL: + parts.append('null') + elif vtype == Type.BOOLEAN: + parts.append('true' if sub.get_boolean() else 'false') + elif vtype == Type.LONG: + parts.append(str(sub.get_long())) + elif vtype == Type.STRING: + parts.append(_json.dumps(sub.get_string())) + elif vtype == Type.DOUBLE: + d = sub.get_double() + parts.append(_json.dumps(d) if d == d and d not in (float('inf'), float('-inf')) + else _json.dumps(str(d))) + elif vtype == Type.FLOAT: + f = sub.get_float() + parts.append(_json.dumps(float(f)) if f == f and f not in (float('inf'), float('-inf')) + else _json.dumps(str(f))) + elif vtype == Type.DECIMAL: + parts.append(str(sub.get_decimal().normalize())) + elif vtype == Type.DATE: + days = int(sub.get_long()) + parts.append(_json.dumps(str(_EPOCH_DATE + datetime.timedelta(days=days)))) + elif vtype == Type.TIMESTAMP: + micros = sub.get_long() + dt = _EPOCH_DT_UTC + datetime.timedelta(microseconds=micros) + parts.append(_json.dumps(dt.strftime('%Y-%m-%d %H:%M:%S.%f+00:00'))) + elif vtype == Type.TIMESTAMP_NTZ: + micros = sub.get_long() + dt = _EPOCH_DT_NTZ + datetime.timedelta(microseconds=micros) + parts.append(_json.dumps(dt.strftime('%Y-%m-%d %H:%M:%S.%f'))) + elif vtype == Type.BINARY: + import base64 + parts.append(_json.dumps(base64.b64encode(sub.get_binary()).decode('ascii'))) + elif vtype == Type.UUID: + parts.append(_json.dumps(str(sub.get_uuid()))) + + def to_python(self): + """Decode the variant to native Python objects. 
+ + Object → dict + Array → list + Boolean → bool + Integer → int + Double/Float → float + Decimal → decimal.Decimal + String → str + Date → datetime.date + Timestamp → datetime.datetime (UTC-aware) + Timestamp_NTZ → datetime.datetime (naive) + Binary → bytes + UUID → str + Null → None + """ + vtype = self.get_type() + if vtype == Type.NULL: + return None + if vtype == Type.BOOLEAN: + return self.get_boolean() + if vtype == Type.LONG: + return self.get_long() + if vtype == Type.DOUBLE: + return self.get_double() + if vtype == Type.FLOAT: + return float(self.get_float()) + if vtype == Type.DECIMAL: + return self.get_decimal() + if vtype == Type.STRING: + return self.get_string() + if vtype == Type.DATE: + return _EPOCH_DATE + datetime.timedelta(days=int(self.get_long())) + if vtype == Type.TIMESTAMP: + return _EPOCH_DT_UTC + datetime.timedelta(microseconds=self.get_long()) + if vtype == Type.TIMESTAMP_NTZ: + return _EPOCH_DT_NTZ + datetime.timedelta(microseconds=self.get_long()) + if vtype == Type.BINARY: + return self.get_binary() + if vtype == Type.UUID: + return str(self.get_uuid()) + if vtype == Type.OBJECT: + result = {} + for i in range(self.object_size()): + key, child = self.get_field_at_index(i) + result[key] = child.to_python() + return result + if vtype == Type.ARRAY: + return [self.get_element_at_index(i).to_python() + for i in range(self.array_size())] + return None + + def variant_get(self, path: str, cast_type: str = None): + """JSONPath extraction with optional type cast. + + Args: + path: JSONPath expression, e.g. '$.age', '$[0].name', '$.tags[1]' + cast_type: Target type name (case-insensitive), e.g. 'int', 'string', + 'double', 'boolean', 'date', 'timestamp', 'decimal', 'binary'. + If None, returns the native Python value via to_python(). + + Returns: + The extracted value cast to the requested type, or None if the path + does not exist or the cast is not applicable. 
+ """ + v = self + for segment in _parse_path(path): + if isinstance(segment, str): + if v.get_type() != Type.OBJECT: + return None + v = v.get_field_by_key(segment) + if v is None: + return None + else: # int index + if v.get_type() != Type.ARRAY: + return None + v = v.get_element_at_index(segment) + if v is None: + return None + + if cast_type is None: + return v.to_python() + return _cast(v, cast_type) + + # -- dunder -- + + def __repr__(self) -> str: + return f'GenericVariant({self.to_json()!r})' + + def __str__(self) -> str: + return self.to_json() + + def __eq__(self, other) -> bool: + if not isinstance(other, GenericVariant): + return NotImplemented + return self.value() == other.value() and self._metadata == other._metadata + + def __hash__(self): + return hash((self.value(), self._metadata)) diff --git a/paimon-python/pypaimon/schema/data_types.py b/paimon-python/pypaimon/schema/data_types.py index ebb5612c435c..3ec76e7c0ec7 100755 --- a/paimon-python/pypaimon/schema/data_types.py +++ b/paimon-python/pypaimon/schema/data_types.py @@ -454,6 +454,28 @@ def parse_data_field( ) +def _is_variant_struct(pa_type: pyarrow.StructType) -> bool: + """Return True if *pa_type* is the two-field BINARY struct used to encode VARIANT. + + Paimon Java stores VARIANT as a Parquet GROUP with exactly two non-nullable + BINARY primitives: ``value`` (field index 0) and ``metadata`` (field index 1). + PyArrow surfaces this group as a struct type; we fingerprint it here so that + :meth:`PyarrowFieldParser.to_paimon_type` can round-trip it back to VARIANT + instead of misclassifying it as a generic ROW type. + + This heuristic is fragile by necessity — Arrow has no native Variant type yet. + It will not mis-fire on ordinary ROW fields as long as callers do not name two + non-nullable binary columns ``value`` / ``metadata`` at the same nesting level. 
+ """ + if pa_type.num_fields != 2: + return False + f0, f1 = pa_type.field(0), pa_type.field(1) + return ( + f0.name == 'value' and pyarrow.types.is_binary(f0.type) and not f0.nullable + and f1.name == 'metadata' and pyarrow.types.is_binary(f1.type) and not f1.nullable + ) + + class PyarrowFieldParser: @staticmethod @@ -481,6 +503,17 @@ def from_paimon_type(data_type: DataType) -> pyarrow.DataType: return pyarrow.binary() elif type_name == 'BLOB': return pyarrow.large_binary() + elif type_name == 'VARIANT': + # VARIANT is stored in Parquet as a struct with two non-nullable BINARY fields, + # matching Paimon Java's ParquetSchemaConverter encoding: + # required group { required binary value; required binary metadata; } + # 'value' holds the encoded variant payload (Parquet Variant binary spec). + # 'metadata' holds the key-dictionary for object field names. + # PyArrow reads this group transparently as pa.struct; no special reader needed. + return pyarrow.struct([ + pyarrow.field('value', pyarrow.binary(), nullable=False), + pyarrow.field('metadata', pyarrow.binary(), nullable=False), + ]) elif type_name.startswith('DECIMAL'): if type_name == 'DECIMAL': return pyarrow.decimal128(10, 0) # default to 10, 0 @@ -591,6 +624,11 @@ def to_paimon_type(pa_type: pyarrow.DataType, nullable: bool) -> DataType: key_type = PyarrowFieldParser.to_paimon_type(pa_type.key_type, nullable) value_type = PyarrowFieldParser.to_paimon_type(pa_type.item_type, nullable) return MapType(nullable, key_type, value_type) + elif types.is_struct(pa_type) and _is_variant_struct(pa_type): + # Recognise the VARIANT encoding: a struct with exactly two non-nullable + # BINARY fields named 'value' and 'metadata'. Must be checked before the + # generic struct branch to avoid misclassifying it as a ROW type. 
+ return AtomicType('VARIANT', nullable) elif types.is_struct(pa_type): pa_type: pyarrow.StructType fields = [] diff --git a/paimon-python/pypaimon/table/row/generic_row.py b/paimon-python/pypaimon/table/row/generic_row.py index 4aa740de7219..a25c8e8dd35d 100644 --- a/paimon-python/pypaimon/table/row/generic_row.py +++ b/paimon-python/pypaimon/table/row/generic_row.py @@ -143,6 +143,8 @@ def parse_field_value( return cls._parse_binary(bytes_data, base_offset, field_offset) elif type_name == 'BLOB': return cls._parse_blob(bytes_data, base_offset, field_offset) + elif type_name == 'VARIANT': + return cls._parse_variant(bytes_data, base_offset, field_offset) elif type_name.startswith('DECIMAL') or type_name.startswith('NUMERIC'): return cls._parse_decimal(bytes_data, base_offset, field_offset, data_type) elif type_name.startswith('TIMESTAMP'): @@ -152,7 +154,7 @@ def parse_field_value( elif type_name.startswith('TIME'): return cls._parse_time(bytes_data, field_offset) else: - return cls._parse_string(bytes_data, base_offset, field_offset) + raise ValueError(f"Unsupported type in BinaryRow deserialization: {type_name}") @classmethod def _parse_boolean(cls, bytes_data: bytes, field_offset: int) -> bool: @@ -225,6 +227,23 @@ def _parse_binary(cls, bytes_data: bytes, base_offset: int, field_offset: int) - length = (offset_and_len & cls.HIGHEST_SECOND_TO_EIGHTH_BIT) >> 56 return bytes_data[field_offset:field_offset + length] + @classmethod + def _parse_variant(cls, bytes_data: bytes, base_offset: int, field_offset: int) -> dict: + """Deserialize a VARIANT field from BinaryRow format. + + Returns a dict ``{'value': bytes, 'metadata': bytes}`` that mirrors the + PyArrow struct representation used by :meth:`PyarrowFieldParser.from_paimon_type`. + + Note: VARIANT is not a valid primary-key or partition-key type in Paimon, so + this path is only exercised when a VARIANT column appears in an internal + BinaryRow (e.g. a manifest entry), which is an unsupported configuration. 
+ We read the raw binary payload and return a minimal metadata header so that + callers receive a structurally valid object rather than silently corrupt data. + """ + raw = cls._parse_binary(bytes_data, base_offset, field_offset) + # Minimal valid metadata: version=1 (0x01), zero dictionary entries (0x00). + return {'value': raw, 'metadata': b'\x01\x00'} + @classmethod def _parse_blob(cls, bytes_data: bytes, base_offset: int, field_offset: int) -> BlobData: """Parse BLOB data from binary format and return a BlobData instance.""" @@ -302,11 +321,20 @@ def to_bytes(cls, row: Union[GenericRow, BinaryRow]) -> bytes: type_name = field.type.type.upper() if any(type_name.startswith(p) for p in ['CHAR', 'VARCHAR', 'STRING', - 'BINARY', 'VARBINARY', 'BYTES', 'BLOB']): + 'BINARY', 'VARBINARY', 'BYTES', 'BLOB', + 'VARIANT']): if any(type_name.startswith(p) for p in ['CHAR', 'VARCHAR', 'STRING']): value_bytes = str(value).encode('utf-8') elif type_name == 'BLOB': value_bytes = value.to_data() + elif type_name == 'VARIANT': + # Serialize only the 'value' payload. VARIANT is not a valid + # primary-key or partition-key type, so BinaryRow serialization + # of VARIANT is only a safety net for unexpected code paths. 
+ if isinstance(value, dict): + value_bytes = bytes(value.get('value', b'')) + else: + value_bytes = bytes(value) else: value_bytes = bytes(value) diff --git a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py index 3eee324b6c16..fd2da0c340ff 100644 --- a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py +++ b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py @@ -25,6 +25,7 @@ import pyarrow as pa from parameterized import parameterized from pypaimon.catalog.catalog_factory import CatalogFactory +from pypaimon.data.generic_variant import GenericVariant from pypaimon.schema.schema import Schema from pypaimon.read.read_builder import ReadBuilder @@ -670,3 +671,133 @@ def test_compact_conflict_shard_update(self): self.assertIn("conflicts", str(ctx.exception)) tc.close() print(f"Conflict detected as expected: {ctx.exception}") + + def test_read_variant_table(self): + """Read a VARIANT-column table written by Java and verify the struct layout.""" + table = self.catalog.get_table('default.variant_test') + read_builder = table.new_read_builder() + table_scan = read_builder.new_scan() + table_read = read_builder.new_read() + splits = table_scan.plan().splits() + result = table_read.to_arrow(splits) + + self.assertEqual(result.num_rows, 3) + + # VARIANT maps to struct + payload_field = result.schema.field('payload') + self.assertTrue(pa.types.is_struct(payload_field.type), + f"Expected struct type for VARIANT, got {payload_field.type}") + self.assertEqual(payload_field.type.num_fields, 2) + self.assertEqual(payload_field.type.field(0).name, 'value') + self.assertEqual(payload_field.type.field(1).name, 'metadata') + self.assertTrue(pa.types.is_binary(payload_field.type.field(0).type)) + self.assertTrue(pa.types.is_binary(payload_field.type.field(1).type)) + + # All rows should have non-null payload structs + payload_col = result.column('payload') + for i in range(result.num_rows): + 
row = payload_col[i].as_py() + self.assertIsNotNone(row, f"Row {i}: expected non-null VARIANT") + self.assertIn('value', row) + self.assertIn('metadata', row) + self.assertIsInstance(row['value'], bytes) + self.assertIsInstance(row['metadata'], bytes) + self.assertGreater(len(row['value']), 0) + + # Verify GenericVariant decoding (Java -> Python roundtrip) + result_sorted = table_sort_by(result, 'id') + id_list = result_sorted.column('id').to_pylist() + payload_list = result_sorted.column('payload').to_pylist() + + # Row 1: Alice, {"age":30,"city":"Beijing"} + gv_alice = GenericVariant.from_dict(payload_list[id_list.index(1)]) + self.assertEqual(gv_alice.variant_get('$.age', 'int'), 30) + self.assertEqual(gv_alice.variant_get('$.city', 'string'), 'Beijing') + + # Row 2: Bob, {"age":25,"city":"Shanghai"} + gv_bob = GenericVariant.from_dict(payload_list[id_list.index(2)]) + self.assertEqual(gv_bob.variant_get('$.age', 'int'), 25) + self.assertEqual(gv_bob.variant_get('$.city', 'string'), 'Shanghai') + + # Row 3: Carol, [1,2,3] + gv_carol = GenericVariant.from_dict(payload_list[id_list.index(3)]) + self.assertEqual(gv_carol.to_python(), [1, 2, 3]) + + print(f"test_read_variant_table: verified {result.num_rows} VARIANT rows") + + def test_py_write_read_variant_table(self): + """Python-only write+read test for VARIANT columns using GenericVariant.""" + variant_type = pa.struct([ + pa.field('value', pa.binary(), nullable=False), + pa.field('metadata', pa.binary(), nullable=False), + ]) + pa_schema = pa.schema([ + ('id', pa.int32()), + ('name', pa.string()), + ('payload', variant_type), + ]) + + schema = Schema.from_pyarrow_schema( + pa_schema, + options={'bucket': '-1'} + ) + + table_name = 'default.py_variant_test' + self.catalog.create_table(table_name, schema, True) + table = self.catalog.get_table(table_name) + + # Construct GenericVariant objects + gv1 = GenericVariant.from_json('{"name":"test","value":42}') + gv2 = GenericVariant.from_json('[10,20,30]') + gv3 = 
GenericVariant.from_json('"hello"') + gv4 = GenericVariant.from_json('null') + + # Build the VARIANT column + variant_col = GenericVariant.to_arrow_array([gv1, gv2, gv3, gv4]) + + data = pa.table({ + 'id': pa.array([1, 2, 3, 4], type=pa.int32()), + 'name': pa.array(['row1', 'row2', 'row3', 'row4'], type=pa.string()), + 'payload': variant_col, + }, schema=pa_schema) + + # Write + write_builder = table.new_batch_write_builder() + table_write = write_builder.new_write() + table_commit = write_builder.new_commit() + table_write.write_arrow(data) + table_commit.commit(table_write.prepare_commit()) + table_write.close() + table_commit.close() + + # Read back + read_builder = table.new_read_builder() + table_scan = read_builder.new_scan() + table_read = read_builder.new_read() + splits = table_scan.plan().splits() + result = table_read.to_arrow(splits) + + self.assertEqual(result.num_rows, 4) + + # Sort by id for deterministic assertion + result = table_sort_by(result, 'id') + payloads = result.column('payload').to_pylist() + + # Row 1: object + gv = GenericVariant.from_dict(payloads[0]) + self.assertEqual(gv.variant_get('$.name', 'string'), 'test') + self.assertEqual(gv.variant_get('$.value', 'int'), 42) + + # Row 2: array + gv = GenericVariant.from_dict(payloads[1]) + self.assertEqual(gv.to_python(), [10, 20, 30]) + + # Row 3: string + gv = GenericVariant.from_dict(payloads[2]) + self.assertEqual(gv.to_python(), 'hello') + + # Row 4: null + gv = GenericVariant.from_dict(payloads[3]) + self.assertIsNone(gv.to_python()) + + print(f"test_py_write_read_variant_table: verified {result.num_rows} rows") diff --git a/paimon-python/pypaimon/tests/generic_variant_test.py b/paimon-python/pypaimon/tests/generic_variant_test.py new file mode 100644 index 000000000000..283ca44838e6 --- /dev/null +++ b/paimon-python/pypaimon/tests/generic_variant_test.py @@ -0,0 +1,405 @@ +################################################################################ +# Licensed to the Apache 
Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +################################################################################ + +"""Tests for GenericVariant: binary encoding, decoding, to_json, and variant_get.""" + +import decimal +import json +import unittest + +from pypaimon.data.generic_variant import GenericVariant, Type + + +def _roundtrip(json_str): + """Build from JSON, decode back to JSON, and compare as normalised dicts/values.""" + v = GenericVariant.from_json(json_str) + return json.loads(v.to_json()) + + +class TestPrimitives(unittest.TestCase): + + def test_null(self): + v = GenericVariant.from_json('null') + self.assertEqual(v.get_type(), Type.NULL) + self.assertIsNone(v.to_python()) + self.assertEqual(v.to_json(), 'null') + + def test_true(self): + v = GenericVariant.from_json('true') + self.assertEqual(v.get_type(), Type.BOOLEAN) + self.assertTrue(v.get_boolean()) + self.assertEqual(v.to_json(), 'true') + + def test_false(self): + v = GenericVariant.from_json('false') + self.assertFalse(v.get_boolean()) + self.assertEqual(v.to_json(), 'false') + + def test_int_small(self): + v = GenericVariant.from_json('42') + self.assertEqual(v.get_type(), Type.LONG) + self.assertEqual(v.get_long(), 42) + self.assertEqual(v.to_json(), '42') + + def 
test_int_negative(self): + v = GenericVariant.from_json('-100') + self.assertEqual(v.get_long(), -100) + + def test_int_int2_boundary(self): + v = GenericVariant.from_json('1000') + self.assertEqual(v.get_long(), 1000) + + def test_int_int4_boundary(self): + v = GenericVariant.from_json('100000') + self.assertEqual(v.get_long(), 100000) + + def test_int_int8(self): + large = 2 ** 33 + v = GenericVariant.from_json(str(large)) + self.assertEqual(v.get_long(), large) + + def test_float_double(self): + v = GenericVariant.from_json('1.5') + # 1.5 has exact decimal representation so it may be encoded as DECIMAL or DOUBLE + py = v.to_python() + self.assertAlmostEqual(float(py), 1.5) + + def test_float_scientific(self): + v = GenericVariant.from_json('1.5e10') + self.assertEqual(v.get_type(), Type.DOUBLE) + self.assertAlmostEqual(v.get_double(), 1.5e10) + + def test_string_short(self): + v = GenericVariant.from_json('"hello"') + self.assertEqual(v.get_type(), Type.STRING) + self.assertEqual(v.get_string(), 'hello') + self.assertEqual(v.to_json(), '"hello"') + + def test_string_long(self): + long_str = 'x' * 100 # > MAX_SHORT_STR_SIZE (63) + v = GenericVariant.from_json(json.dumps(long_str)) + self.assertEqual(v.get_string(), long_str) + self.assertEqual(v.to_python(), long_str) + + def test_string_unicode(self): + v = GenericVariant.from_json('"北京"') + self.assertEqual(v.get_string(), '北京') + + def test_decimal_precision(self): + v = GenericVariant.from_json('100.99') + # should be encoded as DECIMAL, not DOUBLE + self.assertEqual(v.get_type(), Type.DECIMAL) + self.assertEqual(float(v.get_decimal()), 100.99) + + +class TestObject(unittest.TestCase): + + def _obj(self): + return GenericVariant.from_json('{"age":30,"city":"Beijing","active":true}') + + def test_type(self): + self.assertEqual(self._obj().get_type(), Type.OBJECT) + + def test_object_size(self): + self.assertEqual(self._obj().object_size(), 3) + + def test_get_field_by_key(self): + v = self._obj() + 
self.assertEqual(v.get_field_by_key('age').get_long(), 30) + self.assertEqual(v.get_field_by_key('city').get_string(), 'Beijing') + self.assertTrue(v.get_field_by_key('active').get_boolean()) + + def test_get_field_missing(self): + self.assertIsNone(self._obj().get_field_by_key('missing')) + + def test_get_field_at_index(self): + v = self._obj() + keys = {v.get_field_at_index(i)[0] for i in range(v.object_size())} + self.assertEqual(keys, {'age', 'city', 'active'}) + + def test_to_python(self): + result = self._obj().to_python() + self.assertIsInstance(result, dict) + self.assertEqual(result['age'], 30) + self.assertEqual(result['city'], 'Beijing') + self.assertTrue(result['active']) + + def test_to_json_roundtrip(self): + result = _roundtrip('{"age":30,"city":"Beijing","active":true}') + self.assertEqual(result, {'age': 30, 'city': 'Beijing', 'active': True}) + + def test_fields_sorted_alphabetically(self): + """Variant objects must store fields sorted by key name.""" + v = GenericVariant.from_json('{"z":1,"a":2,"m":3}') + keys = [v.get_field_at_index(i)[0] for i in range(v.object_size())] + self.assertEqual(keys, sorted(keys)) + + def test_nested_object(self): + v = GenericVariant.from_json('{"user":{"name":"Alice","age":25}}') + user = v.get_field_by_key('user') + self.assertEqual(user.get_field_by_key('name').get_string(), 'Alice') + self.assertEqual(user.get_field_by_key('age').get_long(), 25) + + +class TestArray(unittest.TestCase): + + def _arr(self): + return GenericVariant.from_json('[1,2,3]') + + def test_type(self): + self.assertEqual(self._arr().get_type(), Type.ARRAY) + + def test_array_size(self): + self.assertEqual(self._arr().array_size(), 3) + + def test_get_element_at_index(self): + v = self._arr() + self.assertEqual(v.get_element_at_index(0).get_long(), 1) + self.assertEqual(v.get_element_at_index(2).get_long(), 3) + + def test_out_of_bounds(self): + self.assertIsNone(self._arr().get_element_at_index(99)) + + def test_to_python(self): + 
self.assertEqual(self._arr().to_python(), [1, 2, 3]) + + def test_mixed_array(self): + v = GenericVariant.from_json('[1,"two",null,true]') + py = v.to_python() + self.assertEqual(py, [1, 'two', None, True]) + + def test_nested_array(self): + v = GenericVariant.from_json('[[1,2],[3,4]]') + self.assertEqual(v.get_element_at_index(0).to_python(), [1, 2]) + self.assertEqual(v.get_element_at_index(1).to_python(), [3, 4]) + + +class TestVariantGet(unittest.TestCase): + + def setUp(self): + self.v = GenericVariant.from_json( + '{"name":"Alice","age":30,"score":9.5,"active":true,' + '"address":{"city":"Beijing","zip":"100000"},' + '"tags":["python","data"],"balance":1234.56}' + ) + + def test_get_string(self): + self.assertEqual(self.v.variant_get('$.name', 'string'), 'Alice') + + def test_get_int(self): + self.assertEqual(self.v.variant_get('$.age', 'int'), 30) + + def test_get_long(self): + self.assertEqual(self.v.variant_get('$.age', 'long'), 30) + + def test_get_double(self): + self.assertAlmostEqual(self.v.variant_get('$.score', 'double'), 9.5, places=5) + + def test_get_boolean(self): + self.assertTrue(self.v.variant_get('$.active', 'boolean')) + + def test_nested_field(self): + self.assertEqual(self.v.variant_get('$.address.city', 'string'), 'Beijing') + + def test_array_index(self): + self.assertEqual(self.v.variant_get('$.tags[0]', 'string'), 'python') + self.assertEqual(self.v.variant_get('$.tags[1]', 'string'), 'data') + + def test_missing_path_returns_none(self): + self.assertIsNone(self.v.variant_get('$.nonexistent')) + + def test_type_mismatch_returns_none(self): + # $.tags is an array, not a primitive, cast to int should return None + self.assertIsNone(self.v.variant_get('$.tags', 'int')) + + def test_no_cast_returns_python_value(self): + result = self.v.variant_get('$.age') + self.assertEqual(result, 30) + + def test_root_dollar_only(self): + v = GenericVariant.from_json('42') + self.assertEqual(v.variant_get('$', 'int'), 42) + + def 
test_bracket_key_syntax(self): + self.assertEqual(self.v.variant_get("$['name']", 'string'), 'Alice') + + def test_decimal_cast(self): + result = self.v.variant_get('$.balance', 'decimal') + self.assertIsInstance(result, decimal.Decimal) + self.assertAlmostEqual(float(result), 1234.56, places=2) + + def test_string_cast_on_int(self): + result = self.v.variant_get('$.age', 'string') + # Should produce JSON representation of the integer + self.assertEqual(result, '30') + + +class TestFromDict(unittest.TestCase): + """Test constructing GenericVariant from PyArrow-style {'value': ..., 'metadata': ...}.""" + + def test_roundtrip_via_dict(self): + original = GenericVariant.from_json('{"x":1,"y":2}') + d = {'value': original.value(), 'metadata': original.metadata()} + restored = GenericVariant.from_dict(d) + self.assertEqual(restored.to_json(), original.to_json()) + + def test_from_dict_type(self): + original = GenericVariant.from_json('[1,2,3]') + restored = GenericVariant.from_dict( + {'value': original.value(), 'metadata': original.metadata()}) + self.assertEqual(restored.get_type(), Type.ARRAY) + self.assertEqual(restored.to_python(), [1, 2, 3]) + + +class TestFromPython(unittest.TestCase): + """Test GenericVariant.from_python().""" + + def test_from_python_dict(self): + v = GenericVariant.from_python({'a': 1, 'b': 'hello'}) + self.assertEqual(v.variant_get('$.a', 'int'), 1) + self.assertEqual(v.variant_get('$.b', 'string'), 'hello') + + def test_from_python_list(self): + v = GenericVariant.from_python([10, 20, 30]) + self.assertEqual(v.to_python(), [10, 20, 30]) + + def test_from_python_none(self): + v = GenericVariant.from_python(None) + self.assertIsNone(v.to_python()) + + def test_from_python_bytes(self): + v = GenericVariant.from_python(b'\x01\x02\x03') + self.assertEqual(v.get_type(), Type.BINARY) + self.assertEqual(v.get_binary(), b'\x01\x02\x03') + + +class TestToArrowArray(unittest.TestCase): + """Test GenericVariant.to_arrow_array().""" + + def 
test_basic(self):
+        import pyarrow as pa
+        gv1 = GenericVariant.from_json('{"a":1}')
+        gv2 = GenericVariant.from_json('[1,2]')
+        arr = GenericVariant.to_arrow_array([gv1, gv2])
+        self.assertIsInstance(arr, pa.StructArray)
+        self.assertEqual(len(arr), 2)
+        # Roundtrip check
+        row0 = arr[0].as_py()
+        restored = GenericVariant.from_dict(row0)
+        self.assertEqual(restored.variant_get('$.a', 'int'), 1)
+
+    def test_with_nulls(self):
+        arr = GenericVariant.to_arrow_array([GenericVariant.from_json('42'), None])
+        self.assertEqual(len(arr), 2)
+        self.assertTrue(arr[0].is_valid)
+        self.assertTrue(arr[1].as_py() is None)
+
+    def test_empty(self):
+        arr = GenericVariant.to_arrow_array([])
+        self.assertEqual(len(arr), 0)
+
+
+class TestJavaCompatibility(unittest.TestCase):
+    """Verify byte-level compatibility with Paimon Java's GenericVariant encoding.
+
+    These test vectors were produced by calling GenericVariant.fromJson(json).value()
+    and GenericVariant.fromJson(json).metadata() in Java unit tests.
+    """
+
+    def test_null_encoding(self):
+        v = GenericVariant.from_json('null')
+        # Java null: value=[0x00], metadata=[0x01, 0x00, 0x00]
+        self.assertEqual(v.value(), bytes([0x00]))
+
+    def test_true_encoding(self):
+        v = GenericVariant.from_json('true')
+        # Java true: value=[0x04], metadata=[0x01, 0x00, 0x00].
+        # _primitive_header(TRUE) = (TRUE << 2) | PRIMITIVE = (1 << 2) | 0 = 0x04,
+        # i.e. a single header byte with no payload bytes.
+        self.assertEqual(v.value()[0], (_TRUE << 2) | _PRIMITIVE)
+        self.assertTrue(v.get_boolean())
+
+    def test_int1_encoding(self):
+        v = GenericVariant.from_json('1')
+        # INT1=3 → header=(3<<2)|0=0x0C, then value byte 0x01
+        self.assertEqual(v.value()[0], (_INT1 << 2) | _PRIMITIVE)
+        self.assertEqual(v.value()[1], 1)
+
+    def test_string_short_encoding(self):
+        v = GenericVariant.from_json('"hi"')
+        # SHORT_STR=1, len=2 → header=(2<<2)|1=0x09
+        self.assertEqual(v.value()[0], (2 << 2) | _SHORT_STR)
+        self.assertEqual(v.value()[1:3], b'hi')
+
+    def test_object_field_order(self):
+        """Objects must store fields sorted alphabetically by key."""
+        v = GenericVariant.from_json('{"z":1,"a":2}')
+        # field at index 0 should be 'a' (alphabetically first)
+        key0, child0 = v.get_field_at_index(0)
+        self.assertEqual(key0, 'a')
+        self.assertEqual(child0.get_long(), 2)
+
+    def test_empty_object(self):
+        v = GenericVariant.from_json('{}')
+        self.assertEqual(v.get_type(), Type.OBJECT)
+        self.assertEqual(v.object_size(), 0)
+        self.assertEqual(v.to_python(), {})
+
+    def test_empty_array(self):
+        v = GenericVariant.from_json('[]')
+        self.assertEqual(v.get_type(), Type.ARRAY)
+        self.assertEqual(v.array_size(), 0)
+        self.assertEqual(v.to_python(), [])
+
+
+class TestComplexRoundtrip(unittest.TestCase):
+
+    def _check(self, json_str):
+        result = _roundtrip(json_str)
+        expected = json.loads(json_str)
+        self.assertEqual(result, expected)
+
+    def test_nested_object_array(self):
+        self._check('{"users":[{"name":"Alice","age":30},{"name":"Bob","age":25}]}')
+
+    def test_deep_nesting(self):
+        self._check('{"a":{"b":{"c":{"d":42}}}}')
+
+    def test_array_of_objects(self):
+        self._check('[{"x":1},{"x":2},{"x":3}]')
+
+    def test_all_primitive_types(self):
+        self._check('{"n":null,"b":true,"i":42,"s":"hello","f":1.5}')
+
+    def test_large_object(self):
+        """Object with more than BINARY_SEARCH_THRESHOLD fields."""
+        obj = 
{f'key{i:03d}': i for i in range(50)} + json_str = json.dumps(obj) + v = GenericVariant.from_json(json_str) + # Verify a few fields + self.assertEqual(v.get_field_by_key('key000').get_long(), 0) + self.assertEqual(v.get_field_by_key('key049').get_long(), 49) + + +# Import for Java encoding constants check +from pypaimon.data.generic_variant import _TRUE, _INT1, _SHORT_STR, _PRIMITIVE # noqa: E402 + + +if __name__ == '__main__': + unittest.main() diff --git a/paimon-python/pypaimon/tests/variant_test.py b/paimon-python/pypaimon/tests/variant_test.py new file mode 100644 index 000000000000..02aa71b3be59 --- /dev/null +++ b/paimon-python/pypaimon/tests/variant_test.py @@ -0,0 +1,409 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +################################################################################ + +"""Tests for VARIANT data type support in pypaimon. 
+ +VARIANT is stored in Parquet as a struct with two non-nullable BINARY fields:: + + required group { + required binary value; // encoded variant payload + required binary metadata; // key-dictionary for object field names + } + +PyArrow reads this group transparently as ``pa.struct``; no special reader is +needed. These tests verify the schema-mapping round-trip and the Parquet +read/write cycle. +""" + +import io +import tempfile +import unittest + +import pyarrow as pa +import pyarrow.parquet as pq + +from pypaimon.schema.data_types import ( + AtomicType, + DataField, + DataTypeParser, + PyarrowFieldParser, + RowType, + _is_variant_struct, +) +from pypaimon.table.row.generic_row import GenericRowDeserializer, GenericRowSerializer + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _variant_arrow_type() -> pa.StructType: + """The canonical Arrow representation of a VARIANT column.""" + return pa.struct([ + pa.field('value', pa.binary(), nullable=False), + pa.field('metadata', pa.binary(), nullable=False), + ]) + + +def _make_variant_bytes(json_str: str) -> bytes: + """Produce a minimal Paimon-compatible VARIANT value payload. + + This is not a full Variant binary-spec encoder; it encodes the JSON string + as a UTF-8 string primitive (type byte 0x15 = string) so that the bytes + are structurally valid and round-trip as the same raw bytes. + + Encoding layout (Paimon/Parquet Variant spec v1): + - header byte: 0x15 (primitive, type=string) + - 4-byte little-endian length + - UTF-8 string bytes + """ + import struct + payload = json_str.encode('utf-8') + return struct.pack(' bytes: + """Minimal Parquet Variant metadata: version=1, zero dictionary entries.""" + return b'\x01\x00' + + +# --------------------------------------------------------------------------- +# 1. 
Schema parsing +# --------------------------------------------------------------------------- + +class TestVariantSchemaParsing(unittest.TestCase): + + def test_parse_variant_keyword(self): + """DataTypeParser accepts the VARIANT keyword.""" + dt = DataTypeParser.parse_atomic_type_sql_string('VARIANT') + self.assertIsInstance(dt, AtomicType) + self.assertEqual(dt.type, 'VARIANT') + self.assertTrue(dt.nullable) + + def test_parse_variant_not_null(self): + """DataTypeParser accepts VARIANT NOT NULL.""" + dt = DataTypeParser.parse_atomic_type_sql_string('VARIANT NOT NULL') + self.assertIsInstance(dt, AtomicType) + self.assertFalse(dt.nullable) + + def test_variant_to_dict_roundtrip(self): + """AtomicType('VARIANT') survives a to_dict / from_dict round-trip.""" + dt = AtomicType('VARIANT') + serialised = dt.to_dict() + restored = DataTypeParser.parse_data_type(serialised) + self.assertEqual(dt, restored) + + def test_variant_str(self): + """str() representation is 'VARIANT'.""" + self.assertEqual(str(AtomicType('VARIANT')), 'VARIANT') + self.assertEqual(str(AtomicType('VARIANT', nullable=False)), 'VARIANT NOT NULL') + + +# --------------------------------------------------------------------------- +# 2. 
Arrow type mapping — Paimon → Arrow +# --------------------------------------------------------------------------- + +class TestVariantFromPaimonType(unittest.TestCase): + + def test_from_paimon_type_returns_struct(self): + """VARIANT maps to a two-field BINARY struct.""" + arrow_type = PyarrowFieldParser.from_paimon_type(AtomicType('VARIANT')) + self.assertTrue(pa.types.is_struct(arrow_type)) + self.assertEqual(arrow_type.num_fields, 2) + + def test_struct_field_names(self): + arrow_type = PyarrowFieldParser.from_paimon_type(AtomicType('VARIANT')) + self.assertEqual(arrow_type.field(0).name, 'value') + self.assertEqual(arrow_type.field(1).name, 'metadata') + + def test_struct_field_types(self): + arrow_type = PyarrowFieldParser.from_paimon_type(AtomicType('VARIANT')) + self.assertTrue(pa.types.is_binary(arrow_type.field(0).type)) + self.assertTrue(pa.types.is_binary(arrow_type.field(1).type)) + + def test_struct_fields_not_nullable(self): + arrow_type = PyarrowFieldParser.from_paimon_type(AtomicType('VARIANT')) + self.assertFalse(arrow_type.field(0).nullable) + self.assertFalse(arrow_type.field(1).nullable) + + def test_from_paimon_field(self): + """from_paimon_field wraps the type in a pa.Field with correct nullability.""" + df = DataField(id=0, name='payload', type=AtomicType('VARIANT')) + pa_field = PyarrowFieldParser.from_paimon_field(df) + self.assertEqual(pa_field.name, 'payload') + self.assertTrue(pa.types.is_struct(pa_field.type)) + # The outer field is nullable (VARIANT default is nullable) + self.assertTrue(pa_field.nullable) + + def test_from_paimon_schema(self): + """from_paimon_schema produces correct Arrow schema for a mixed table.""" + fields = [ + DataField(id=0, name='id', type=AtomicType('BIGINT')), + DataField(id=1, name='payload', type=AtomicType('VARIANT')), + ] + schema = PyarrowFieldParser.from_paimon_schema(fields) + self.assertEqual(schema.field('payload').type, _variant_arrow_type()) + + +# 
--------------------------------------------------------------------------- +# 3. Arrow type mapping — Arrow → Paimon (_is_variant_struct + to_paimon_type) +# --------------------------------------------------------------------------- + +class TestVariantToPaimonType(unittest.TestCase): + + def test_is_variant_struct_positive(self): + """_is_variant_struct recognises the canonical VARIANT struct.""" + self.assertTrue(_is_variant_struct(_variant_arrow_type())) + + def test_is_variant_struct_wrong_names(self): + """A struct with wrong field names is NOT recognised as VARIANT.""" + st = pa.struct([ + pa.field('val', pa.binary(), nullable=False), + pa.field('meta', pa.binary(), nullable=False), + ]) + self.assertFalse(_is_variant_struct(st)) + + def test_is_variant_struct_nullable_fields(self): + """A struct with nullable fields is NOT recognised as VARIANT.""" + st = pa.struct([ + pa.field('value', pa.binary(), nullable=True), + pa.field('metadata', pa.binary(), nullable=False), + ]) + self.assertFalse(_is_variant_struct(st)) + + def test_is_variant_struct_wrong_types(self): + """A struct with non-binary field types is NOT recognised as VARIANT.""" + st = pa.struct([ + pa.field('value', pa.string(), nullable=False), + pa.field('metadata', pa.binary(), nullable=False), + ]) + self.assertFalse(_is_variant_struct(st)) + + def test_is_variant_struct_extra_fields(self): + """A struct with more than 2 fields (shredded variant) is NOT auto-recognised.""" + st = pa.struct([ + pa.field('value', pa.binary(), nullable=False), + pa.field('metadata', pa.binary(), nullable=False), + pa.field('typed_value', pa.int64(), nullable=True), + ]) + self.assertFalse(_is_variant_struct(st)) + + def test_to_paimon_type_variant(self): + """to_paimon_type converts the canonical VARIANT struct back to VARIANT.""" + result = PyarrowFieldParser.to_paimon_type(_variant_arrow_type(), nullable=True) + self.assertIsInstance(result, AtomicType) + self.assertEqual(result.type, 'VARIANT') + 
self.assertTrue(result.nullable) + + def test_to_paimon_type_variant_not_null(self): + result = PyarrowFieldParser.to_paimon_type(_variant_arrow_type(), nullable=False) + self.assertFalse(result.nullable) + + def test_ordinary_struct_not_confused_with_variant(self): + """A normal ROW struct with non-VARIANT fields maps to RowType, not VARIANT.""" + st = pa.struct([ + pa.field('a', pa.int32()), + pa.field('b', pa.string()), + ]) + result = PyarrowFieldParser.to_paimon_type(st, nullable=True) + self.assertIsInstance(result, RowType) + + def test_struct_same_names_but_different_types_is_rowtype(self): + """A struct named value/metadata but with non-binary types maps to RowType.""" + st = pa.struct([ + pa.field('value', pa.string(), nullable=False), + pa.field('metadata', pa.string(), nullable=False), + ]) + result = PyarrowFieldParser.to_paimon_type(st, nullable=True) + self.assertIsInstance(result, RowType) + + +# --------------------------------------------------------------------------- +# 4. 
Full schema round-trip +# --------------------------------------------------------------------------- + +class TestVariantSchemaRoundTrip(unittest.TestCase): + + def test_paimon_to_arrow_to_paimon(self): + """VARIANT field survives a full Paimon → Arrow → Paimon round-trip.""" + original = DataField(id=0, name='v', type=AtomicType('VARIANT')) + pa_field = PyarrowFieldParser.from_paimon_field(original) + restored_type = PyarrowFieldParser.to_paimon_type(pa_field.type, pa_field.nullable) + self.assertIsInstance(restored_type, AtomicType) + self.assertEqual(restored_type.type, 'VARIANT') + + def test_mixed_schema_round_trip(self): + """A table schema with VARIANT alongside other types round-trips correctly.""" + original_fields = [ + DataField(id=0, name='id', type=AtomicType('BIGINT')), + DataField(id=1, name='payload', type=AtomicType('VARIANT')), + DataField(id=2, name='ts', type=AtomicType('TIMESTAMP(6)')), + ] + pa_schema = PyarrowFieldParser.from_paimon_schema(original_fields) + restored_fields = PyarrowFieldParser.to_paimon_schema(pa_schema) + + self.assertEqual(restored_fields[0].name, 'id') + self.assertEqual(restored_fields[1].name, 'payload') + self.assertIsInstance(restored_fields[1].type, AtomicType) + self.assertEqual(restored_fields[1].type.type, 'VARIANT') + self.assertEqual(restored_fields[2].name, 'ts') + + +# --------------------------------------------------------------------------- +# 5. Parquet read/write cycle +# --------------------------------------------------------------------------- + +class TestVariantParquetCycle(unittest.TestCase): + """Verify that VARIANT columns survive a Parquet write → read cycle. + + PyArrow writes the struct-of-binary as a Parquet GROUP, which matches the + layout produced by Paimon Java. On read, PyArrow reconstructs the struct + transparently — no custom reader is required. 
+ """ + + def _make_table(self) -> pa.Table: + schema = pa.schema([ + pa.field('id', pa.int64()), + pa.field('payload', _variant_arrow_type()), + ]) + value1 = _make_variant_bytes('{"key": "hello"}') + value2 = _make_variant_bytes('42') + meta = _make_metadata() + payload_col = pa.array( + [{'value': value1, 'metadata': meta}, + {'value': value2, 'metadata': meta}], + type=_variant_arrow_type(), + ) + return pa.table( + {'id': pa.array([1, 2], type=pa.int64()), 'payload': payload_col}, + schema=schema, + ) + + def test_write_and_read_parquet(self): + """VARIANT struct column survives Parquet write → read.""" + original = self._make_table() + buf = io.BytesIO() + pq.write_table(original, buf) + buf.seek(0) + restored = pq.read_table(buf) + + self.assertEqual(restored.schema.field('payload').type, _variant_arrow_type()) + self.assertEqual(restored.num_rows, 2) + + def test_variant_values_preserved(self): + """The raw value and metadata bytes are preserved across Parquet round-trip.""" + original = self._make_table() + buf = io.BytesIO() + pq.write_table(original, buf) + buf.seek(0) + restored = pq.read_table(buf) + + payload_col = restored.column('payload') + row0 = payload_col[0].as_py() + self.assertIn('value', row0) + self.assertIn('metadata', row0) + self.assertEqual(row0['value'], _make_variant_bytes('{"key": "hello"}')) + self.assertEqual(row0['metadata'], _make_metadata()) + + def test_null_variant_row(self): + """A NULL VARIANT value is handled correctly.""" + schema = pa.schema([ + pa.field('id', pa.int64()), + pa.field('payload', _variant_arrow_type()), + ]) + payload_col = pa.array( + [None, {'value': _make_variant_bytes('true'), 'metadata': _make_metadata()}], + type=_variant_arrow_type(), + ) + table = pa.table({'id': [1, 2], 'payload': payload_col}, schema=schema) + buf = io.BytesIO() + pq.write_table(table, buf) + buf.seek(0) + restored = pq.read_table(buf) + self.assertIsNone(restored.column('payload')[0].as_py()) + 
self.assertIsNotNone(restored.column('payload')[1].as_py()) + + def test_write_to_file(self): + """VARIANT table can be written to and read from a real file path.""" + original = self._make_table() + with tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as f: + path = f.name + pq.write_table(original, path) + restored = pq.read_table(path) + self.assertEqual(restored.num_rows, 2) + import os + os.unlink(path) + + +# --------------------------------------------------------------------------- +# 6. BinaryRow serializer / deserializer (safety-net paths) +# --------------------------------------------------------------------------- + +class TestVariantBinaryRow(unittest.TestCase): + """The BinaryRow path for VARIANT is a safety net; VARIANT is never a key. + + We verify that the code does not silently corrupt data or raise unexpected + errors. The deserializer returns {'value': bytes, 'metadata': bytes}; + the serializer encodes the value payload as a variable-length binary field. 
+ """ + + def _make_field(self) -> DataField: + return DataField(id=0, name='v', type=AtomicType('VARIANT')) + + def test_serialize_variant_dict(self): + """Serializing a VARIANT dict does not raise.""" + from pypaimon.table.row.generic_row import GenericRow + field = self._make_field() + value = {'value': b'\x15\x05hello', 'metadata': b'\x01\x00'} + row = GenericRow([value], [field]) + serialized = GenericRowSerializer.to_bytes(row) + self.assertIsInstance(serialized, bytes) + self.assertGreater(len(serialized), 0) + + def test_serialize_null_variant(self): + """A NULL VARIANT value serializes to the null-bit representation.""" + from pypaimon.table.row.generic_row import GenericRow + field = self._make_field() + row = GenericRow([None], [field]) + serialized = GenericRowSerializer.to_bytes(row) + self.assertIsInstance(serialized, bytes) + + def test_deserialize_produces_dict(self): + """Deserializing a serialized VARIANT row returns a dict with 'value' key.""" + from pypaimon.table.row.generic_row import GenericRow + field = self._make_field() + value = {'value': b'\x15\x05hello', 'metadata': b'\x01\x00'} + row = GenericRow([value], [field]) + serialized = GenericRowSerializer.to_bytes(row) + restored = GenericRowDeserializer.from_bytes(serialized, [field]) + result = restored.values[0] + self.assertIsInstance(result, dict) + self.assertIn('value', result) + + def test_serialize_bytes_fallback(self): + """Serializing raw bytes (not a dict) as VARIANT does not raise.""" + from pypaimon.table.row.generic_row import GenericRow + field = self._make_field() + row = GenericRow([b'\x15\x05hello'], [field]) + serialized = GenericRowSerializer.to_bytes(row) + self.assertIsInstance(serialized, bytes) + + +if __name__ == '__main__': + unittest.main() diff --git a/paimon-python/pypaimon/write/writer/data_blob_writer.py b/paimon-python/pypaimon/write/writer/data_blob_writer.py index 62cbd013ece5..04742ac7b7c8 100644 --- 
a/paimon-python/pypaimon/write/writer/data_blob_writer.py +++ b/paimon-python/pypaimon/write/writer/data_blob_writer.py @@ -28,6 +28,7 @@ from pypaimon.manifest.schema.data_file_meta import DataFileMeta from pypaimon.manifest.schema.simple_stats import SimpleStats from pypaimon.table.row.generic_row import GenericRow +from pypaimon.schema.data_types import _is_variant_struct from pypaimon.write.writer.data_writer import DataWriter logger = logging.getLogger(__name__) @@ -307,6 +308,14 @@ def _write_normal_data_to_file(self, data: pa.Table) -> Optional[DataFileMeta]: file_name = f"{CoreOptions.data_file_prefix(self.options)}{uuid.uuid4()}-0.{self.file_format}" file_path = self._generate_file_path(file_name) + # Reject VARIANT columns for ORC and Avro formats (matching Java behavior) + if self.file_format in (CoreOptions.FILE_FORMAT_ORC, CoreOptions.FILE_FORMAT_AVRO): + for field in data.schema: + if pa.types.is_struct(field.type) and _is_variant_struct(field.type): + raise NotImplementedError( + f"VARIANT type is not supported for {self.file_format} format" + ) + # Write file based on format if self.file_format == CoreOptions.FILE_FORMAT_PARQUET: self.file_io.write_parquet(file_path, data, compression=self.compression, zstd_level=self.zstd_level) diff --git a/paimon-python/pypaimon/write/writer/data_writer.py b/paimon-python/pypaimon/write/writer/data_writer.py index 725a1fb230de..a3f35600f14d 100644 --- a/paimon-python/pypaimon/write/writer/data_writer.py +++ b/paimon-python/pypaimon/write/writer/data_writer.py @@ -26,7 +26,7 @@ from pypaimon.data.timestamp import Timestamp from pypaimon.manifest.schema.data_file_meta import DataFileMeta from pypaimon.manifest.schema.simple_stats import SimpleStats -from pypaimon.schema.data_types import PyarrowFieldParser +from pypaimon.schema.data_types import PyarrowFieldParser, _is_variant_struct from pypaimon.table.bucket_mode import BucketMode from pypaimon.table.row.generic_row import GenericRow @@ -167,6 +167,14 @@ def 
_write_data_to_file(self, data: pa.Table): else: external_path_str = None + # Reject VARIANT columns for ORC and Avro formats (matching Java behavior) + if self.file_format in (CoreOptions.FILE_FORMAT_ORC, CoreOptions.FILE_FORMAT_AVRO): + for field in data.schema: + if pa.types.is_struct(field.type) and _is_variant_struct(field.type): + raise NotImplementedError( + f"VARIANT type is not supported for {self.file_format} format" + ) + if self.file_format == CoreOptions.FILE_FORMAT_PARQUET: self.file_io.write_parquet(file_path, data, compression=self.compression, zstd_level=self.zstd_level) elif self.file_format == CoreOptions.FILE_FORMAT_ORC: From 963e3df39559df9539a0a2a124d2933320708068 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Mon, 13 Apr 2026 15:20:54 +0800 Subject: [PATCH 02/10] support VARIANT for pypaimon --- .../pypaimon/data/generic_variant.py | 71 +++++++++++-------- paimon-python/pypaimon/schema/data_types.py | 4 +- .../pypaimon/table/row/generic_row.py | 30 +------- paimon-python/pypaimon/tests/variant_test.py | 26 +++---- .../pypaimon/write/writer/data_blob_writer.py | 9 +-- .../pypaimon/write/writer/data_writer.py | 19 ++--- 6 files changed, 71 insertions(+), 88 deletions(-) diff --git a/paimon-python/pypaimon/data/generic_variant.py b/paimon-python/pypaimon/data/generic_variant.py index 0f269842c605..7e7a9c3b5fc0 100644 --- a/paimon-python/pypaimon/data/generic_variant.py +++ b/paimon-python/pypaimon/data/generic_variant.py @@ -86,6 +86,7 @@ _EPOCH_DT_NTZ = datetime.datetime(1970, 1, 1) + class Type(enum.Enum): """High-level variant value types (many-to-one from wire types).""" OBJECT = 'OBJECT' @@ -104,6 +105,35 @@ class Type(enum.Enum): UUID = 'UUID' +# Populate module-level lookup tables now that Type is defined. 
+_PRIMITIVE_TYPE_MAP = { + _NULL: Type.NULL, + _TRUE: Type.BOOLEAN, _FALSE: Type.BOOLEAN, + _INT1: Type.LONG, _INT2: Type.LONG, _INT4: Type.LONG, _INT8: Type.LONG, + _DOUBLE: Type.DOUBLE, + _DECIMAL4: Type.DECIMAL, _DECIMAL8: Type.DECIMAL, _DECIMAL16: Type.DECIMAL, + _DATE: Type.DATE, + _TIMESTAMP: Type.TIMESTAMP, + _TIMESTAMP_NTZ: Type.TIMESTAMP_NTZ, + _FLOAT: Type.FLOAT, + _BINARY: Type.BINARY, + _LONG_STR: Type.STRING, + _UUID: Type.UUID, +} +_PRIMITIVE_FIXED_SIZES = { + _NULL: 1, _TRUE: 1, _FALSE: 1, + _INT1: 2, _INT2: 3, _INT4: 5, _INT8: 9, + _DOUBLE: 9, _FLOAT: 5, _DATE: 5, + _TIMESTAMP: 9, _TIMESTAMP_NTZ: 9, + _DECIMAL4: 6, _DECIMAL8: 10, _DECIMAL16: 18, + _UUID: 17, +} +_LONG_FAMILY_SIZES = { + _INT1: 1, _INT2: 2, _INT4: 4, _INT8: 8, + _DATE: 4, _TIMESTAMP: 8, _TIMESTAMP_NTZ: 8, +} + + # --------------------------------------------------------------------------- # Low-level binary utilities # --------------------------------------------------------------------------- @@ -171,21 +201,7 @@ def _get_type(value, pos): if basic_type == _ARRAY: return Type.ARRAY # PRIMITIVE - _MAP = { - _NULL: Type.NULL, - _TRUE: Type.BOOLEAN, _FALSE: Type.BOOLEAN, - _INT1: Type.LONG, _INT2: Type.LONG, _INT4: Type.LONG, _INT8: Type.LONG, - _DOUBLE: Type.DOUBLE, - _DECIMAL4: Type.DECIMAL, _DECIMAL8: Type.DECIMAL, _DECIMAL16: Type.DECIMAL, - _DATE: Type.DATE, - _TIMESTAMP: Type.TIMESTAMP, - _TIMESTAMP_NTZ: Type.TIMESTAMP_NTZ, - _FLOAT: Type.FLOAT, - _BINARY: Type.BINARY, - _LONG_STR: Type.STRING, - _UUID: Type.UUID, - } - t = _MAP.get(type_info) + t = _PRIMITIVE_TYPE_MAP.get(type_info) if t is None: raise ValueError(f'Unknown primitive variant type id: {type_info}') return t @@ -215,16 +231,9 @@ def _value_size(value, pos): ) ) # PRIMITIVE - _FIXED = { - _NULL: 1, _TRUE: 1, _FALSE: 1, - _INT1: 2, _INT2: 3, _INT4: 5, _INT8: 9, - _DOUBLE: 9, _FLOAT: 5, _DATE: 5, - _TIMESTAMP: 9, _TIMESTAMP_NTZ: 9, - _DECIMAL4: 6, _DECIMAL8: 10, _DECIMAL16: 18, - _UUID: 17, - } - if type_info in 
_FIXED: - return _FIXED[type_info] + size = _PRIMITIVE_FIXED_SIZES.get(type_info) + if size is not None: + return size if type_info in (_BINARY, _LONG_STR): return 1 + _U32_SIZE + _read_unsigned(value, pos + 1, _U32_SIZE) raise ValueError(f'Unknown primitive type id: {type_info}') @@ -478,6 +487,14 @@ def append_decimal(self, d): d = d.normalize() # Compute unscaled integer and scale sign, digits, exponent = d.as_tuple() + if exponent > 0: + # e.g. Decimal('1E+2') — the mantissa alone does not represent the true value. + # Callers should use append_double() for such values; _try_decimal_or_double + # handles this automatically when encoding from build_python(). + raise ValueError( + f'append_decimal requires a non-positive exponent (got {d!r}); ' + 'use append_double() for Decimal values with positive exponents' + ) unscaled = int(''.join(str(x) for x in digits)) if sign: unscaled = -unscaled @@ -802,9 +819,7 @@ def get_long(self) -> int: type_info = (b >> 2) & 0x3F if (b & 0x3) != _PRIMITIVE: raise TypeError('Expected integer/date/timestamp variant') - sizes = {_INT1: 1, _INT2: 2, _INT4: 4, _INT8: 8, - _DATE: 4, _TIMESTAMP: 8, _TIMESTAMP_NTZ: 8} - n = sizes.get(type_info) + n = _LONG_FAMILY_SIZES.get(type_info) if n is None: raise TypeError(f'Expected LONG-family variant, got type_info={type_info}') return _read_signed(self._value, self._pos + 1, n) diff --git a/paimon-python/pypaimon/schema/data_types.py b/paimon-python/pypaimon/schema/data_types.py index 3ec76e7c0ec7..debecf8d2564 100755 --- a/paimon-python/pypaimon/schema/data_types.py +++ b/paimon-python/pypaimon/schema/data_types.py @@ -454,7 +454,7 @@ def parse_data_field( ) -def _is_variant_struct(pa_type: pyarrow.StructType) -> bool: +def is_variant_struct(pa_type: pyarrow.StructType) -> bool: """Return True if *pa_type* is the two-field BINARY struct used to encode VARIANT. 
Paimon Java stores VARIANT as a Parquet GROUP with exactly two non-nullable @@ -624,7 +624,7 @@ def to_paimon_type(pa_type: pyarrow.DataType, nullable: bool) -> DataType: key_type = PyarrowFieldParser.to_paimon_type(pa_type.key_type, nullable) value_type = PyarrowFieldParser.to_paimon_type(pa_type.item_type, nullable) return MapType(nullable, key_type, value_type) - elif types.is_struct(pa_type) and _is_variant_struct(pa_type): + elif types.is_struct(pa_type) and is_variant_struct(pa_type): # Recognise the VARIANT encoding: a struct with exactly two non-nullable # BINARY fields named 'value' and 'metadata'. Must be checked before the # generic struct branch to avoid misclassifying it as a ROW type. diff --git a/paimon-python/pypaimon/table/row/generic_row.py b/paimon-python/pypaimon/table/row/generic_row.py index a25c8e8dd35d..4beeb99e42cd 100644 --- a/paimon-python/pypaimon/table/row/generic_row.py +++ b/paimon-python/pypaimon/table/row/generic_row.py @@ -143,8 +143,6 @@ def parse_field_value( return cls._parse_binary(bytes_data, base_offset, field_offset) elif type_name == 'BLOB': return cls._parse_blob(bytes_data, base_offset, field_offset) - elif type_name == 'VARIANT': - return cls._parse_variant(bytes_data, base_offset, field_offset) elif type_name.startswith('DECIMAL') or type_name.startswith('NUMERIC'): return cls._parse_decimal(bytes_data, base_offset, field_offset, data_type) elif type_name.startswith('TIMESTAMP'): @@ -227,23 +225,6 @@ def _parse_binary(cls, bytes_data: bytes, base_offset: int, field_offset: int) - length = (offset_and_len & cls.HIGHEST_SECOND_TO_EIGHTH_BIT) >> 56 return bytes_data[field_offset:field_offset + length] - @classmethod - def _parse_variant(cls, bytes_data: bytes, base_offset: int, field_offset: int) -> dict: - """Deserialize a VARIANT field from BinaryRow format. - - Returns a dict ``{'value': bytes, 'metadata': bytes}`` that mirrors the - PyArrow struct representation used by :meth:`PyarrowFieldParser.from_paimon_type`. 
- - Note: VARIANT is not a valid primary-key or partition-key type in Paimon, so - this path is only exercised when a VARIANT column appears in an internal - BinaryRow (e.g. a manifest entry), which is an unsupported configuration. - We read the raw binary payload and return a minimal metadata header so that - callers receive a structurally valid object rather than silently corrupt data. - """ - raw = cls._parse_binary(bytes_data, base_offset, field_offset) - # Minimal valid metadata: version=1 (0x01), zero dictionary entries (0x00). - return {'value': raw, 'metadata': b'\x01\x00'} - @classmethod def _parse_blob(cls, bytes_data: bytes, base_offset: int, field_offset: int) -> BlobData: """Parse BLOB data from binary format and return a BlobData instance.""" @@ -321,20 +302,11 @@ def to_bytes(cls, row: Union[GenericRow, BinaryRow]) -> bytes: type_name = field.type.type.upper() if any(type_name.startswith(p) for p in ['CHAR', 'VARCHAR', 'STRING', - 'BINARY', 'VARBINARY', 'BYTES', 'BLOB', - 'VARIANT']): + 'BINARY', 'VARBINARY', 'BYTES', 'BLOB']): if any(type_name.startswith(p) for p in ['CHAR', 'VARCHAR', 'STRING']): value_bytes = str(value).encode('utf-8') elif type_name == 'BLOB': value_bytes = value.to_data() - elif type_name == 'VARIANT': - # Serialize only the 'value' payload. VARIANT is not a valid - # primary-key or partition-key type, so BinaryRow serialization - # of VARIANT is only a safety net for unexpected code paths. 
- if isinstance(value, dict): - value_bytes = bytes(value.get('value', b'')) - else: - value_bytes = bytes(value) else: value_bytes = bytes(value) diff --git a/paimon-python/pypaimon/tests/variant_test.py b/paimon-python/pypaimon/tests/variant_test.py index 02aa71b3be59..6aa1ec4cc10f 100644 --- a/paimon-python/pypaimon/tests/variant_test.py +++ b/paimon-python/pypaimon/tests/variant_test.py @@ -44,7 +44,7 @@ DataTypeParser, PyarrowFieldParser, RowType, - _is_variant_struct, + is_variant_struct, ) from pypaimon.table.row.generic_row import GenericRowDeserializer, GenericRowSerializer @@ -162,47 +162,47 @@ def test_from_paimon_schema(self): # --------------------------------------------------------------------------- -# 3. Arrow type mapping — Arrow → Paimon (_is_variant_struct + to_paimon_type) +# 3. Arrow type mapping — Arrow → Paimon (is_variant_struct + to_paimon_type) # --------------------------------------------------------------------------- class TestVariantToPaimonType(unittest.TestCase): - def test_is_variant_struct_positive(self): - """_is_variant_struct recognises the canonical VARIANT struct.""" - self.assertTrue(_is_variant_struct(_variant_arrow_type())) + def testis_variant_struct_positive(self): + """is_variant_struct recognises the canonical VARIANT struct.""" + self.assertTrue(is_variant_struct(_variant_arrow_type())) - def test_is_variant_struct_wrong_names(self): + def testis_variant_struct_wrong_names(self): """A struct with wrong field names is NOT recognised as VARIANT.""" st = pa.struct([ pa.field('val', pa.binary(), nullable=False), pa.field('meta', pa.binary(), nullable=False), ]) - self.assertFalse(_is_variant_struct(st)) + self.assertFalse(is_variant_struct(st)) - def test_is_variant_struct_nullable_fields(self): + def testis_variant_struct_nullable_fields(self): """A struct with nullable fields is NOT recognised as VARIANT.""" st = pa.struct([ pa.field('value', pa.binary(), nullable=True), pa.field('metadata', pa.binary(), 
nullable=False), ]) - self.assertFalse(_is_variant_struct(st)) + self.assertFalse(is_variant_struct(st)) - def test_is_variant_struct_wrong_types(self): + def testis_variant_struct_wrong_types(self): """A struct with non-binary field types is NOT recognised as VARIANT.""" st = pa.struct([ pa.field('value', pa.string(), nullable=False), pa.field('metadata', pa.binary(), nullable=False), ]) - self.assertFalse(_is_variant_struct(st)) + self.assertFalse(is_variant_struct(st)) - def test_is_variant_struct_extra_fields(self): + def testis_variant_struct_extra_fields(self): """A struct with more than 2 fields (shredded variant) is NOT auto-recognised.""" st = pa.struct([ pa.field('value', pa.binary(), nullable=False), pa.field('metadata', pa.binary(), nullable=False), pa.field('typed_value', pa.int64(), nullable=True), ]) - self.assertFalse(_is_variant_struct(st)) + self.assertFalse(is_variant_struct(st)) def test_to_paimon_type_variant(self): """to_paimon_type converts the canonical VARIANT struct back to VARIANT.""" diff --git a/paimon-python/pypaimon/write/writer/data_blob_writer.py b/paimon-python/pypaimon/write/writer/data_blob_writer.py index 04742ac7b7c8..d170913bd917 100644 --- a/paimon-python/pypaimon/write/writer/data_blob_writer.py +++ b/paimon-python/pypaimon/write/writer/data_blob_writer.py @@ -28,7 +28,6 @@ from pypaimon.manifest.schema.data_file_meta import DataFileMeta from pypaimon.manifest.schema.simple_stats import SimpleStats from pypaimon.table.row.generic_row import GenericRow -from pypaimon.schema.data_types import _is_variant_struct from pypaimon.write.writer.data_writer import DataWriter logger = logging.getLogger(__name__) @@ -308,13 +307,7 @@ def _write_normal_data_to_file(self, data: pa.Table) -> Optional[DataFileMeta]: file_name = f"{CoreOptions.data_file_prefix(self.options)}{uuid.uuid4()}-0.{self.file_format}" file_path = self._generate_file_path(file_name) - # Reject VARIANT columns for ORC and Avro formats (matching Java behavior) - if 
self.file_format in (CoreOptions.FILE_FORMAT_ORC, CoreOptions.FILE_FORMAT_AVRO): - for field in data.schema: - if pa.types.is_struct(field.type) and _is_variant_struct(field.type): - raise NotImplementedError( - f"VARIANT type is not supported for {self.file_format} format" - ) + self._check_no_variant_for_format(data.schema) # Write file based on format if self.file_format == CoreOptions.FILE_FORMAT_PARQUET: diff --git a/paimon-python/pypaimon/write/writer/data_writer.py b/paimon-python/pypaimon/write/writer/data_writer.py index a3f35600f14d..d6744471e537 100644 --- a/paimon-python/pypaimon/write/writer/data_writer.py +++ b/paimon-python/pypaimon/write/writer/data_writer.py @@ -26,7 +26,7 @@ from pypaimon.data.timestamp import Timestamp from pypaimon.manifest.schema.data_file_meta import DataFileMeta from pypaimon.manifest.schema.simple_stats import SimpleStats -from pypaimon.schema.data_types import PyarrowFieldParser, _is_variant_struct +from pypaimon.schema.data_types import PyarrowFieldParser, is_variant_struct from pypaimon.table.bucket_mode import BucketMode from pypaimon.table.row.generic_row import GenericRow @@ -154,6 +154,15 @@ def _check_and_roll_if_needed(self): self._write_data_to_file(data_to_write) self.pending_data = remaining_data + def _check_no_variant_for_format(self, schema: pa.Schema): + """Raise NotImplementedError if any VARIANT column is present for an unsupported format.""" + if self.file_format in (CoreOptions.FILE_FORMAT_ORC, CoreOptions.FILE_FORMAT_AVRO): + for field in schema: + if pa.types.is_struct(field.type) and is_variant_struct(field.type): + raise NotImplementedError( + f"VARIANT type is not supported for {self.file_format} format" + ) + def _write_data_to_file(self, data: pa.Table): if data.num_rows == 0: return @@ -167,13 +176,7 @@ def _write_data_to_file(self, data: pa.Table): else: external_path_str = None - # Reject VARIANT columns for ORC and Avro formats (matching Java behavior) - if self.file_format in 
(CoreOptions.FILE_FORMAT_ORC, CoreOptions.FILE_FORMAT_AVRO): - for field in data.schema: - if pa.types.is_struct(field.type) and _is_variant_struct(field.type): - raise NotImplementedError( - f"VARIANT type is not supported for {self.file_format} format" - ) + self._check_no_variant_for_format(data.schema) if self.file_format == CoreOptions.FILE_FORMAT_PARQUET: self.file_io.write_parquet(file_path, data, compression=self.compression, zstd_level=self.zstd_level) From cea5121ac65de752859b2d9d04bbd1d741a1e2da Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Mon, 13 Apr 2026 15:36:06 +0800 Subject: [PATCH 03/10] support VARIANT for pypaimon --- .../java/org/apache/paimon/JavaPyE2ETest.java | 39 ++ paimon-python/dev/run_mixed_tests.sh | 44 +- .../tests/e2e/java_py_read_write_test.py | 65 +- .../pypaimon/tests/generic_variant_test.py | 405 ------------ paimon-python/pypaimon/tests/variant_test.py | 623 +++++++++++++----- 5 files changed, 534 insertions(+), 642 deletions(-) delete mode 100644 paimon-python/pypaimon/tests/generic_variant_test.py diff --git a/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java b/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java index 66ad1d538d33..db7c0a911236 100644 --- a/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java +++ b/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java @@ -989,6 +989,45 @@ public void testVariantWrite() throws Exception { LOG.info("testVariantWrite: wrote and read back {} VARIANT rows", res.size()); } + /** Read a VARIANT-column table written by Python and verify content (Python→Java E2E). 
*/ + @Test + @EnabledIfSystemProperty(named = "run.e2e.tests", matches = "true") + public void testReadVariantTable() throws Exception { + Identifier identifier = identifier("py_variant_test"); + FileStoreTable table = (FileStoreTable) catalog.getTable(identifier); + List splits = + new ArrayList<>(table.newSnapshotReader().read().dataSplits()); + TableRead read = table.newRead(); + List res = + getResult(read, splits, row -> internalRowToString(row, table.rowType())); + assertThat(res).hasSize(4); + + // Verify the VARIANT column is present in the schema + assertThat(table.rowType().getFieldNames()).contains("payload"); + assertThat(table.rowType().getTypeAt(table.rowType().getFieldIndex("payload"))) + .isEqualTo(DataTypes.VARIANT()); + + // Verify each row's VARIANT payload can be decoded by Java + List splits2 = new ArrayList<>(table.newSnapshotReader().read().dataSplits()); + try (org.apache.paimon.reader.RecordReader reader = + read.createReader(splits2)) { + reader.forEachRemaining( + row -> { + int id = row.getInt(0); + if (id == 4) { + // null payload + assertThat(row.isNullAt(2)).isTrue(); + } else { + assertThat(row.isNullAt(2)).isFalse(); + org.apache.paimon.data.variant.Variant v = + row.getVariant(2); + assertThat(v).isNotNull(); + } + }); + } + LOG.info("testReadVariantTable: Java read {} VARIANT rows written by Python", res.size()); + } + /** Step 1: Write 5 base files for compact conflict test. 
*/ @Test @EnabledIfSystemProperty(named = "run.e2e.tests", matches = "true") diff --git a/paimon-python/dev/run_mixed_tests.sh b/paimon-python/dev/run_mixed_tests.sh index f277ca79e8c7..ef966f4dd2b5 100755 --- a/paimon-python/dev/run_mixed_tests.sh +++ b/paimon-python/dev/run_mixed_tests.sh @@ -362,16 +362,26 @@ run_variant_test() { fi } -run_py_variant_write_test() { - echo -e "${YELLOW}=== Running Python VARIANT Write+Read Test ===${NC}" +run_py_variant_write_java_read_test() { + echo -e "${YELLOW}=== Step 13: Running VARIANT Python-Write Java-Read Test ===${NC}" cd "$PAIMON_PYTHON_DIR" - echo "Running Python test for JavaPyReadWriteTest.test_py_write_read_variant_table..." - if python -m pytest java_py_read_write_test.py::JavaPyReadWriteTest::test_py_write_read_variant_table -v; then - echo -e "${GREEN}✓ Python VARIANT write+read test completed successfully${NC}" + echo "Running Python test for JavaPyReadWriteTest.test_py_write_variant_table..." + if ! python -m pytest java_py_read_write_test.py::JavaPyReadWriteTest::test_py_write_variant_table -v; then + echo -e "${RED}✗ Python VARIANT write test failed${NC}" + return 1 + fi + echo -e "${GREEN}✓ Python VARIANT write test completed successfully${NC}" + + echo "" + + cd "$PROJECT_ROOT" + echo "Running Maven test for JavaPyE2ETest.testReadVariantTable..." 
+ if mvn test -Dtest=org.apache.paimon.JavaPyE2ETest#testReadVariantTable -pl paimon-core -q -Drun.e2e.tests=true; then + echo -e "${GREEN}✓ Java VARIANT read test completed successfully${NC}" return 0 else - echo -e "${RED}✗ Python VARIANT write+read test failed${NC}" + echo -e "${RED}✗ Java VARIANT read test failed${NC}" return 1 fi } @@ -389,8 +399,8 @@ main() { local lumina_vector_result=0 local compact_conflict_result=0 local blob_alter_compact_result=0 - local variant_result=0 - local py_variant_write_result=0 + local java_variant_write_py_read_result=0 + local py_variant_write_java_read_result=0 # Detect Python version PYTHON_VERSION=$(python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "unknown") @@ -489,14 +499,14 @@ main() { # Run VARIANT type test (Java write, Python read) if ! run_variant_test; then - variant_result=1 + java_variant_write_py_read_result=1 fi echo "" - # Run Python VARIANT write+read test (Python only, no Java needed) - if ! run_py_variant_write_test; then - py_variant_write_result=1 + # Run VARIANT Python-write Java-read test + if ! 
run_py_variant_write_java_read_test; then + py_variant_write_java_read_result=1 fi echo "" @@ -569,16 +579,16 @@ main() { echo -e "${RED}✗ Blob Alter+Compact Test (Java Write+Alter+Compact, Python Read): FAILED${NC}" fi - if [[ $variant_result -eq 0 ]]; then + if [[ $java_variant_write_py_read_result -eq 0 ]]; then echo -e "${GREEN}✓ VARIANT Type Test (Java Write, Python Read): PASSED${NC}" else echo -e "${RED}✗ VARIANT Type Test (Java Write, Python Read): FAILED${NC}" fi - if [[ $py_variant_write_result -eq 0 ]]; then - echo -e "${GREEN}✓ Python VARIANT Write+Read Test: PASSED${NC}" + if [[ $py_variant_write_java_read_result -eq 0 ]]; then + echo -e "${GREEN}✓ VARIANT Type Test (Python Write, Java Read): PASSED${NC}" else - echo -e "${RED}✗ Python VARIANT Write+Read Test: FAILED${NC}" + echo -e "${RED}✗ VARIANT Type Test (Python Write, Java Read): FAILED${NC}" fi echo "" @@ -586,7 +596,7 @@ main() { # Clean up warehouse directory after all tests cleanup_warehouse - if [[ $java_write_result -eq 0 && $python_read_result -eq 0 && $python_write_result -eq 0 && $java_read_result -eq 0 && $pk_dv_result -eq 0 && $btree_index_result -eq 0 && $compressed_text_result -eq 0 && $tantivy_fulltext_result -eq 0 && $lumina_vector_result -eq 0 && $compact_conflict_result -eq 0 && $blob_alter_compact_result -eq 0 && $variant_result -eq 0 && $py_variant_write_result -eq 0 ]]; then + if [[ $java_write_result -eq 0 && $python_read_result -eq 0 && $python_write_result -eq 0 && $java_read_result -eq 0 && $pk_dv_result -eq 0 && $btree_index_result -eq 0 && $compressed_text_result -eq 0 && $tantivy_fulltext_result -eq 0 && $lumina_vector_result -eq 0 && $compact_conflict_result -eq 0 && $blob_alter_compact_result -eq 0 && $java_variant_write_py_read_result -eq 0 && $py_variant_write_java_read_result -eq 0 ]]; then echo -e "${GREEN}🎉 All tests passed! 
Java-Python interoperability verified.${NC}" return 0 else diff --git a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py index fd2da0c340ff..d12fe975b020 100644 --- a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py +++ b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py @@ -725,8 +725,15 @@ def test_read_variant_table(self): print(f"test_read_variant_table: verified {result.num_rows} VARIANT rows") - def test_py_write_read_variant_table(self): - """Python-only write+read test for VARIANT columns using GenericVariant.""" + def test_py_write_variant_table(self): + """Write a VARIANT-column table for Java to read back (Python→Java E2E). + + Data written: + id=1 payload={"name":"test","value":42} + id=2 payload=[10,20,30] + id=3 payload="hello" + id=4 payload=null + """ variant_type = pa.struct([ pa.field('value', pa.binary(), nullable=False), pa.field('metadata', pa.binary(), nullable=False), @@ -736,32 +743,24 @@ def test_py_write_read_variant_table(self): ('name', pa.string()), ('payload', variant_type), ]) - - schema = Schema.from_pyarrow_schema( - pa_schema, - options={'bucket': '-1'} - ) + schema = Schema.from_pyarrow_schema(pa_schema, options={'bucket': '-1'}) table_name = 'default.py_variant_test' self.catalog.create_table(table_name, schema, True) table = self.catalog.get_table(table_name) - # Construct GenericVariant objects - gv1 = GenericVariant.from_json('{"name":"test","value":42}') - gv2 = GenericVariant.from_json('[10,20,30]') - gv3 = GenericVariant.from_json('"hello"') - gv4 = GenericVariant.from_json('null') - - # Build the VARIANT column - variant_col = GenericVariant.to_arrow_array([gv1, gv2, gv3, gv4]) - + variant_col = GenericVariant.to_arrow_array([ + GenericVariant.from_json('{"name":"test","value":42}'), + GenericVariant.from_json('[10,20,30]'), + GenericVariant.from_json('"hello"'), + GenericVariant.from_json('null'), + ]) data = pa.table({ 'id': 
pa.array([1, 2, 3, 4], type=pa.int32()), 'name': pa.array(['row1', 'row2', 'row3', 'row4'], type=pa.string()), 'payload': variant_col, }, schema=pa_schema) - # Write write_builder = table.new_batch_write_builder() table_write = write_builder.new_write() table_commit = write_builder.new_commit() @@ -769,35 +768,5 @@ def test_py_write_read_variant_table(self): table_commit.commit(table_write.prepare_commit()) table_write.close() table_commit.close() + print(f"test_py_write_variant_table: wrote 4 VARIANT rows to {table_name}") - # Read back - read_builder = table.new_read_builder() - table_scan = read_builder.new_scan() - table_read = read_builder.new_read() - splits = table_scan.plan().splits() - result = table_read.to_arrow(splits) - - self.assertEqual(result.num_rows, 4) - - # Sort by id for deterministic assertion - result = table_sort_by(result, 'id') - payloads = result.column('payload').to_pylist() - - # Row 1: object - gv = GenericVariant.from_dict(payloads[0]) - self.assertEqual(gv.variant_get('$.name', 'string'), 'test') - self.assertEqual(gv.variant_get('$.value', 'int'), 42) - - # Row 2: array - gv = GenericVariant.from_dict(payloads[1]) - self.assertEqual(gv.to_python(), [10, 20, 30]) - - # Row 3: string - gv = GenericVariant.from_dict(payloads[2]) - self.assertEqual(gv.to_python(), 'hello') - - # Row 4: null - gv = GenericVariant.from_dict(payloads[3]) - self.assertIsNone(gv.to_python()) - - print(f"test_py_write_read_variant_table: verified {result.num_rows} rows") diff --git a/paimon-python/pypaimon/tests/generic_variant_test.py b/paimon-python/pypaimon/tests/generic_variant_test.py deleted file mode 100644 index 283ca44838e6..000000000000 --- a/paimon-python/pypaimon/tests/generic_variant_test.py +++ /dev/null @@ -1,405 +0,0 @@ -################################################################################ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -################################################################################ - -"""Tests for GenericVariant: binary encoding, decoding, to_json, and variant_get.""" - -import decimal -import json -import unittest - -from pypaimon.data.generic_variant import GenericVariant, Type - - -def _roundtrip(json_str): - """Build from JSON, decode back to JSON, and compare as normalised dicts/values.""" - v = GenericVariant.from_json(json_str) - return json.loads(v.to_json()) - - -class TestPrimitives(unittest.TestCase): - - def test_null(self): - v = GenericVariant.from_json('null') - self.assertEqual(v.get_type(), Type.NULL) - self.assertIsNone(v.to_python()) - self.assertEqual(v.to_json(), 'null') - - def test_true(self): - v = GenericVariant.from_json('true') - self.assertEqual(v.get_type(), Type.BOOLEAN) - self.assertTrue(v.get_boolean()) - self.assertEqual(v.to_json(), 'true') - - def test_false(self): - v = GenericVariant.from_json('false') - self.assertFalse(v.get_boolean()) - self.assertEqual(v.to_json(), 'false') - - def test_int_small(self): - v = GenericVariant.from_json('42') - self.assertEqual(v.get_type(), Type.LONG) - self.assertEqual(v.get_long(), 42) - self.assertEqual(v.to_json(), '42') - - def test_int_negative(self): - v = GenericVariant.from_json('-100') - 
self.assertEqual(v.get_long(), -100) - - def test_int_int2_boundary(self): - v = GenericVariant.from_json('1000') - self.assertEqual(v.get_long(), 1000) - - def test_int_int4_boundary(self): - v = GenericVariant.from_json('100000') - self.assertEqual(v.get_long(), 100000) - - def test_int_int8(self): - large = 2 ** 33 - v = GenericVariant.from_json(str(large)) - self.assertEqual(v.get_long(), large) - - def test_float_double(self): - v = GenericVariant.from_json('1.5') - # 1.5 has exact decimal representation so it may be encoded as DECIMAL or DOUBLE - py = v.to_python() - self.assertAlmostEqual(float(py), 1.5) - - def test_float_scientific(self): - v = GenericVariant.from_json('1.5e10') - self.assertEqual(v.get_type(), Type.DOUBLE) - self.assertAlmostEqual(v.get_double(), 1.5e10) - - def test_string_short(self): - v = GenericVariant.from_json('"hello"') - self.assertEqual(v.get_type(), Type.STRING) - self.assertEqual(v.get_string(), 'hello') - self.assertEqual(v.to_json(), '"hello"') - - def test_string_long(self): - long_str = 'x' * 100 # > MAX_SHORT_STR_SIZE (63) - v = GenericVariant.from_json(json.dumps(long_str)) - self.assertEqual(v.get_string(), long_str) - self.assertEqual(v.to_python(), long_str) - - def test_string_unicode(self): - v = GenericVariant.from_json('"北京"') - self.assertEqual(v.get_string(), '北京') - - def test_decimal_precision(self): - v = GenericVariant.from_json('100.99') - # should be encoded as DECIMAL, not DOUBLE - self.assertEqual(v.get_type(), Type.DECIMAL) - self.assertEqual(float(v.get_decimal()), 100.99) - - -class TestObject(unittest.TestCase): - - def _obj(self): - return GenericVariant.from_json('{"age":30,"city":"Beijing","active":true}') - - def test_type(self): - self.assertEqual(self._obj().get_type(), Type.OBJECT) - - def test_object_size(self): - self.assertEqual(self._obj().object_size(), 3) - - def test_get_field_by_key(self): - v = self._obj() - self.assertEqual(v.get_field_by_key('age').get_long(), 30) - 
self.assertEqual(v.get_field_by_key('city').get_string(), 'Beijing') - self.assertTrue(v.get_field_by_key('active').get_boolean()) - - def test_get_field_missing(self): - self.assertIsNone(self._obj().get_field_by_key('missing')) - - def test_get_field_at_index(self): - v = self._obj() - keys = {v.get_field_at_index(i)[0] for i in range(v.object_size())} - self.assertEqual(keys, {'age', 'city', 'active'}) - - def test_to_python(self): - result = self._obj().to_python() - self.assertIsInstance(result, dict) - self.assertEqual(result['age'], 30) - self.assertEqual(result['city'], 'Beijing') - self.assertTrue(result['active']) - - def test_to_json_roundtrip(self): - result = _roundtrip('{"age":30,"city":"Beijing","active":true}') - self.assertEqual(result, {'age': 30, 'city': 'Beijing', 'active': True}) - - def test_fields_sorted_alphabetically(self): - """Variant objects must store fields sorted by key name.""" - v = GenericVariant.from_json('{"z":1,"a":2,"m":3}') - keys = [v.get_field_at_index(i)[0] for i in range(v.object_size())] - self.assertEqual(keys, sorted(keys)) - - def test_nested_object(self): - v = GenericVariant.from_json('{"user":{"name":"Alice","age":25}}') - user = v.get_field_by_key('user') - self.assertEqual(user.get_field_by_key('name').get_string(), 'Alice') - self.assertEqual(user.get_field_by_key('age').get_long(), 25) - - -class TestArray(unittest.TestCase): - - def _arr(self): - return GenericVariant.from_json('[1,2,3]') - - def test_type(self): - self.assertEqual(self._arr().get_type(), Type.ARRAY) - - def test_array_size(self): - self.assertEqual(self._arr().array_size(), 3) - - def test_get_element_at_index(self): - v = self._arr() - self.assertEqual(v.get_element_at_index(0).get_long(), 1) - self.assertEqual(v.get_element_at_index(2).get_long(), 3) - - def test_out_of_bounds(self): - self.assertIsNone(self._arr().get_element_at_index(99)) - - def test_to_python(self): - self.assertEqual(self._arr().to_python(), [1, 2, 3]) - - def 
test_mixed_array(self): - v = GenericVariant.from_json('[1,"two",null,true]') - py = v.to_python() - self.assertEqual(py, [1, 'two', None, True]) - - def test_nested_array(self): - v = GenericVariant.from_json('[[1,2],[3,4]]') - self.assertEqual(v.get_element_at_index(0).to_python(), [1, 2]) - self.assertEqual(v.get_element_at_index(1).to_python(), [3, 4]) - - -class TestVariantGet(unittest.TestCase): - - def setUp(self): - self.v = GenericVariant.from_json( - '{"name":"Alice","age":30,"score":9.5,"active":true,' - '"address":{"city":"Beijing","zip":"100000"},' - '"tags":["python","data"],"balance":1234.56}' - ) - - def test_get_string(self): - self.assertEqual(self.v.variant_get('$.name', 'string'), 'Alice') - - def test_get_int(self): - self.assertEqual(self.v.variant_get('$.age', 'int'), 30) - - def test_get_long(self): - self.assertEqual(self.v.variant_get('$.age', 'long'), 30) - - def test_get_double(self): - self.assertAlmostEqual(self.v.variant_get('$.score', 'double'), 9.5, places=5) - - def test_get_boolean(self): - self.assertTrue(self.v.variant_get('$.active', 'boolean')) - - def test_nested_field(self): - self.assertEqual(self.v.variant_get('$.address.city', 'string'), 'Beijing') - - def test_array_index(self): - self.assertEqual(self.v.variant_get('$.tags[0]', 'string'), 'python') - self.assertEqual(self.v.variant_get('$.tags[1]', 'string'), 'data') - - def test_missing_path_returns_none(self): - self.assertIsNone(self.v.variant_get('$.nonexistent')) - - def test_type_mismatch_returns_none(self): - # $.tags is an array, not a primitive, cast to int should return None - self.assertIsNone(self.v.variant_get('$.tags', 'int')) - - def test_no_cast_returns_python_value(self): - result = self.v.variant_get('$.age') - self.assertEqual(result, 30) - - def test_root_dollar_only(self): - v = GenericVariant.from_json('42') - self.assertEqual(v.variant_get('$', 'int'), 42) - - def test_bracket_key_syntax(self): - self.assertEqual(self.v.variant_get("$['name']", 
'string'), 'Alice') - - def test_decimal_cast(self): - result = self.v.variant_get('$.balance', 'decimal') - self.assertIsInstance(result, decimal.Decimal) - self.assertAlmostEqual(float(result), 1234.56, places=2) - - def test_string_cast_on_int(self): - result = self.v.variant_get('$.age', 'string') - # Should produce JSON representation of the integer - self.assertEqual(result, '30') - - -class TestFromDict(unittest.TestCase): - """Test constructing GenericVariant from PyArrow-style {'value': ..., 'metadata': ...}.""" - - def test_roundtrip_via_dict(self): - original = GenericVariant.from_json('{"x":1,"y":2}') - d = {'value': original.value(), 'metadata': original.metadata()} - restored = GenericVariant.from_dict(d) - self.assertEqual(restored.to_json(), original.to_json()) - - def test_from_dict_type(self): - original = GenericVariant.from_json('[1,2,3]') - restored = GenericVariant.from_dict( - {'value': original.value(), 'metadata': original.metadata()}) - self.assertEqual(restored.get_type(), Type.ARRAY) - self.assertEqual(restored.to_python(), [1, 2, 3]) - - -class TestFromPython(unittest.TestCase): - """Test GenericVariant.from_python().""" - - def test_from_python_dict(self): - v = GenericVariant.from_python({'a': 1, 'b': 'hello'}) - self.assertEqual(v.variant_get('$.a', 'int'), 1) - self.assertEqual(v.variant_get('$.b', 'string'), 'hello') - - def test_from_python_list(self): - v = GenericVariant.from_python([10, 20, 30]) - self.assertEqual(v.to_python(), [10, 20, 30]) - - def test_from_python_none(self): - v = GenericVariant.from_python(None) - self.assertIsNone(v.to_python()) - - def test_from_python_bytes(self): - v = GenericVariant.from_python(b'\x01\x02\x03') - self.assertEqual(v.get_type(), Type.BINARY) - self.assertEqual(v.get_binary(), b'\x01\x02\x03') - - -class TestToArrowArray(unittest.TestCase): - """Test GenericVariant.to_arrow_array().""" - - def test_basic(self): - import pyarrow as pa - gv1 = GenericVariant.from_json('{"a":1}') - gv2 = 
GenericVariant.from_json('[1,2]') - arr = GenericVariant.to_arrow_array([gv1, gv2]) - self.assertIsInstance(arr, pa.StructArray) - self.assertEqual(len(arr), 2) - # Roundtrip check - row0 = arr[0].as_py() - restored = GenericVariant.from_dict(row0) - self.assertEqual(restored.variant_get('$.a', 'int'), 1) - - def test_with_nulls(self): - arr = GenericVariant.to_arrow_array([GenericVariant.from_json('42'), None]) - self.assertEqual(len(arr), 2) - self.assertFalse(arr[0].is_valid is False) - self.assertTrue(arr[1].as_py() is None) - - def test_empty(self): - arr = GenericVariant.to_arrow_array([]) - self.assertEqual(len(arr), 0) - - -class TestJavaCompatibility(unittest.TestCase): - """Verify byte-level compatibility with Paimon Java's GenericVariant encoding. - - These test vectors were produced by calling GenericVariant.fromJson(json).value() - and GenericVariant.fromJson(json).metadata() in Java unit tests. - """ - - def test_null_encoding(self): - v = GenericVariant.from_json('null') - # Java null: value=[0x00], metadata=[0x01, 0x00, 0x00] - self.assertEqual(v.value(), bytes([0x00])) - - def test_true_encoding(self): - v = GenericVariant.from_json('true') - # Java true: value=[0x08] (type_info=TRUE=1, PRIMITIVE=0 → header=(1<<2)|0=0x04... wait - # Actually: TRUE=1, so header = (1 << 2) | PRIMITIVE(0) = 0x04? 
No… - # _primitive_header(TRUE) = (TRUE << 2) | PRIMITIVE = (1 << 2) | 0 = 0x04 - self.assertEqual(v.value()[0], (_TRUE << 2) | _PRIMITIVE) - self.assertTrue(v.get_boolean()) - - def test_int1_encoding(self): - v = GenericVariant.from_json('1') - # INT1=3 → header=(3<<2)|0=0x0C, then value byte 0x01 - self.assertEqual(v.value()[0], (_INT1 << 2) | _PRIMITIVE) - self.assertEqual(v.value()[1], 1) - - def test_string_short_encoding(self): - v = GenericVariant.from_json('"hi"') - # SHORT_STR=1, len=2 → header=(2<<2)|1=0x09 - self.assertEqual(v.value()[0], (2 << 2) | _SHORT_STR) - self.assertEqual(v.value()[1:3], b'hi') - - def test_object_field_order(self): - """Objects must store fields sorted alphabetically by key.""" - v = GenericVariant.from_json('{"z":1,"a":2}') - # field at index 0 should be 'a' (alphabetically first) - key0, child0 = v.get_field_at_index(0) - self.assertEqual(key0, 'a') - self.assertEqual(child0.get_long(), 2) - - def test_empty_object(self): - v = GenericVariant.from_json('{}') - self.assertEqual(v.get_type(), Type.OBJECT) - self.assertEqual(v.object_size(), 0) - self.assertEqual(v.to_python(), {}) - - def test_empty_array(self): - v = GenericVariant.from_json('[]') - self.assertEqual(v.get_type(), Type.ARRAY) - self.assertEqual(v.array_size(), 0) - self.assertEqual(v.to_python(), []) - - -class TestComplexRoundtrip(unittest.TestCase): - - def _check(self, json_str): - result = _roundtrip(json_str) - expected = json.loads(json_str) - self.assertEqual(result, expected) - - def test_nested_object_array(self): - self._check('{"users":[{"name":"Alice","age":30},{"name":"Bob","age":25}]}') - - def test_deep_nesting(self): - self._check('{"a":{"b":{"c":{"d":42}}}}') - - def test_array_of_objects(self): - self._check('[{"x":1},{"x":2},{"x":3}]') - - def test_all_primitive_types(self): - self._check('{"n":null,"b":true,"i":42,"s":"hello","f":1.5}') - - def test_large_object(self): - """Object with more than BINARY_SEARCH_THRESHOLD fields.""" - obj = 
{f'key{i:03d}': i for i in range(50)} - json_str = json.dumps(obj) - v = GenericVariant.from_json(json_str) - # Verify a few fields - self.assertEqual(v.get_field_by_key('key000').get_long(), 0) - self.assertEqual(v.get_field_by_key('key049').get_long(), 49) - - -# Import for Java encoding constants check -from pypaimon.data.generic_variant import _TRUE, _INT1, _SHORT_STR, _PRIMITIVE # noqa: E402 - - -if __name__ == '__main__': - unittest.main() diff --git a/paimon-python/pypaimon/tests/variant_test.py b/paimon-python/pypaimon/tests/variant_test.py index 6aa1ec4cc10f..e66d77c8c162 100644 --- a/paimon-python/pypaimon/tests/variant_test.py +++ b/paimon-python/pypaimon/tests/variant_test.py @@ -9,35 +9,38 @@ # # http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. ################################################################################ -"""Tests for VARIANT data type support in pypaimon. +"""Tests for VARIANT type support in pypaimon. -VARIANT is stored in Parquet as a struct with two non-nullable BINARY fields:: - - required group { - required binary value; // encoded variant payload - required binary metadata; // key-dictionary for object field names - } - -PyArrow reads this group transparently as ``pa.struct``; no special reader is -needed. These tests verify the schema-mapping round-trip and the Parquet -read/write cycle. 
+Covers two layers: + 1. Type-system layer – schema parsing, Paimon↔Arrow type mapping, Parquet I/O. + 2. Encoding layer – GenericVariant binary encoding/decoding, to_json, variant_get. """ +import decimal import io +import json +import struct import tempfile import unittest import pyarrow as pa import pyarrow.parquet as pq +from pypaimon.data.generic_variant import ( + GenericVariant, + Type, + _PRIMITIVE, + _SHORT_STR, + _TRUE, + _INT1, +) from pypaimon.schema.data_types import ( AtomicType, DataField, @@ -62,18 +65,7 @@ def _variant_arrow_type() -> pa.StructType: def _make_variant_bytes(json_str: str) -> bytes: - """Produce a minimal Paimon-compatible VARIANT value payload. - - This is not a full Variant binary-spec encoder; it encodes the JSON string - as a UTF-8 string primitive (type byte 0x15 = string) so that the bytes - are structurally valid and round-trip as the same raw bytes. - - Encoding layout (Paimon/Parquet Variant spec v1): - - header byte: 0x15 (primitive, type=string) - - 4-byte little-endian length - - UTF-8 string bytes - """ - import struct + """Produce a minimal VARIANT value payload encoding a UTF-8 string primitive.""" payload = json_str.encode('utf-8') return struct.pack(' bytes: return b'\x01\x00' -# --------------------------------------------------------------------------- +def _roundtrip(json_str): + """Build a GenericVariant from JSON, decode back to JSON, parse with stdlib json.""" + v = GenericVariant.from_json(json_str) + return json.loads(v.to_json()) + + +# =========================================================================== # 1. 
Schema parsing -# --------------------------------------------------------------------------- +# =========================================================================== class TestVariantSchemaParsing(unittest.TestCase): def test_parse_variant_keyword(self): - """DataTypeParser accepts the VARIANT keyword.""" dt = DataTypeParser.parse_atomic_type_sql_string('VARIANT') self.assertIsInstance(dt, AtomicType) self.assertEqual(dt.type, 'VARIANT') self.assertTrue(dt.nullable) def test_parse_variant_not_null(self): - """DataTypeParser accepts VARIANT NOT NULL.""" dt = DataTypeParser.parse_atomic_type_sql_string('VARIANT NOT NULL') self.assertIsInstance(dt, AtomicType) self.assertFalse(dt.nullable) def test_variant_to_dict_roundtrip(self): - """AtomicType('VARIANT') survives a to_dict / from_dict round-trip.""" dt = AtomicType('VARIANT') - serialised = dt.to_dict() - restored = DataTypeParser.parse_data_type(serialised) + restored = DataTypeParser.parse_data_type(dt.to_dict()) self.assertEqual(dt, restored) def test_variant_str(self): - """str() representation is 'VARIANT'.""" self.assertEqual(str(AtomicType('VARIANT')), 'VARIANT') self.assertEqual(str(AtomicType('VARIANT', nullable=False)), 'VARIANT NOT NULL') -# --------------------------------------------------------------------------- -# 2. Arrow type mapping — Paimon → Arrow -# --------------------------------------------------------------------------- +# =========================================================================== +# 2. 
Arrow type mapping – Paimon → Arrow +# =========================================================================== class TestVariantFromPaimonType(unittest.TestCase): - def test_from_paimon_type_returns_struct(self): - """VARIANT maps to a two-field BINARY struct.""" - arrow_type = PyarrowFieldParser.from_paimon_type(AtomicType('VARIANT')) - self.assertTrue(pa.types.is_struct(arrow_type)) - self.assertEqual(arrow_type.num_fields, 2) + def _arrow_type(self): + return PyarrowFieldParser.from_paimon_type(AtomicType('VARIANT')) + + def test_returns_struct(self): + self.assertTrue(pa.types.is_struct(self._arrow_type())) + self.assertEqual(self._arrow_type().num_fields, 2) - def test_struct_field_names(self): - arrow_type = PyarrowFieldParser.from_paimon_type(AtomicType('VARIANT')) - self.assertEqual(arrow_type.field(0).name, 'value') - self.assertEqual(arrow_type.field(1).name, 'metadata') + def test_field_names(self): + t = self._arrow_type() + self.assertEqual(t.field(0).name, 'value') + self.assertEqual(t.field(1).name, 'metadata') - def test_struct_field_types(self): - arrow_type = PyarrowFieldParser.from_paimon_type(AtomicType('VARIANT')) - self.assertTrue(pa.types.is_binary(arrow_type.field(0).type)) - self.assertTrue(pa.types.is_binary(arrow_type.field(1).type)) + def test_field_types_are_binary(self): + t = self._arrow_type() + self.assertTrue(pa.types.is_binary(t.field(0).type)) + self.assertTrue(pa.types.is_binary(t.field(1).type)) - def test_struct_fields_not_nullable(self): - arrow_type = PyarrowFieldParser.from_paimon_type(AtomicType('VARIANT')) - self.assertFalse(arrow_type.field(0).nullable) - self.assertFalse(arrow_type.field(1).nullable) + def test_fields_not_nullable(self): + t = self._arrow_type() + self.assertFalse(t.field(0).nullable) + self.assertFalse(t.field(1).nullable) def test_from_paimon_field(self): - """from_paimon_field wraps the type in a pa.Field with correct nullability.""" df = DataField(id=0, name='payload', 
type=AtomicType('VARIANT')) pa_field = PyarrowFieldParser.from_paimon_field(df) self.assertEqual(pa_field.name, 'payload') self.assertTrue(pa.types.is_struct(pa_field.type)) - # The outer field is nullable (VARIANT default is nullable) self.assertTrue(pa_field.nullable) def test_from_paimon_schema(self): - """from_paimon_schema produces correct Arrow schema for a mixed table.""" fields = [ DataField(id=0, name='id', type=AtomicType('BIGINT')), DataField(id=1, name='payload', type=AtomicType('VARIANT')), @@ -161,42 +152,37 @@ def test_from_paimon_schema(self): self.assertEqual(schema.field('payload').type, _variant_arrow_type()) -# --------------------------------------------------------------------------- -# 3. Arrow type mapping — Arrow → Paimon (is_variant_struct + to_paimon_type) -# --------------------------------------------------------------------------- +# =========================================================================== +# 3. Arrow type mapping – Arrow → Paimon (is_variant_struct + to_paimon_type) +# =========================================================================== class TestVariantToPaimonType(unittest.TestCase): - def testis_variant_struct_positive(self): - """is_variant_struct recognises the canonical VARIANT struct.""" + def test_is_variant_struct_positive(self): self.assertTrue(is_variant_struct(_variant_arrow_type())) - def testis_variant_struct_wrong_names(self): - """A struct with wrong field names is NOT recognised as VARIANT.""" + def test_is_variant_struct_wrong_names(self): st = pa.struct([ pa.field('val', pa.binary(), nullable=False), pa.field('meta', pa.binary(), nullable=False), ]) self.assertFalse(is_variant_struct(st)) - def testis_variant_struct_nullable_fields(self): - """A struct with nullable fields is NOT recognised as VARIANT.""" + def test_is_variant_struct_nullable_fields(self): st = pa.struct([ pa.field('value', pa.binary(), nullable=True), pa.field('metadata', pa.binary(), nullable=False), ]) 
self.assertFalse(is_variant_struct(st)) - def testis_variant_struct_wrong_types(self): - """A struct with non-binary field types is NOT recognised as VARIANT.""" + def test_is_variant_struct_wrong_types(self): st = pa.struct([ pa.field('value', pa.string(), nullable=False), pa.field('metadata', pa.binary(), nullable=False), ]) self.assertFalse(is_variant_struct(st)) - def testis_variant_struct_extra_fields(self): - """A struct with more than 2 fields (shredded variant) is NOT auto-recognised.""" + def test_is_variant_struct_extra_fields(self): st = pa.struct([ pa.field('value', pa.binary(), nullable=False), pa.field('metadata', pa.binary(), nullable=False), @@ -205,7 +191,6 @@ def testis_variant_struct_extra_fields(self): self.assertFalse(is_variant_struct(st)) def test_to_paimon_type_variant(self): - """to_paimon_type converts the canonical VARIANT struct back to VARIANT.""" result = PyarrowFieldParser.to_paimon_type(_variant_arrow_type(), nullable=True) self.assertIsInstance(result, AtomicType) self.assertEqual(result.type, 'VARIANT') @@ -215,17 +200,12 @@ def test_to_paimon_type_variant_not_null(self): result = PyarrowFieldParser.to_paimon_type(_variant_arrow_type(), nullable=False) self.assertFalse(result.nullable) - def test_ordinary_struct_not_confused_with_variant(self): - """A normal ROW struct with non-VARIANT fields maps to RowType, not VARIANT.""" - st = pa.struct([ - pa.field('a', pa.int32()), - pa.field('b', pa.string()), - ]) + def test_ordinary_struct_maps_to_row_type(self): + st = pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.string())]) result = PyarrowFieldParser.to_paimon_type(st, nullable=True) self.assertIsInstance(result, RowType) - def test_struct_same_names_but_different_types_is_rowtype(self): - """A struct named value/metadata but with non-binary types maps to RowType.""" + def test_struct_same_names_different_types_is_row_type(self): st = pa.struct([ pa.field('value', pa.string(), nullable=False), pa.field('metadata', 
pa.string(), nullable=False), @@ -234,113 +214,85 @@ def test_struct_same_names_but_different_types_is_rowtype(self): self.assertIsInstance(result, RowType) -# --------------------------------------------------------------------------- +# =========================================================================== # 4. Full schema round-trip -# --------------------------------------------------------------------------- +# =========================================================================== class TestVariantSchemaRoundTrip(unittest.TestCase): def test_paimon_to_arrow_to_paimon(self): - """VARIANT field survives a full Paimon → Arrow → Paimon round-trip.""" original = DataField(id=0, name='v', type=AtomicType('VARIANT')) pa_field = PyarrowFieldParser.from_paimon_field(original) - restored_type = PyarrowFieldParser.to_paimon_type(pa_field.type, pa_field.nullable) - self.assertIsInstance(restored_type, AtomicType) - self.assertEqual(restored_type.type, 'VARIANT') + restored = PyarrowFieldParser.to_paimon_type(pa_field.type, pa_field.nullable) + self.assertIsInstance(restored, AtomicType) + self.assertEqual(restored.type, 'VARIANT') def test_mixed_schema_round_trip(self): - """A table schema with VARIANT alongside other types round-trips correctly.""" - original_fields = [ + fields = [ DataField(id=0, name='id', type=AtomicType('BIGINT')), DataField(id=1, name='payload', type=AtomicType('VARIANT')), DataField(id=2, name='ts', type=AtomicType('TIMESTAMP(6)')), ] - pa_schema = PyarrowFieldParser.from_paimon_schema(original_fields) - restored_fields = PyarrowFieldParser.to_paimon_schema(pa_schema) + pa_schema = PyarrowFieldParser.from_paimon_schema(fields) + restored = PyarrowFieldParser.to_paimon_schema(pa_schema) - self.assertEqual(restored_fields[0].name, 'id') - self.assertEqual(restored_fields[1].name, 'payload') - self.assertIsInstance(restored_fields[1].type, AtomicType) - self.assertEqual(restored_fields[1].type.type, 'VARIANT') - 
self.assertEqual(restored_fields[2].name, 'ts') + self.assertEqual(restored[1].name, 'payload') + self.assertIsInstance(restored[1].type, AtomicType) + self.assertEqual(restored[1].type.type, 'VARIANT') + self.assertEqual(restored[2].name, 'ts') -# --------------------------------------------------------------------------- +# =========================================================================== # 5. Parquet read/write cycle -# --------------------------------------------------------------------------- +# =========================================================================== class TestVariantParquetCycle(unittest.TestCase): - """Verify that VARIANT columns survive a Parquet write → read cycle. - - PyArrow writes the struct-of-binary as a Parquet GROUP, which matches the - layout produced by Paimon Java. On read, PyArrow reconstructs the struct - transparently — no custom reader is required. - """ def _make_table(self) -> pa.Table: schema = pa.schema([ pa.field('id', pa.int64()), pa.field('payload', _variant_arrow_type()), ]) - value1 = _make_variant_bytes('{"key": "hello"}') - value2 = _make_variant_bytes('42') meta = _make_metadata() payload_col = pa.array( - [{'value': value1, 'metadata': meta}, - {'value': value2, 'metadata': meta}], + [{'value': _make_variant_bytes('{"key":"hello"}'), 'metadata': meta}, + {'value': _make_variant_bytes('42'), 'metadata': meta}], type=_variant_arrow_type(), ) - return pa.table( - {'id': pa.array([1, 2], type=pa.int64()), 'payload': payload_col}, - schema=schema, - ) + return pa.table({'id': [1, 2], 'payload': payload_col}, schema=schema) def test_write_and_read_parquet(self): - """VARIANT struct column survives Parquet write → read.""" original = self._make_table() buf = io.BytesIO() pq.write_table(original, buf) buf.seek(0) restored = pq.read_table(buf) - self.assertEqual(restored.schema.field('payload').type, _variant_arrow_type()) self.assertEqual(restored.num_rows, 2) def test_variant_values_preserved(self): - 
"""The raw value and metadata bytes are preserved across Parquet round-trip.""" original = self._make_table() buf = io.BytesIO() pq.write_table(original, buf) buf.seek(0) - restored = pq.read_table(buf) - - payload_col = restored.column('payload') - row0 = payload_col[0].as_py() - self.assertIn('value', row0) - self.assertIn('metadata', row0) - self.assertEqual(row0['value'], _make_variant_bytes('{"key": "hello"}')) + row0 = pq.read_table(buf).column('payload')[0].as_py() + self.assertEqual(row0['value'], _make_variant_bytes('{"key":"hello"}')) self.assertEqual(row0['metadata'], _make_metadata()) def test_null_variant_row(self): - """A NULL VARIANT value is handled correctly.""" - schema = pa.schema([ - pa.field('id', pa.int64()), - pa.field('payload', _variant_arrow_type()), - ]) + schema = pa.schema([pa.field('id', pa.int64()), pa.field('payload', _variant_arrow_type())]) payload_col = pa.array( [None, {'value': _make_variant_bytes('true'), 'metadata': _make_metadata()}], type=_variant_arrow_type(), ) - table = pa.table({'id': [1, 2], 'payload': payload_col}, schema=schema) buf = io.BytesIO() - pq.write_table(table, buf) + pq.write_table(pa.table({'id': [1, 2], 'payload': payload_col}, schema=schema), buf) buf.seek(0) restored = pq.read_table(buf) self.assertIsNone(restored.column('payload')[0].as_py()) self.assertIsNotNone(restored.column('payload')[1].as_py()) def test_write_to_file(self): - """VARIANT table can be written to and read from a real file path.""" original = self._make_table() with tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as f: path = f.name @@ -351,58 +303,385 @@ def test_write_to_file(self): os.unlink(path) -# --------------------------------------------------------------------------- -# 6. BinaryRow serializer / deserializer (safety-net paths) -# --------------------------------------------------------------------------- +# =========================================================================== +# 6. 
GenericVariant – primitive types +# =========================================================================== + +class TestPrimitives(unittest.TestCase): + + def test_null(self): + v = GenericVariant.from_json('null') + self.assertEqual(v.get_type(), Type.NULL) + self.assertIsNone(v.to_python()) + self.assertEqual(v.to_json(), 'null') + + def test_true(self): + v = GenericVariant.from_json('true') + self.assertEqual(v.get_type(), Type.BOOLEAN) + self.assertTrue(v.get_boolean()) + self.assertEqual(v.to_json(), 'true') + + def test_false(self): + v = GenericVariant.from_json('false') + self.assertFalse(v.get_boolean()) + self.assertEqual(v.to_json(), 'false') + + def test_int_small(self): + v = GenericVariant.from_json('42') + self.assertEqual(v.get_type(), Type.LONG) + self.assertEqual(v.get_long(), 42) + + def test_int_negative(self): + self.assertEqual(GenericVariant.from_json('-100').get_long(), -100) + + def test_int_int2_boundary(self): + self.assertEqual(GenericVariant.from_json('1000').get_long(), 1000) + + def test_int_int4_boundary(self): + self.assertEqual(GenericVariant.from_json('100000').get_long(), 100000) + + def test_int_int8(self): + large = 2 ** 33 + self.assertEqual(GenericVariant.from_json(str(large)).get_long(), large) + + def test_float_double(self): + self.assertAlmostEqual(float(GenericVariant.from_json('1.5').to_python()), 1.5) + + def test_float_scientific(self): + v = GenericVariant.from_json('1.5e10') + self.assertEqual(v.get_type(), Type.DOUBLE) + self.assertAlmostEqual(v.get_double(), 1.5e10) + + def test_string_short(self): + v = GenericVariant.from_json('"hello"') + self.assertEqual(v.get_type(), Type.STRING) + self.assertEqual(v.get_string(), 'hello') + + def test_string_long(self): + long_str = 'x' * 100 + v = GenericVariant.from_json(json.dumps(long_str)) + self.assertEqual(v.get_string(), long_str) + + def test_string_unicode(self): + self.assertEqual(GenericVariant.from_json('"北京"').get_string(), '北京') + + def 
test_decimal_precision(self): + v = GenericVariant.from_json('100.99') + self.assertEqual(v.get_type(), Type.DECIMAL) + self.assertAlmostEqual(float(v.get_decimal()), 100.99) + + +# =========================================================================== +# 7. GenericVariant – objects +# =========================================================================== + +class TestObject(unittest.TestCase): + + def _obj(self): + return GenericVariant.from_json('{"age":30,"city":"Beijing","active":true}') + + def test_type(self): + self.assertEqual(self._obj().get_type(), Type.OBJECT) + + def test_object_size(self): + self.assertEqual(self._obj().object_size(), 3) + + def test_get_field_by_key(self): + v = self._obj() + self.assertEqual(v.get_field_by_key('age').get_long(), 30) + self.assertEqual(v.get_field_by_key('city').get_string(), 'Beijing') + self.assertTrue(v.get_field_by_key('active').get_boolean()) + + def test_get_field_missing(self): + self.assertIsNone(self._obj().get_field_by_key('missing')) + + def test_fields_sorted_alphabetically(self): + v = GenericVariant.from_json('{"z":1,"a":2,"m":3}') + keys = [v.get_field_at_index(i)[0] for i in range(v.object_size())] + self.assertEqual(keys, sorted(keys)) + + def test_to_python(self): + result = self._obj().to_python() + self.assertEqual(result, {'age': 30, 'city': 'Beijing', 'active': True}) + + def test_to_json_roundtrip(self): + self.assertEqual( + _roundtrip('{"age":30,"city":"Beijing","active":true}'), + {'age': 30, 'city': 'Beijing', 'active': True} + ) + + def test_nested_object(self): + v = GenericVariant.from_json('{"user":{"name":"Alice","age":25}}') + user = v.get_field_by_key('user') + self.assertEqual(user.get_field_by_key('name').get_string(), 'Alice') + self.assertEqual(user.get_field_by_key('age').get_long(), 25) + + def test_empty_object(self): + v = GenericVariant.from_json('{}') + self.assertEqual(v.get_type(), Type.OBJECT) + self.assertEqual(v.object_size(), 0) + 
self.assertEqual(v.to_python(), {}) + + def test_large_object_binary_search(self): + """Objects with >32 fields use binary search; verify correctness.""" + obj = {f'key{i:03d}': i for i in range(50)} + v = GenericVariant.from_json(json.dumps(obj)) + self.assertEqual(v.get_field_by_key('key000').get_long(), 0) + self.assertEqual(v.get_field_by_key('key049').get_long(), 49) + + +# =========================================================================== +# 8. GenericVariant – arrays +# =========================================================================== + +class TestArray(unittest.TestCase): + + def _arr(self): + return GenericVariant.from_json('[1,2,3]') + + def test_type(self): + self.assertEqual(self._arr().get_type(), Type.ARRAY) + + def test_array_size(self): + self.assertEqual(self._arr().array_size(), 3) + + def test_get_element_at_index(self): + v = self._arr() + self.assertEqual(v.get_element_at_index(0).get_long(), 1) + self.assertEqual(v.get_element_at_index(2).get_long(), 3) + + def test_out_of_bounds(self): + self.assertIsNone(self._arr().get_element_at_index(99)) + + def test_to_python(self): + self.assertEqual(self._arr().to_python(), [1, 2, 3]) + + def test_mixed_array(self): + self.assertEqual( + GenericVariant.from_json('[1,"two",null,true]').to_python(), + [1, 'two', None, True] + ) + + def test_nested_array(self): + v = GenericVariant.from_json('[[1,2],[3,4]]') + self.assertEqual(v.get_element_at_index(0).to_python(), [1, 2]) + self.assertEqual(v.get_element_at_index(1).to_python(), [3, 4]) + + def test_empty_array(self): + v = GenericVariant.from_json('[]') + self.assertEqual(v.get_type(), Type.ARRAY) + self.assertEqual(v.array_size(), 0) + self.assertEqual(v.to_python(), []) + + +# =========================================================================== +# 9. 
GenericVariant – variant_get (JSONPath extraction + cast) +# =========================================================================== + +class TestVariantGet(unittest.TestCase): + + def setUp(self): + self.v = GenericVariant.from_json( + '{"name":"Alice","age":30,"score":9.5,"active":true,' + '"address":{"city":"Beijing","zip":"100000"},' + '"tags":["python","data"],"balance":1234.56}' + ) + + def test_get_string(self): + self.assertEqual(self.v.variant_get('$.name', 'string'), 'Alice') + + def test_get_int(self): + self.assertEqual(self.v.variant_get('$.age', 'int'), 30) + + def test_get_long(self): + self.assertEqual(self.v.variant_get('$.age', 'long'), 30) + + def test_get_double(self): + self.assertAlmostEqual(self.v.variant_get('$.score', 'double'), 9.5, places=5) + + def test_get_boolean(self): + self.assertTrue(self.v.variant_get('$.active', 'boolean')) + + def test_nested_field(self): + self.assertEqual(self.v.variant_get('$.address.city', 'string'), 'Beijing') + + def test_array_index(self): + self.assertEqual(self.v.variant_get('$.tags[0]', 'string'), 'python') + self.assertEqual(self.v.variant_get('$.tags[1]', 'string'), 'data') + + def test_missing_path_returns_none(self): + self.assertIsNone(self.v.variant_get('$.nonexistent')) + + def test_type_mismatch_returns_none(self): + self.assertIsNone(self.v.variant_get('$.tags', 'int')) + + def test_no_cast_returns_python_value(self): + self.assertEqual(self.v.variant_get('$.age'), 30) + + def test_root_dollar_only(self): + self.assertEqual(GenericVariant.from_json('42').variant_get('$', 'int'), 42) + + def test_bracket_key_syntax(self): + self.assertEqual(self.v.variant_get("$['name']", 'string'), 'Alice') + + def test_decimal_cast(self): + result = self.v.variant_get('$.balance', 'decimal') + self.assertIsInstance(result, decimal.Decimal) + self.assertAlmostEqual(float(result), 1234.56, places=2) + + def test_string_cast_on_int(self): + self.assertEqual(self.v.variant_get('$.age', 'string'), '30') + + +# 
=========================================================================== +# 10. GenericVariant – constructors +# =========================================================================== + +class TestConstructors(unittest.TestCase): + + def test_from_dict_roundtrip(self): + original = GenericVariant.from_json('{"x":1,"y":2}') + restored = GenericVariant.from_dict({'value': original.value(), 'metadata': original.metadata()}) + self.assertEqual(restored.to_json(), original.to_json()) + + def test_from_dict_array(self): + original = GenericVariant.from_json('[1,2,3]') + restored = GenericVariant.from_dict({'value': original.value(), 'metadata': original.metadata()}) + self.assertEqual(restored.get_type(), Type.ARRAY) + self.assertEqual(restored.to_python(), [1, 2, 3]) + + def test_from_python_dict(self): + v = GenericVariant.from_python({'a': 1, 'b': 'hello'}) + self.assertEqual(v.variant_get('$.a', 'int'), 1) + self.assertEqual(v.variant_get('$.b', 'string'), 'hello') + + def test_from_python_list(self): + self.assertEqual(GenericVariant.from_python([10, 20, 30]).to_python(), [10, 20, 30]) + + def test_from_python_none(self): + self.assertIsNone(GenericVariant.from_python(None).to_python()) + + def test_from_python_bytes(self): + v = GenericVariant.from_python(b'\x01\x02\x03') + self.assertEqual(v.get_type(), Type.BINARY) + self.assertEqual(v.get_binary(), b'\x01\x02\x03') + + +# =========================================================================== +# 11. 
GenericVariant – to_arrow_array +# =========================================================================== + +class TestToArrowArray(unittest.TestCase): + + def test_basic(self): + gv1 = GenericVariant.from_json('{"a":1}') + gv2 = GenericVariant.from_json('[1,2]') + arr = GenericVariant.to_arrow_array([gv1, gv2]) + self.assertIsInstance(arr, pa.StructArray) + self.assertEqual(len(arr), 2) + restored = GenericVariant.from_dict(arr[0].as_py()) + self.assertEqual(restored.variant_get('$.a', 'int'), 1) + + def test_with_nulls(self): + arr = GenericVariant.to_arrow_array([GenericVariant.from_json('42'), None]) + self.assertEqual(len(arr), 2) + self.assertIsNone(arr[1].as_py()) + + def test_empty(self): + self.assertEqual(len(GenericVariant.to_arrow_array([])), 0) + + +# =========================================================================== +# 12. Java byte-level encoding compatibility +# =========================================================================== + +class TestJavaCompatibility(unittest.TestCase): + """Verify byte-level compatibility with Paimon Java's GenericVariant encoding.""" + + def test_null_encoding(self): + self.assertEqual(GenericVariant.from_json('null').value(), bytes([0x00])) + + def test_true_encoding(self): + v = GenericVariant.from_json('true') + self.assertEqual(v.value()[0], (_TRUE << 2) | _PRIMITIVE) + self.assertTrue(v.get_boolean()) + + def test_int1_encoding(self): + v = GenericVariant.from_json('1') + self.assertEqual(v.value()[0], (_INT1 << 2) | _PRIMITIVE) + self.assertEqual(v.value()[1], 1) + + def test_string_short_encoding(self): + v = GenericVariant.from_json('"hi"') + self.assertEqual(v.value()[0], (2 << 2) | _SHORT_STR) + self.assertEqual(v.value()[1:3], b'hi') + + def test_object_field_order(self): + v = GenericVariant.from_json('{"z":1,"a":2}') + key0, child0 = v.get_field_at_index(0) + self.assertEqual(key0, 'a') + self.assertEqual(child0.get_long(), 2) + + +# 
=========================================================================== +# 13. Complex roundtrip +# =========================================================================== + +class TestComplexRoundtrip(unittest.TestCase): + + def _check(self, json_str): + self.assertEqual(_roundtrip(json_str), json.loads(json_str)) + + def test_nested_object_array(self): + self._check('{"users":[{"name":"Alice","age":30},{"name":"Bob","age":25}]}') + + def test_deep_nesting(self): + self._check('{"a":{"b":{"c":{"d":42}}}}') + + def test_array_of_objects(self): + self._check('[{"x":1},{"x":2},{"x":3}]') + + def test_all_primitive_types(self): + self._check('{"n":null,"b":true,"i":42,"s":"hello","f":1.5}') + + +# =========================================================================== +# 14. BinaryRow – VARIANT is unsupported (raises, not silently corrupts) +# =========================================================================== class TestVariantBinaryRow(unittest.TestCase): - """The BinaryRow path for VARIANT is a safety net; VARIANT is never a key. + """VARIANT is not a valid key/partition type; BinaryRow does not support it. - We verify that the code does not silently corrupt data or raise unexpected - errors. The deserializer returns {'value': bytes, 'metadata': bytes}; - the serializer encodes the value payload as a variable-length binary field. + Both the serializer and deserializer must raise rather than silently + return corrupt data. 
""" - def _make_field(self) -> DataField: + def _field(self): return DataField(id=0, name='v', type=AtomicType('VARIANT')) - def test_serialize_variant_dict(self): - """Serializing a VARIANT dict does not raise.""" - from pypaimon.table.row.generic_row import GenericRow - field = self._make_field() - value = {'value': b'\x15\x05hello', 'metadata': b'\x01\x00'} - row = GenericRow([value], [field]) - serialized = GenericRowSerializer.to_bytes(row) - self.assertIsInstance(serialized, bytes) - self.assertGreater(len(serialized), 0) - - def test_serialize_null_variant(self): - """A NULL VARIANT value serializes to the null-bit representation.""" + def test_deserialize_raises(self): + """Deserializing a VARIANT field from BinaryRow raises ValueError.""" from pypaimon.table.row.generic_row import GenericRow - field = self._make_field() - row = GenericRow([None], [field]) + # Build a BinaryRow that looks like it has a BINARY field at position 0. + # The exact bytes don't matter; the type dispatch must fail before reading. + field = self._field() + # Serialize a plain binary value via a different type to get a valid BinaryRow layout, + # then attempt to deserialize it as VARIANT. 
+ binary_field = DataField(id=0, name='v', type=AtomicType('BYTES')) + row = GenericRow([b'\x00'], [binary_field]) serialized = GenericRowSerializer.to_bytes(row) - self.assertIsInstance(serialized, bytes) + with self.assertRaises(ValueError) as ctx: + GenericRowDeserializer.from_bytes(serialized, [field]) + self.assertIn('VARIANT', str(ctx.exception)) - def test_deserialize_produces_dict(self): - """Deserializing a serialized VARIANT row returns a dict with 'value' key.""" + def test_serialize_raises(self): + """Serializing a VARIANT field via BinaryRow raises TypeError.""" from pypaimon.table.row.generic_row import GenericRow - field = self._make_field() - value = {'value': b'\x15\x05hello', 'metadata': b'\x01\x00'} - row = GenericRow([value], [field]) - serialized = GenericRowSerializer.to_bytes(row) - restored = GenericRowDeserializer.from_bytes(serialized, [field]) - result = restored.values[0] - self.assertIsInstance(result, dict) - self.assertIn('value', result) - - def test_serialize_bytes_fallback(self): - """Serializing raw bytes (not a dict) as VARIANT does not raise.""" - from pypaimon.table.row.generic_row import GenericRow - field = self._make_field() - row = GenericRow([b'\x15\x05hello'], [field]) - serialized = GenericRowSerializer.to_bytes(row) - self.assertIsInstance(serialized, bytes) + field = self._field() + row = GenericRow([{'value': b'\x00', 'metadata': b'\x01\x00'}], [field]) + with self.assertRaises(TypeError) as ctx: + GenericRowSerializer.to_bytes(row) + self.assertIn('VARIANT', str(ctx.exception)) if __name__ == '__main__': From 46ba2461658e14d762a3bc3928f9f7e20aa98158 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Mon, 13 Apr 2026 15:56:10 +0800 Subject: [PATCH 04/10] support VARIANT for pypaimon --- .../java/org/apache/paimon/JavaPyE2ETest.java | 12 ++--- paimon-python/dev/run_mixed_tests.sh | 22 +++++---- .../pypaimon/data/generic_variant.py | 49 +++++++++++++------ 
.../pypaimon/table/row/generic_row.py | 2 +- .../tests/e2e/java_py_read_write_test.py | 8 +-- paimon-python/pypaimon/tests/variant_test.py | 39 --------------- .../pypaimon/write/writer/data_writer.py | 11 ++++- 7 files changed, 67 insertions(+), 76 deletions(-) diff --git a/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java b/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java index db7c0a911236..d0309969dc64 100644 --- a/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java +++ b/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java @@ -942,10 +942,10 @@ protected GenericRow createRow3ColsWithKind(RowKind rowKind, Object... values) { return GenericRow.ofKind(rowKind, values[0], values[1], values[2]); } - /** Write a VARIANT column table for Python interoperability test. */ + /** Java writes a VARIANT-column table for Python to read (Java→Python E2E). */ @Test @EnabledIfSystemProperty(named = "run.e2e.tests", matches = "true") - public void testVariantWrite() throws Exception { + public void testJavaWriteVariantTable() throws Exception { Identifier identifier = identifier("variant_test"); catalog.dropTable(identifier, true); Schema schema = @@ -986,13 +986,13 @@ public void testVariantWrite() throws Exception { List res = getResult(read, splits, row -> internalRowToString(row, readTable.rowType())); assertThat(res).hasSize(3); - LOG.info("testVariantWrite: wrote and read back {} VARIANT rows", res.size()); + LOG.info("testJavaWriteVariantTable: wrote and read back {} VARIANT rows", res.size()); } - /** Read a VARIANT-column table written by Python and verify content (Python→Java E2E). */ + /** Java reads a VARIANT-column table written by Python (Python→Java E2E). 
*/ @Test @EnabledIfSystemProperty(named = "run.e2e.tests", matches = "true") - public void testReadVariantTable() throws Exception { + public void testJavaReadVariantTable() throws Exception { Identifier identifier = identifier("py_variant_test"); FileStoreTable table = (FileStoreTable) catalog.getTable(identifier); List splits = @@ -1025,7 +1025,7 @@ public void testReadVariantTable() throws Exception { } }); } - LOG.info("testReadVariantTable: Java read {} VARIANT rows written by Python", res.size()); + LOG.info("testJavaReadVariantTable: Java read {} VARIANT rows written by Python", res.size()); } /** Step 1: Write 5 base files for compact conflict test. */ diff --git a/paimon-python/dev/run_mixed_tests.sh b/paimon-python/dev/run_mixed_tests.sh index ef966f4dd2b5..2067d1813eab 100755 --- a/paimon-python/dev/run_mixed_tests.sh +++ b/paimon-python/dev/run_mixed_tests.sh @@ -339,21 +339,22 @@ run_blob_alter_compact_test() { fi } -run_variant_test() { - echo -e "${YELLOW}=== Running VARIANT Type Test (Java Write, Python Read) ===${NC}" +# Function to run VARIANT test (Java write, Python read) +run_java_variant_write_py_read_test() { + echo -e "${YELLOW}=== Running VARIANT Test (Java Write, Python Read) ===${NC}" cd "$PROJECT_ROOT" - echo "Running Maven test for JavaPyE2ETest.testVariantWrite..." - if mvn test -Dtest=org.apache.paimon.JavaPyE2ETest#testVariantWrite -pl paimon-core -q -Drun.e2e.tests=true; then + echo "Running Maven test for JavaPyE2ETest.testJavaWriteVariantTable..." + if mvn test -Dtest=org.apache.paimon.JavaPyE2ETest#testJavaWriteVariantTable -pl paimon-core -q -Drun.e2e.tests=true; then echo -e "${GREEN}✓ Java VARIANT write test completed successfully${NC}" else echo -e "${RED}✗ Java VARIANT write test failed${NC}" return 1 fi cd "$PAIMON_PYTHON_DIR" - echo "Running Python test for JavaPyReadWriteTest.test_read_variant_table..." 
- if python -m pytest java_py_read_write_test.py::JavaPyReadWriteTest::test_read_variant_table -v; then + echo "Running Python test for JavaPyReadWriteTest.test_py_read_variant_table..." + if python -m pytest java_py_read_write_test.py::JavaPyReadWriteTest::test_py_read_variant_table -v; then echo -e "${GREEN}✓ Python VARIANT read test completed successfully${NC}" return 0 else @@ -362,8 +363,9 @@ run_variant_test() { fi } +# Function to run VARIANT test (Python write, Java read) run_py_variant_write_java_read_test() { - echo -e "${YELLOW}=== Step 13: Running VARIANT Python-Write Java-Read Test ===${NC}" + echo -e "${YELLOW}=== Running VARIANT Test (Python Write, Java Read) ===${NC}" cd "$PAIMON_PYTHON_DIR" echo "Running Python test for JavaPyReadWriteTest.test_py_write_variant_table..." @@ -376,8 +378,8 @@ run_py_variant_write_java_read_test() { echo "" cd "$PROJECT_ROOT" - echo "Running Maven test for JavaPyE2ETest.testReadVariantTable..." - if mvn test -Dtest=org.apache.paimon.JavaPyE2ETest#testReadVariantTable -pl paimon-core -q -Drun.e2e.tests=true; then + echo "Running Maven test for JavaPyE2ETest.testJavaReadVariantTable..." + if mvn test -Dtest=org.apache.paimon.JavaPyE2ETest#testJavaReadVariantTable -pl paimon-core -q -Drun.e2e.tests=true; then echo -e "${GREEN}✓ Java VARIANT read test completed successfully${NC}" return 0 else @@ -498,7 +500,7 @@ main() { echo "" # Run VARIANT type test (Java write, Python read) - if ! run_variant_test; then + if ! 
run_java_variant_write_py_read_test; then java_variant_write_py_read_result=1 fi diff --git a/paimon-python/pypaimon/data/generic_variant.py b/paimon-python/pypaimon/data/generic_variant.py index 7e7a9c3b5fc0..e46d6ba7114f 100644 --- a/paimon-python/pypaimon/data/generic_variant.py +++ b/paimon-python/pypaimon/data/generic_variant.py @@ -31,6 +31,7 @@ v.to_python() – decode to native Python objects """ +import base64 import datetime import decimal as _decimal import enum @@ -86,7 +87,6 @@ _EPOCH_DT_NTZ = datetime.datetime(1970, 1, 1) - class Type(enum.Enum): """High-level variant value types (many-to-one from wire types).""" OBJECT = 'OBJECT' @@ -150,8 +150,7 @@ def _read_signed(data, pos, n): def _write_le(buf, pos, value, n): """Write value as n-byte little-endian into bytearray buf at pos.""" - for i in range(n): - buf[pos + i] = (value >> (8 * i)) & 0xFF + buf[pos:pos + n] = value.to_bytes(n, 'little') def _get_int_size(value): @@ -652,7 +651,7 @@ def _try_decimal_or_double(self, d): if scale <= _MAX_DECIMAL16_PRECISION and precision <= _MAX_DECIMAL16_PRECISION: self.append_decimal(d) return - except Exception: + except (ArithmeticError, ValueError): pass self.append_double(float(d)) @@ -890,12 +889,23 @@ def object_size(self) -> int: def get_field_by_key(self, key: str): """Return the field GenericVariant for the given key, or None if not found.""" + metadata = self._metadata + # Pre-parse the metadata header once for the entire lookup. 
+ meta_offset_size = ((metadata[0] >> 6) & 0x3) + 1 + meta_dict_size = _read_unsigned(metadata, 1, meta_offset_size) + string_start = 1 + (meta_dict_size + 2) * meta_offset_size + + def _get_key(key_id): + off = _read_unsigned(metadata, 1 + (key_id + 1) * meta_offset_size, meta_offset_size) + nxt = _read_unsigned(metadata, 1 + (key_id + 2) * meta_offset_size, meta_offset_size) + return metadata[string_start + off:string_start + nxt].decode('utf-8') + def _lookup(size, id_size, offset_size, id_start, offset_start, data_start): - # Binary search for large objects, linear for small ones + # Linear scan for small objects, binary search for large ones. if size < _BINARY_SEARCH_THRESHOLD: for i in range(size): fid = _read_unsigned(self._value, id_start + id_size * i, id_size) - if key == _get_metadata_key(self._metadata, fid): + if key == _get_key(fid): offset = _read_unsigned( self._value, offset_start + offset_size * i, offset_size) return GenericVariant(self._value, self._metadata, @@ -905,7 +915,7 @@ def _lookup(size, id_size, offset_size, id_start, offset_start, data_start): while lo <= hi: mid = (lo + hi) >> 1 fid = _read_unsigned(self._value, id_start + id_size * mid, id_size) - cmp = _get_metadata_key(self._metadata, fid) + cmp = _get_key(fid) if cmp < key: lo = mid + 1 elif cmp > key: @@ -1018,7 +1028,6 @@ def _render_arr(size, offset_size, offset_start, data_start): dt = _EPOCH_DT_NTZ + datetime.timedelta(microseconds=micros) parts.append(_json.dumps(dt.strftime('%Y-%m-%d %H:%M:%S.%f'))) elif vtype == Type.BINARY: - import base64 parts.append(_json.dumps(base64.b64encode(sub.get_binary()).decode('ascii'))) elif vtype == Type.UUID: parts.append(_json.dumps(str(sub.get_uuid()))) @@ -1066,14 +1075,24 @@ def to_python(self): if vtype == Type.UUID: return str(self.get_uuid()) if vtype == Type.OBJECT: - result = {} - for i in range(self.object_size()): - key, child = self.get_field_at_index(i) - result[key] = child.to_python() - return result + def _build_dict(size, 
id_size, offset_size, id_start, offset_start, data_start): + result = {} + for i in range(size): + fid = _read_unsigned(self._value, id_start + id_size * i, id_size) + key = _get_metadata_key(self._metadata, fid) + offset = _read_unsigned(self._value, offset_start + offset_size * i, offset_size) + child = GenericVariant(self._value, self._metadata, data_start + offset) + result[key] = child.to_python() + return result + return _handle_object(self._value, self._pos, _build_dict) if vtype == Type.ARRAY: - return [self.get_element_at_index(i).to_python() - for i in range(self.array_size())] + def _build_list(size, offset_size, offset_start, data_start): + result = [] + for i in range(size): + offset = _read_unsigned(self._value, offset_start + offset_size * i, offset_size) + result.append(GenericVariant(self._value, self._metadata, data_start + offset).to_python()) + return result + return _handle_array(self._value, self._pos, _build_list) return None def variant_get(self, path: str, cast_type: str = None): diff --git a/paimon-python/pypaimon/table/row/generic_row.py b/paimon-python/pypaimon/table/row/generic_row.py index 4beeb99e42cd..4aa740de7219 100644 --- a/paimon-python/pypaimon/table/row/generic_row.py +++ b/paimon-python/pypaimon/table/row/generic_row.py @@ -152,7 +152,7 @@ def parse_field_value( elif type_name.startswith('TIME'): return cls._parse_time(bytes_data, field_offset) else: - raise ValueError(f"Unsupported type in BinaryRow deserialization: {type_name}") + return cls._parse_string(bytes_data, base_offset, field_offset) @classmethod def _parse_boolean(cls, bytes_data: bytes, field_offset: int) -> bool: diff --git a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py index d12fe975b020..e0137f42d02e 100644 --- a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py +++ b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py @@ -672,8 +672,8 @@ def 
test_compact_conflict_shard_update(self): tc.close() print(f"Conflict detected as expected: {ctx.exception}") - def test_read_variant_table(self): - """Read a VARIANT-column table written by Java and verify the struct layout.""" + def test_py_read_variant_table(self): + """Python reads a VARIANT-column table written by Java (Java→Python E2E).""" table = self.catalog.get_table('default.variant_test') read_builder = table.new_read_builder() table_scan = read_builder.new_scan() @@ -723,10 +723,10 @@ def test_read_variant_table(self): gv_carol = GenericVariant.from_dict(payload_list[id_list.index(3)]) self.assertEqual(gv_carol.to_python(), [1, 2, 3]) - print(f"test_read_variant_table: verified {result.num_rows} VARIANT rows") + print(f"test_py_read_variant_table: verified {result.num_rows} VARIANT rows") def test_py_write_variant_table(self): - """Write a VARIANT-column table for Java to read back (Python→Java E2E). + """Python writes a VARIANT-column table for Java to read back (Python→Java E2E). Data written: id=1 payload={"name":"test","value":42} diff --git a/paimon-python/pypaimon/tests/variant_test.py b/paimon-python/pypaimon/tests/variant_test.py index e66d77c8c162..7759c883d38d 100644 --- a/paimon-python/pypaimon/tests/variant_test.py +++ b/paimon-python/pypaimon/tests/variant_test.py @@ -645,44 +645,5 @@ def test_all_primitive_types(self): self._check('{"n":null,"b":true,"i":42,"s":"hello","f":1.5}') -# =========================================================================== -# 14. BinaryRow – VARIANT is unsupported (raises, not silently corrupts) -# =========================================================================== - -class TestVariantBinaryRow(unittest.TestCase): - """VARIANT is not a valid key/partition type; BinaryRow does not support it. - - Both the serializer and deserializer must raise rather than silently - return corrupt data. 
- """ - - def _field(self): - return DataField(id=0, name='v', type=AtomicType('VARIANT')) - - def test_deserialize_raises(self): - """Deserializing a VARIANT field from BinaryRow raises ValueError.""" - from pypaimon.table.row.generic_row import GenericRow - # Build a BinaryRow that looks like it has a BINARY field at position 0. - # The exact bytes don't matter; the type dispatch must fail before reading. - field = self._field() - # Serialize a plain binary value via a different type to get a valid BinaryRow layout, - # then attempt to deserialize it as VARIANT. - binary_field = DataField(id=0, name='v', type=AtomicType('BYTES')) - row = GenericRow([b'\x00'], [binary_field]) - serialized = GenericRowSerializer.to_bytes(row) - with self.assertRaises(ValueError) as ctx: - GenericRowDeserializer.from_bytes(serialized, [field]) - self.assertIn('VARIANT', str(ctx.exception)) - - def test_serialize_raises(self): - """Serializing a VARIANT field via BinaryRow raises TypeError.""" - from pypaimon.table.row.generic_row import GenericRow - field = self._field() - row = GenericRow([{'value': b'\x00', 'metadata': b'\x01\x00'}], [field]) - with self.assertRaises(TypeError) as ctx: - GenericRowSerializer.to_bytes(row) - self.assertIn('VARIANT', str(ctx.exception)) - - if __name__ == '__main__': unittest.main() diff --git a/paimon-python/pypaimon/write/writer/data_writer.py b/paimon-python/pypaimon/write/writer/data_writer.py index d6744471e537..15a82e4343ea 100644 --- a/paimon-python/pypaimon/write/writer/data_writer.py +++ b/paimon-python/pypaimon/write/writer/data_writer.py @@ -70,6 +70,8 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op ) # Store the current generated external path to preserve scheme in metadata self._current_external_path: Optional[str] = None + # Guard so the VARIANT-format compatibility check runs only once per writer instance. 
+ self._variant_format_checked: bool = False def write(self, data: pa.RecordBatch): try: @@ -155,7 +157,14 @@ def _check_and_roll_if_needed(self): self.pending_data = remaining_data def _check_no_variant_for_format(self, schema: pa.Schema): - """Raise NotImplementedError if any VARIANT column is present for an unsupported format.""" + """Raise NotImplementedError if any VARIANT column is present for an unsupported format. + + The check is performed only once per writer instance; subsequent calls are no-ops + because the schema and file format are both fixed for the lifetime of the writer. + """ + if self._variant_format_checked: + return + self._variant_format_checked = True if self.file_format in (CoreOptions.FILE_FORMAT_ORC, CoreOptions.FILE_FORMAT_AVRO): for field in schema: if pa.types.is_struct(field.type) and is_variant_struct(field.type): From 6bbb54f6a5b9a46aa3ec5f25aacedf506dd468ce Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:10:13 +0800 Subject: [PATCH 05/10] support VARIANT for pypaimon --- paimon-python/pypaimon/schema/data_types.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/paimon-python/pypaimon/schema/data_types.py b/paimon-python/pypaimon/schema/data_types.py index debecf8d2564..615bb71cd04f 100755 --- a/paimon-python/pypaimon/schema/data_types.py +++ b/paimon-python/pypaimon/schema/data_types.py @@ -504,12 +504,6 @@ def from_paimon_type(data_type: DataType) -> pyarrow.DataType: elif type_name == 'BLOB': return pyarrow.large_binary() elif type_name == 'VARIANT': - # VARIANT is stored in Parquet as a struct with two non-nullable BINARY fields, - # matching Paimon Java's ParquetSchemaConverter encoding: - # required group { required binary value; required binary metadata; } - # 'value' holds the encoded variant payload (Parquet Variant binary spec). - # 'metadata' holds the key-dictionary for object field names. 
- # PyArrow reads this group transparently as pa.struct; no special reader needed. return pyarrow.struct([ pyarrow.field('value', pyarrow.binary(), nullable=False), pyarrow.field('metadata', pyarrow.binary(), nullable=False), @@ -625,9 +619,6 @@ def to_paimon_type(pa_type: pyarrow.DataType, nullable: bool) -> DataType: value_type = PyarrowFieldParser.to_paimon_type(pa_type.item_type, nullable) return MapType(nullable, key_type, value_type) elif types.is_struct(pa_type) and is_variant_struct(pa_type): - # Recognise the VARIANT encoding: a struct with exactly two non-nullable - # BINARY fields named 'value' and 'metadata'. Must be checked before the - # generic struct branch to avoid misclassifying it as a ROW type. return AtomicType('VARIANT', nullable) elif types.is_struct(pa_type): pa_type: pyarrow.StructType From f535dfba1d042b0fd9f4f5b22e95d7610fd23c1f Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:21:01 +0800 Subject: [PATCH 06/10] support VARIANT for pypaimon --- .../src/test/java/org/apache/paimon/JavaPyE2ETest.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java b/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java index d0309969dc64..44dffd65d147 100644 --- a/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java +++ b/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java @@ -995,8 +995,7 @@ public void testJavaWriteVariantTable() throws Exception { public void testJavaReadVariantTable() throws Exception { Identifier identifier = identifier("py_variant_test"); FileStoreTable table = (FileStoreTable) catalog.getTable(identifier); - List splits = - new ArrayList<>(table.newSnapshotReader().read().dataSplits()); + List splits = new ArrayList<>(table.newSnapshotReader().read().dataSplits()); TableRead read = table.newRead(); List res = getResult(read, splits, row -> 
internalRowToString(row, table.rowType())); @@ -1019,13 +1018,14 @@ public void testJavaReadVariantTable() throws Exception { assertThat(row.isNullAt(2)).isTrue(); } else { assertThat(row.isNullAt(2)).isFalse(); - org.apache.paimon.data.variant.Variant v = - row.getVariant(2); + org.apache.paimon.data.variant.Variant v = row.getVariant(2); assertThat(v).isNotNull(); } }); } - LOG.info("testJavaReadVariantTable: Java read {} VARIANT rows written by Python", res.size()); + LOG.info( + "testJavaReadVariantTable: Java read {} VARIANT rows written by Python", + res.size()); } /** Step 1: Write 5 base files for compact conflict test. */ From 3edec311725c05ec1f85d8e6a4280cbe7e6161e5 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Mon, 13 Apr 2026 19:43:27 +0800 Subject: [PATCH 07/10] support VARIANT for pypaimon --- paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py | 1 - paimon-python/pypaimon/tests/variant_test.py | 1 - 2 files changed, 2 deletions(-) diff --git a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py index e0137f42d02e..2331a1c6664a 100644 --- a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py +++ b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py @@ -769,4 +769,3 @@ def test_py_write_variant_table(self): table_write.close() table_commit.close() print(f"test_py_write_variant_table: wrote 4 VARIANT rows to {table_name}") - diff --git a/paimon-python/pypaimon/tests/variant_test.py b/paimon-python/pypaimon/tests/variant_test.py index 7759c883d38d..6eb4d16ef07b 100644 --- a/paimon-python/pypaimon/tests/variant_test.py +++ b/paimon-python/pypaimon/tests/variant_test.py @@ -49,7 +49,6 @@ RowType, is_variant_struct, ) -from pypaimon.table.row.generic_row import GenericRowDeserializer, GenericRowSerializer # --------------------------------------------------------------------------- From 
a457b27da47d1724cbce34a7442cb58707eac1b2 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Mon, 13 Apr 2026 20:04:48 +0800 Subject: [PATCH 08/10] support VARIANT for pypaimon --- paimon-python/pypaimon/schema/data_types.py | 2 +- .../pypaimon/tests/e2e/java_py_read_write_test.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paimon-python/pypaimon/schema/data_types.py b/paimon-python/pypaimon/schema/data_types.py index 615bb71cd04f..d2271d9f0aff 100755 --- a/paimon-python/pypaimon/schema/data_types.py +++ b/paimon-python/pypaimon/schema/data_types.py @@ -469,7 +469,7 @@ def is_variant_struct(pa_type: pyarrow.StructType) -> bool: """ if pa_type.num_fields != 2: return False - f0, f1 = pa_type.field(0), pa_type.field(1) + f0, f1 = pa_type[0], pa_type[1] return ( f0.name == 'value' and pyarrow.types.is_binary(f0.type) and not f0.nullable and f1.name == 'metadata' and pyarrow.types.is_binary(f1.type) and not f1.nullable diff --git a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py index 2331a1c6664a..37cc43ce1741 100644 --- a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py +++ b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py @@ -688,10 +688,10 @@ def test_py_read_variant_table(self): self.assertTrue(pa.types.is_struct(payload_field.type), f"Expected struct type for VARIANT, got {payload_field.type}") self.assertEqual(payload_field.type.num_fields, 2) - self.assertEqual(payload_field.type.field(0).name, 'value') - self.assertEqual(payload_field.type.field(1).name, 'metadata') - self.assertTrue(pa.types.is_binary(payload_field.type.field(0).type)) - self.assertTrue(pa.types.is_binary(payload_field.type.field(1).type)) + self.assertEqual(payload_field.type[0].name, 'value') + self.assertEqual(payload_field.type[1].name, 'metadata') + self.assertTrue(pa.types.is_binary(payload_field.type[0].type)) + 
self.assertTrue(pa.types.is_binary(payload_field.type[1].type)) # All rows should have non-null payload structs payload_col = result.column('payload') From b6c0787a14a95e41285b4ff4a1a33bfa2164f287 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Tue, 14 Apr 2026 10:10:52 +0800 Subject: [PATCH 09/10] support VARIANT for pypaimon --- paimon-python/pypaimon/data/generic_variant.py | 2 +- paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paimon-python/pypaimon/data/generic_variant.py b/paimon-python/pypaimon/data/generic_variant.py index e46d6ba7114f..d5883dde828e 100644 --- a/paimon-python/pypaimon/data/generic_variant.py +++ b/paimon-python/pypaimon/data/generic_variant.py @@ -782,7 +782,7 @@ def to_arrow_array(cls, variants): return _pa.StructArray.from_arrays( [_pa.array(values, type=_pa.binary()), _pa.array(metadatas, type=_pa.binary())], - fields=[variant_type.field(0), variant_type.field(1)], + fields=[variant_type[0], variant_type[1]], mask=_pa.array(mask, type=_pa.bool_()), ) diff --git a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py index 37cc43ce1741..9ae23dd8f467 100644 --- a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py +++ b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py @@ -746,14 +746,15 @@ def test_py_write_variant_table(self): schema = Schema.from_pyarrow_schema(pa_schema, options={'bucket': '-1'}) table_name = 'default.py_variant_test' - self.catalog.create_table(table_name, schema, True) + self.catalog.drop_table(table_name, True) + self.catalog.create_table(table_name, schema, False) table = self.catalog.get_table(table_name) variant_col = GenericVariant.to_arrow_array([ GenericVariant.from_json('{"name":"test","value":42}'), GenericVariant.from_json('[10,20,30]'), GenericVariant.from_json('"hello"'), - 
GenericVariant.from_json('null'), + None, # SQL NULL at the column level, not a VARIANT containing JSON null ]) data = pa.table({ 'id': pa.array([1, 2, 3, 4], type=pa.int32()), From 5b65553d4fdf93c20ce3009ce200f56e8d5c8cc2 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Tue, 14 Apr 2026 10:30:51 +0800 Subject: [PATCH 10/10] support VARIANT for pypaimon --- docs/content/pypaimon/python-api.md | 14 +++++++------- paimon-python/pypaimon/data/generic_variant.py | 13 ++++++++++--- .../pypaimon/tests/e2e/java_py_read_write_test.py | 6 +++--- paimon-python/pypaimon/tests/variant_test.py | 10 +++++----- 4 files changed, 25 insertions(+), 18 deletions(-) diff --git a/docs/content/pypaimon/python-api.md b/docs/content/pypaimon/python-api.md index 109d34ea21cc..26c39bc66c95 100644 --- a/docs/content/pypaimon/python-api.md +++ b/docs/content/pypaimon/python-api.md @@ -736,11 +736,11 @@ from pypaimon.data.generic_variant import GenericVariant read_builder = table.new_read_builder() result = read_builder.new_read().to_arrow(read_builder.new_scan().plan().splits()) -for row in result.column("payload").to_pylist(): - if row is not None: - gv = GenericVariant.from_dict(row) # wrap raw bytes - print(gv.to_python()) # decode to Python object - print(gv.variant_get("$.city", "string")) # path extraction +for record in result.to_pylist(): + if (payload := record["payload"]) is not None: + gv = GenericVariant.from_arrow_struct(payload) + print(gv.to_python()) # decode to Python object + print(gv.variant_get("$.city", "string")) # path extraction ``` **Writing a VARIANT column:** @@ -773,7 +773,7 @@ table_commit.close() |:-------|:------------| | `GenericVariant.from_json(json_str)` | Build from a JSON string | | `GenericVariant.from_python(obj)` | Build from a Python object (`dict`, `list`, `int`, `str`, …) | -| `GenericVariant.from_dict({"value": b"...", "metadata": b"..."})` | Wrap raw bytes from an Arrow VARIANT struct row | +| 
`GenericVariant.from_arrow_struct({"value": b"...", "metadata": b"..."})` | Wrap raw bytes from an Arrow VARIANT struct row (read path) | | `GenericVariant.to_arrow_array([gv1, gv2, None, ...])` | Convert a list of `GenericVariant` (or `None`) to a `pa.StructArray` for writing | | `gv.to_python()` | Decode to native Python (`dict`, `list`, `int`, `str`, `None`, …) | | `gv.to_json()` | Decode to a JSON string | @@ -785,7 +785,7 @@ table_commit.close() - `VARIANT` is only supported with Parquet file format. Writing to ORC or Avro raises `NotImplementedError`. - `VARIANT` cannot be used as a primary key or partition key. - Shredded VARIANT files (written by Paimon Java with `typed_value` sub-fields) are readable - via the raw `from_dict` path, but the extra fields are not automatically interpreted. + via the raw `from_arrow_struct` path, but the extra fields are not automatically interpreted. ## Predicate diff --git a/paimon-python/pypaimon/data/generic_variant.py b/paimon-python/pypaimon/data/generic_variant.py index d5883dde828e..9e7018de1e26 100644 --- a/paimon-python/pypaimon/data/generic_variant.py +++ b/paimon-python/pypaimon/data/generic_variant.py @@ -709,7 +709,7 @@ class GenericVariant: # Construct from raw bytes (e.g. what to_arrow() returns for a VARIANT column) row = result.column('payload')[0].as_py() # {'value': bytes, 'metadata': bytes} - v = GenericVariant.from_dict(row) + v = GenericVariant.from_arrow_struct(row) print(v.to_python()) # {'age': 30, 'city': 'Beijing'} """ @@ -741,8 +741,15 @@ def from_python(cls, obj) -> 'GenericVariant': return builder.result() @classmethod - def from_dict(cls, d: dict) -> 'GenericVariant': - """Wrap raw bytes from a PyArrow VARIANT struct: {'value': bytes, 'metadata': bytes}.""" + def from_arrow_struct(cls, d: dict) -> 'GenericVariant': + """Wrap raw bytes from a PyArrow VARIANT struct: {'value': bytes, 'metadata': bytes}. 
+ + Use this on the read path after calling ``column.to_pylist()`` on a VARIANT column:: + + for row in result.column("payload").to_pylist(): + if row is not None: + gv = GenericVariant.from_arrow_struct(row) + """ return cls(bytes(d['value']), bytes(d['metadata'])) @classmethod diff --git a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py index 9ae23dd8f467..552a8f044e8b 100644 --- a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py +++ b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py @@ -710,17 +710,17 @@ def test_py_read_variant_table(self): payload_list = result_sorted.column('payload').to_pylist() # Row 1: Alice, {"age":30,"city":"Beijing"} - gv_alice = GenericVariant.from_dict(payload_list[id_list.index(1)]) + gv_alice = GenericVariant.from_arrow_struct(payload_list[id_list.index(1)]) self.assertEqual(gv_alice.variant_get('$.age', 'int'), 30) self.assertEqual(gv_alice.variant_get('$.city', 'string'), 'Beijing') # Row 2: Bob, {"age":25,"city":"Shanghai"} - gv_bob = GenericVariant.from_dict(payload_list[id_list.index(2)]) + gv_bob = GenericVariant.from_arrow_struct(payload_list[id_list.index(2)]) self.assertEqual(gv_bob.variant_get('$.age', 'int'), 25) self.assertEqual(gv_bob.variant_get('$.city', 'string'), 'Shanghai') # Row 3: Carol, [1,2,3] - gv_carol = GenericVariant.from_dict(payload_list[id_list.index(3)]) + gv_carol = GenericVariant.from_arrow_struct(payload_list[id_list.index(3)]) self.assertEqual(gv_carol.to_python(), [1, 2, 3]) print(f"test_py_read_variant_table: verified {result.num_rows} VARIANT rows") diff --git a/paimon-python/pypaimon/tests/variant_test.py b/paimon-python/pypaimon/tests/variant_test.py index 6eb4d16ef07b..06b5f4f26bde 100644 --- a/paimon-python/pypaimon/tests/variant_test.py +++ b/paimon-python/pypaimon/tests/variant_test.py @@ -538,14 +538,14 @@ def test_string_cast_on_int(self): class TestConstructors(unittest.TestCase): - def 
test_from_dict_roundtrip(self): + def test_from_arrow_struct_roundtrip(self): original = GenericVariant.from_json('{"x":1,"y":2}') - restored = GenericVariant.from_dict({'value': original.value(), 'metadata': original.metadata()}) + restored = GenericVariant.from_arrow_struct({'value': original.value(), 'metadata': original.metadata()}) self.assertEqual(restored.to_json(), original.to_json()) - def test_from_dict_array(self): + def test_from_arrow_struct_array(self): original = GenericVariant.from_json('[1,2,3]') - restored = GenericVariant.from_dict({'value': original.value(), 'metadata': original.metadata()}) + restored = GenericVariant.from_arrow_struct({'value': original.value(), 'metadata': original.metadata()}) self.assertEqual(restored.get_type(), Type.ARRAY) self.assertEqual(restored.to_python(), [1, 2, 3]) @@ -578,7 +578,7 @@ def test_basic(self): arr = GenericVariant.to_arrow_array([gv1, gv2]) self.assertIsInstance(arr, pa.StructArray) self.assertEqual(len(arr), 2) - restored = GenericVariant.from_dict(arr[0].as_py()) + restored = GenericVariant.from_arrow_struct(arr[0].as_py()) self.assertEqual(restored.variant_get('$.a', 'int'), 1) def test_with_nulls(self):