From 13b431cf0e164d8753f888a1e14211008db18536 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Thu, 30 Apr 2026 12:43:09 +0200 Subject: [PATCH 01/39] Separate public API from impl of comparison functions This commit separates the actual implementation from the public `*Equal` functions. Now, all the public API does is convert the `opts ...EqualOption` into an `opt equalOption` struct and pass it into the implementation. This is useful to avoid having to reconstruct back and forth between the two in nested call stacks. All the implementations care about is `equalOption`, and `EqualOption` remains as a convenient thing only for the public API. --- arrow/array/compare.go | 70 +++++++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/arrow/array/compare.go b/arrow/array/compare.go index d8a9552e..19252364 100644 --- a/arrow/array/compare.go +++ b/arrow/array/compare.go @@ -26,7 +26,11 @@ import ( ) // RecordEqual reports whether the two provided records are equal. -func RecordEqual(left, right arrow.RecordBatch) bool { +func RecordEqual(left, right arrow.RecordBatch, opts ...EqualOption) bool { + return recordEqual(left, right, newEqualOption(opts...)) +} + +func recordEqual(left, right arrow.RecordBatch, opt equalOption) bool { switch { case left.NumCols() != right.NumCols(): return false @@ -37,7 +41,7 @@ func RecordEqual(left, right arrow.RecordBatch) bool { for i := range left.Columns() { lc := left.Column(i) rc := right.Column(i) - if !Equal(lc, rc) { + if !equal(lc, rc, opt) { return false } } @@ -47,6 +51,10 @@ func RecordEqual(left, right arrow.RecordBatch) bool { // RecordApproxEqual reports whether the two provided records are approximately equal. // For non-floating point columns, it is equivalent to RecordEqual. func RecordApproxEqual(left, right arrow.RecordBatch, opts ...EqualOption) bool { + return recordApproxEqual(left, right, newEqualOption(opts...)) +} + +func recordApproxEqual(left, right arrow.RecordBatch, opt equalOption) bool { switch { case left.NumCols() != right.NumCols(): return false @@ -54,8 +62,6 @@ func RecordApproxEqual(left, right arrow.RecordBatch, opts ...EqualOption) bool return false } - opt := newEqualOption(opts...) - for i := range left.Columns() { lc := left.Column(i) rc := right.Column(i) @@ -106,7 +112,11 @@ func chunkedBinaryApply(left, right *arrow.Chunked, fn func(left arrow.Array, lb } // ChunkedEqual reports whether two chunked arrays are equal regardless of their chunkings -func ChunkedEqual(left, right *arrow.Chunked) bool { +func ChunkedEqual(left, right *arrow.Chunked, opts ...EqualOption) bool { + return chunkedEqual(left, right, newEqualOption(opts...)) +} + +func chunkedEqual(left, right *arrow.Chunked, opt equalOption) bool { switch { case left == right: return true @@ -130,6 +140,10 @@ func ChunkedEqual(left, right *arrow.Chunked) bool { // ChunkedApproxEqual reports whether two chunked arrays are approximately equal regardless of their chunkings // for non-floating point arrays, this is equivalent to ChunkedEqual func ChunkedApproxEqual(left, right *arrow.Chunked, opts ...EqualOption) bool { + return chunkedApproxEqual(left, right, newEqualOption(opts...)) +} + +func chunkedApproxEqual(left, right *arrow.Chunked, opt equalOption) bool { switch { case left == right: return true @@ -143,7 +157,7 @@ func ChunkedApproxEqual(left, right *arrow.Chunked, opts ...EqualOption) bool { var isequal = true chunkedBinaryApply(left, right, func(left arrow.Array, lbeg, lend int64, right arrow.Array, rbeg, rend int64) bool { - isequal = SliceApproxEqual(left, lbeg, lend, right, rbeg, rend, opts...) + isequal = sliceApproxEqual(left, lbeg, lend, right, rbeg, rend, opt) return isequal }) @@ -151,7 +165,11 @@ func ChunkedApproxEqual(left, right *arrow.Chunked, opts ...EqualOption) bool { } // TableEqual returns if the two tables have the same data in the same schema -func TableEqual(left, right arrow.Table) bool { +func TableEqual(left, right arrow.Table, opts ...EqualOption) bool { + return tableEqual(left, right, newEqualOption(opts...)) +} + +func tableEqual(left, right arrow.Table, opt equalOption) bool { switch { case left.NumCols() != right.NumCols(): return false @@ -166,15 +184,19 @@ func TableEqual(left, right arrow.Table) bool { return false } - if !ChunkedEqual(lc.Data(), rc.Data()) { + if !chunkedEqual(lc.Data(), rc.Data(), opt) { return false } } return true } -// TableEqual returns if the two tables have the approximately equal data in the same schema +// TableApproxEqual returns if the two tables have the approximately equal data in the same schema func TableApproxEqual(left, right arrow.Table, opts ...EqualOption) bool { + return tableApproxEqual(left, right, newEqualOption(opts...)) +} + +func tableApproxEqual(left, right arrow.Table, opt equalOption) bool { switch { case left.NumCols() != right.NumCols(): return false @@ -189,7 +211,7 @@ func TableApproxEqual(left, right arrow.Table, opts ...EqualOption) bool { return false } - if !ChunkedApproxEqual(lc.Data(), rc.Data(), opts...) { + if !chunkedApproxEqual(lc.Data(), rc.Data(), opt) { return false } } @@ -197,9 +219,13 @@ func TableApproxEqual(left, right arrow.Table, opts ...EqualOption) bool { } // Equal reports whether the two provided arrays are equal. -func Equal(left, right arrow.Array) bool { +func Equal(left, right arrow.Array, opts ...EqualOption) bool { + return equal(left, right, newEqualOption(opts...)) +} + +func equal(left, right arrow.Array, opt equalOption) bool { switch { - case !baseArrayEqual(left, right): + case !baseArrayEqual(left, right, opt): return false case left.Len() == 0: return true @@ -345,26 +371,29 @@ func Equal(left, right arrow.Array) bool { return arrayDenseUnionEqual(l, r) case *RunEndEncoded: r := right.(*RunEndEncoded) - return arrayRunEndEncodedEqual(l, r) + return arrayRunEndEncodedEqual(l, r, opt) default: panic(fmt.Errorf("arrow/array: unknown array type %T", l)) } } // SliceEqual reports whether slices left[lbeg:lend] and right[rbeg:rend] are equal. -func SliceEqual(left arrow.Array, lbeg, lend int64, right arrow.Array, rbeg, rend int64) bool { +func SliceEqual(left arrow.Array, lbeg, lend int64, right arrow.Array, rbeg, rend int64, opts ...EqualOption) bool { + return sliceEqual(left, lbeg, lend, right, rbeg, rend, newEqualOption(opts...)) +} + +func sliceEqual(left arrow.Array, lbeg, lend int64, right arrow.Array, rbeg, rend int64, opt equalOption) bool { l := NewSlice(left, lbeg, lend) defer l.Release() r := NewSlice(right, rbeg, rend) defer r.Release() - return Equal(l, r) + return equal(l, r, opt) } // SliceApproxEqual reports whether slices left[lbeg:lend] and right[rbeg:rend] are approximately equal. func SliceApproxEqual(left arrow.Array, lbeg, lend int64, right arrow.Array, rbeg, rend int64, opts ...EqualOption) bool { - opt := newEqualOption(opts...) - return sliceApproxEqual(left, lbeg, lend, right, rbeg, rend, opt) + return sliceApproxEqual(left, lbeg, lend, right, rbeg, rend, newEqualOption(opts...)) } func sliceApproxEqual(left arrow.Array, lbeg, lend int64, right arrow.Array, rbeg, rend int64, opt equalOption) bool { @@ -455,13 +484,12 @@ func WithUnorderedMapKeys(v bool) EqualOption { // ApproxEqual reports whether the two provided arrays are approximately equal. // For non-floating point arrays, it is equivalent to Equal. func ApproxEqual(left, right arrow.Array, opts ...EqualOption) bool { - opt := newEqualOption(opts...) - return arrayApproxEqual(left, right, opt) + return arrayApproxEqual(left, right, newEqualOption(opts...)) } func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool { switch { - case !baseArrayEqual(left, right): + case !baseArrayEqual(left, right, opt): return false case left.Len() == 0: return true @@ -616,7 +644,7 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool { } } -func baseArrayEqual(left, right arrow.Array) bool { +func baseArrayEqual(left, right arrow.Array, opt equalOption) bool { switch { case left.Len() != right.Len(): return false From a231fa1378898514b07b95bf24679378dfe97463 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Thu, 30 Apr 2026 12:48:06 +0200 Subject: [PATCH 02/39] Add `nullable` to `equalOption` This allows for 2 things: 1. Users can now explicitly pass into `EqualOption` if they want the comparison functions to compare the nullable buffer or not. 2. Struct and record comparison can change the value of the `nullable` option depending on `innerField.Nullable`, making the comparison semantically accurate. --- arrow/array/compare.go | 64 ++++++++++++++++++++++++++++--------- arrow/array/compare_test.go | 43 +++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 15 deletions(-) diff --git a/arrow/array/compare.go b/arrow/array/compare.go index 19252364..c8e42aff 100644 --- a/arrow/array/compare.go +++ b/arrow/array/compare.go @@ -39,6 +39,14 @@ func recordEqual(left, right arrow.RecordBatch, opt equalOption) bool { } for i := range left.Columns() { + lf := left.Schema().Field(i) + rf := left.Schema().Field(i) + if !lf.Equal(rf) { + return false + } + + opt.nullable = lf.Nullable + lc := left.Column(i) rc := right.Column(i) if !equal(lc, rc, opt) { @@ -63,6 +71,14 @@ func recordApproxEqual(left, right arrow.RecordBatch, opt equalOption) bool { } for i := range left.Columns() { + lf := left.Schema().Field(i) + rf := left.Schema().Field(i) + if !lf.Equal(rf) { + return false + } + + opt.nullable = lf.Nullable + lc := left.Column(i) rc := right.Column(i) if !arrayApproxEqual(lc, rc, opt) { @@ -122,7 +138,7 @@ func chunkedEqual(left, right *arrow.Chunked, opt equalOption) bool { return true case left.Len() != right.Len(): return false - case left.NullN() != right.NullN(): + case opt.nullable && left.NullN() != right.NullN(): return false case !arrow.TypeEqual(left.DataType(), right.DataType()): return false @@ -149,7 +165,7 @@ func chunkedApproxEqual(left, right *arrow.Chunked, opt equalOption) bool { return true case left.Len() != right.Len(): return false - case left.NullN() != right.NullN(): + case opt.nullable && left.NullN() != right.NullN(): return false case !arrow.TypeEqual(left.DataType(), right.DataType()): return false @@ -178,12 +194,16 @@ func tableEqual(left, right arrow.Table, opt equalOption) bool { } for i := 0; int64(i) < left.NumCols(); i++ { - lc := left.Column(i) - rc := right.Column(i) - if !lc.Field().Equal(rc.Field()) { + lf := left.Schema().Field(i) + rf := left.Schema().Field(i) + if !lf.Equal(rf) { return false } + opt.nullable = lf.Nullable + + lc := left.Column(i) + rc := right.Column(i) if !chunkedEqual(lc.Data(), rc.Data(), opt) { return false } @@ -205,12 +225,16 @@ func tableApproxEqual(left, right arrow.Table, opt equalOption) bool { } for i := 0; int64(i) < left.NumCols(); i++ { - lc := left.Column(i) - rc := right.Column(i) - if !lc.Field().Equal(rc.Field()) { + lf := left.Schema().Field(i) + rf := left.Schema().Field(i) + if !lf.Equal(rf) { return false } + opt.nullable = lf.Nullable + + lc := left.Column(i) + rc := right.Column(i) if !chunkedApproxEqual(lc.Data(), rc.Data(), opt) { return false } @@ -229,7 +253,7 @@ func equal(left, right arrow.Array, opt equalOption) bool { return false case left.Len() == 0: return true - case left.NullN() == left.Len(): + case opt.nullable && left.NullN() == left.Len(): return true } @@ -411,6 +435,7 @@ type equalOption struct { atol float64 // absolute tolerance nansEq bool // whether NaNs are considered equal. unorderedMapKeys bool // whether maps are allowed to have different entries order + nullable bool // whether the fields being compared are considered nullable } func (eq equalOption) f16(f1, f2 float16.Num) bool { @@ -446,8 +471,9 @@ func (eq equalOption) f64(v1, v2 float64) bool { func newEqualOption(opts ...EqualOption) equalOption { eq := equalOption{ - atol: defaultAbsoluteTolerance, - nansEq: false, + atol: defaultAbsoluteTolerance, + nansEq: false, + nullable: true, } for _, opt := range opts { opt(&eq) @@ -481,6 +507,14 @@ func WithUnorderedMapKeys(v bool) EqualOption { } } +// WithNullable sets whether the comparison function will consider both fields as nullable. If they're non-nullable, their +// valids buffer will be ignored for comparison and the underlying values will be used instead +func WithNullable(v bool) EqualOption { + return func(o *equalOption) { + o.nullable = v + } +} + // ApproxEqual reports whether the two provided arrays are approximately equal. // For non-floating point arrays, it is equivalent to Equal. func ApproxEqual(left, right arrow.Array, opts ...EqualOption) bool { @@ -493,7 +527,7 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool { return false case left.Len() == 0: return true - case left.NullN() == left.Len(): + case opt.nullable && left.NullN() == left.Len(): return true } @@ -648,11 +682,11 @@ func baseArrayEqual(left, right arrow.Array, opt equalOption) bool { switch { case left.Len() != right.Len(): return false - case left.NullN() != right.NullN(): + case opt.nullable && left.NullN() != right.NullN(): return false case !arrow.TypeEqual(left.DataType(), right.DataType()): // We do not check for metadata as in the C++ implementation. return false - case !validityBitmapEqual(left, right): + case opt.nullable && !validityBitmapEqual(left, right): return false } return true @@ -882,7 +916,7 @@ func arrayApproxEqualSingleMapEntry(left, right *Struct, opt equalOption) bool { switch { case left.Len() != right.Len(): return false - case left.NullN() != right.NullN(): + case opt.nullable && left.NullN() != right.NullN(): return false case !arrow.TypeEqual(left.DataType(), right.DataType()): // We do not check for metadata as in the C++ implementation. return false diff --git a/arrow/array/compare_test.go b/arrow/array/compare_test.go index 0671def6..9de3445f 100644 --- a/arrow/array/compare_test.go +++ b/arrow/array/compare_test.go @@ -391,6 +391,49 @@ func TestArrayApproxEqualFloats(t *testing.T) { } } +func TestArrayEqualNonNullable(t *testing.T) { + for name, recs := range arrdata.Records { + t.Run(name, func(t *testing.T) { + rec := recs[0] + + // Clone the schema and make everything non-nullable + fields := rec.Schema().Fields() + meta := rec.Schema().Metadata() + for i := range fields { + fields[i].Nullable = false + } + schema := arrow.NewSchema(fields, &meta) + + for i, rawCol := range rec.Columns() { + // make a clone of the column with NullN=0 + col := array.MakeFromData(array.NewData( + rawCol.DataType(), + rawCol.Len(), + rawCol.Data().Buffers(), + rawCol.Data().Children(), + 0, + 0, + )) + t.Run(schema.Field(i).Name, func(t *testing.T) { + arr := col + if !array.Equal(arr, arr, array.WithNullable(false)) { + t.Fatalf("identical arrays should compare equal:\narray=%v", arr) + } + sub1 := array.NewSlice(arr, 1, int64(arr.Len())) + defer sub1.Release() + + sub2 := array.NewSlice(arr, 0, int64(arr.Len()-1)) + defer sub2.Release() + + if array.Equal(sub1, sub2) && name != "nulls" { + t.Fatalf("non-identical arrays should not compare equal:\nsub1=%v\nsub2=%v\narrf=%v\n", sub1, sub2, arr) + } + }) + } + }) + } +} + func testStringMap(mem memory.Allocator, m map[string]string, keys []string) *array.Map { dt := arrow.MapOf(arrow.BinaryTypes.String, arrow.BinaryTypes.String) builder := array.NewMapBuilderWithType(mem, dt) From e8238e1693790dbf2928579586d40ca3f903138c Mon Sep 17 00:00:00 2001 From: serramatutu Date: Thu, 30 Apr 2026 13:16:33 +0200 Subject: [PATCH 03/39] Check for nullable opt in most array compare implementations --- arrow/array/binary.go | 12 +- arrow/array/boolean.go | 4 +- arrow/array/compare.go | 188 ++++++++++++++++---------------- arrow/array/decimal.go | 4 +- arrow/array/dictionary.go | 4 +- arrow/array/encoded.go | 4 +- arrow/array/extension.go | 4 +- arrow/array/fixed_size_list.go | 6 +- arrow/array/fixedsize_binary.go | 4 +- arrow/array/interval.go | 12 +- arrow/array/list.go | 24 ++-- arrow/array/map.go | 4 +- arrow/array/numeric_generic.go | 4 +- arrow/array/string.go | 12 +- arrow/array/struct.go | 5 +- arrow/array/timestamp.go | 4 +- 16 files changed, 150 insertions(+), 145 deletions(-) diff --git a/arrow/array/binary.go b/arrow/array/binary.go index a8e77ae9..4ddfdbf6 100644 --- a/arrow/array/binary.go +++ b/arrow/array/binary.go @@ -223,9 +223,9 @@ func (a *Binary) ValidateFull() error { return nil } -func arrayEqualBinary(left, right *Binary) bool { +func arrayEqualBinary(left, right *Binary, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if !bytes.Equal(left.Value(i), right.Value(i)) { @@ -417,9 +417,9 @@ func (a *LargeBinary) ValidateFull() error { return nil } -func arrayEqualLargeBinary(left, right *LargeBinary) bool { +func arrayEqualLargeBinary(left, right *LargeBinary, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if !bytes.Equal(left.Value(i), right.Value(i)) { @@ -539,10 +539,10 @@ func (a *BinaryView) MarshalJSON() ([]byte, error) { return json.Marshal(vals) } -func arrayEqualBinaryView(left, right *BinaryView) bool { +func arrayEqualBinaryView(left, right *BinaryView, opt equalOption) bool { leftBufs, rightBufs := left.dataBuffers, right.dataBuffers for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if !left.ValueHeader(i).Equals(leftBufs, right.ValueHeader(i), rightBufs) { diff --git a/arrow/array/boolean.go b/arrow/array/boolean.go index d579fa0c..e57dd3e6 100644 --- a/arrow/array/boolean.go +++ b/arrow/array/boolean.go @@ -109,9 +109,9 @@ func (a *Boolean) MarshalJSON() ([]byte, error) { return json.Marshal(vals) } -func arrayEqualBoolean(left, right *Boolean) bool { +func arrayEqualBoolean(left, right *Boolean, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if left.Value(i) != right.Value(i) { diff --git a/arrow/array/compare.go b/arrow/array/compare.go index c8e42aff..ec56cb6d 100644 --- a/arrow/array/compare.go +++ b/arrow/array/compare.go @@ -266,127 +266,127 @@ func equal(left, right arrow.Array, opt equalOption) bool { return true case *Boolean: r := right.(*Boolean) - return arrayEqualBoolean(l, r) + return arrayEqualBoolean(l, r, opt) case *FixedSizeBinary: r := right.(*FixedSizeBinary) - return arrayEqualFixedSizeBinary(l, r) + return arrayEqualFixedSizeBinary(l, r, opt) case *Binary: r := right.(*Binary) - return arrayEqualBinary(l, r) + return arrayEqualBinary(l, r, opt) case *String: r := right.(*String) - return arrayEqualString(l, r) + return arrayEqualString(l, r, opt) case *LargeBinary: r := right.(*LargeBinary) - return arrayEqualLargeBinary(l, r) + return arrayEqualLargeBinary(l, r, opt) case *LargeString: r := right.(*LargeString) - return arrayEqualLargeString(l, r) + return arrayEqualLargeString(l, r, opt) case *BinaryView: r := right.(*BinaryView) - return arrayEqualBinaryView(l, r) + return arrayEqualBinaryView(l, r, opt) case *StringView: r := right.(*StringView) - return arrayEqualStringView(l, r) + return arrayEqualStringView(l, r, opt) case *Int8: r := right.(*Int8) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Int16: r := right.(*Int16) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Int32: r := right.(*Int32) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Int64: r := right.(*Int64) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Uint8: r := right.(*Uint8) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Uint16: r := right.(*Uint16) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Uint32: r := right.(*Uint32) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Uint64: r := right.(*Uint64) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Float16: r := right.(*Float16) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Float32: r := right.(*Float32) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Float64: r := right.(*Float64) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Decimal32: r := right.(*Decimal32) - return arrayEqualDecimal(l, r) + return arrayEqualDecimal(l, r, opt) case *Decimal64: r := right.(*Decimal64) - return arrayEqualDecimal(l, r) + return arrayEqualDecimal(l, r, opt) case *Decimal128: r := right.(*Decimal128) - return arrayEqualDecimal(l, r) + return arrayEqualDecimal(l, r, opt) case *Decimal256: r := right.(*Decimal256) - return arrayEqualDecimal(l, r) + return arrayEqualDecimal(l, r, opt) case *Date32: r := right.(*Date32) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Date64: r := right.(*Date64) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Time32: r := right.(*Time32) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Time64: r := right.(*Time64) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Timestamp: r := right.(*Timestamp) - return arrayEqualTimestamp(l, r) + return arrayEqualTimestamp(l, r, opt) case *List: r := right.(*List) - return arrayEqualList(l, r) + return arrayEqualList(l, r, opt) case *LargeList: r := right.(*LargeList) - return arrayEqualLargeList(l, r) + return arrayEqualLargeList(l, r, opt) case *ListView: r := right.(*ListView) - return arrayEqualListView(l, r) + return arrayEqualListView(l, r, opt) case *LargeListView: r := right.(*LargeListView) - return arrayEqualLargeListView(l, r) + return arrayEqualLargeListView(l, r, opt) case *FixedSizeList: r := right.(*FixedSizeList) - return arrayEqualFixedSizeList(l, r) + return arrayEqualFixedSizeList(l, r, opt) case *Struct: r := right.(*Struct) - return arrayEqualStruct(l, r) + return arrayEqualStruct(l, r, opt) case *MonthInterval: r := right.(*MonthInterval) - return arrayEqualMonthInterval(l, r) + return arrayEqualMonthInterval(l, r, opt) case *DayTimeInterval: r := right.(*DayTimeInterval) - return arrayEqualDayTimeInterval(l, r) + return arrayEqualDayTimeInterval(l, r, opt) case *MonthDayNanoInterval: r := right.(*MonthDayNanoInterval) - return arrayEqualMonthDayNanoInterval(l, r) + return arrayEqualMonthDayNanoInterval(l, r, opt) case *Duration: r := right.(*Duration) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Map: r := right.(*Map) - return arrayEqualMap(l, r) + return arrayEqualMap(l, r, opt) case ExtensionArray: r := right.(ExtensionArray) - return arrayEqualExtension(l, r) + return arrayEqualExtension(l, r, opt) case *Dictionary: r := right.(*Dictionary) - return arrayEqualDict(l, r) + return arrayEqualDict(l, r, opt) case *SparseUnion: r := right.(*SparseUnion) return arraySparseUnionEqual(l, r) @@ -540,52 +540,52 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool { return true case *Boolean: r := right.(*Boolean) - return arrayEqualBoolean(l, r) + return arrayEqualBoolean(l, r, opt) case *FixedSizeBinary: r := right.(*FixedSizeBinary) - return arrayEqualFixedSizeBinary(l, r) + return arrayEqualFixedSizeBinary(l, r, opt) case *Binary: r := right.(*Binary) - return arrayEqualBinary(l, r) + return arrayEqualBinary(l, r, opt) case *String: r := right.(*String) - return arrayApproxEqualString(l, r) + return arrayApproxEqualString(l, r, opt) case *LargeBinary: r := right.(*LargeBinary) - return arrayEqualLargeBinary(l, r) + return arrayEqualLargeBinary(l, r, opt) case *LargeString: r := right.(*LargeString) - return arrayApproxEqualLargeString(l, r) + return arrayApproxEqualLargeString(l, r, opt) case *BinaryView: r := right.(*BinaryView) - return arrayEqualBinaryView(l, r) + return arrayEqualBinaryView(l, r, opt) case *StringView: r := right.(*StringView) - return arrayApproxEqualStringView(l, r) + return arrayApproxEqualStringView(l, r, opt) case *Int8: r := right.(*Int8) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Int16: r := right.(*Int16) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Int32: r := right.(*Int32) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Int64: r := right.(*Int64) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Uint8: r := right.(*Uint8) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Uint16: r := right.(*Uint16) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Uint32: r := right.(*Uint32) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Uint64: r := right.(*Uint64) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Float16: r := right.(*Float16) return arrayApproxEqualFloat16(l, r, opt) @@ -597,31 +597,31 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool { return arrayApproxEqualFloat64(l, r, opt) case *Decimal32: r := right.(*Decimal32) - return arrayEqualDecimal(l, r) + return arrayEqualDecimal(l, r, opt) case *Decimal64: r := right.(*Decimal64) - return arrayEqualDecimal(l, r) + return arrayEqualDecimal(l, r, opt) case *Decimal128: r := right.(*Decimal128) - return arrayEqualDecimal(l, r) + return arrayEqualDecimal(l, r, opt) case *Decimal256: r := right.(*Decimal256) - return arrayEqualDecimal(l, r) + return arrayEqualDecimal(l, r, opt) case *Date32: r := right.(*Date32) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Date64: r := right.(*Date64) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Time32: r := right.(*Time32) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Time64: r := right.(*Time64) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Timestamp: r := right.(*Timestamp) - return arrayEqualTimestamp(l, r) + return arrayEqualTimestamp(l, r, opt) case *List: r := right.(*List) return arrayApproxEqualList(l, r, opt) @@ -642,16 +642,16 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool { return arrayApproxEqualStruct(l, r, opt) case *MonthInterval: r := right.(*MonthInterval) - return arrayEqualMonthInterval(l, r) + return arrayEqualMonthInterval(l, r, opt) case *DayTimeInterval: r := right.(*DayTimeInterval) - return arrayEqualDayTimeInterval(l, r) + return arrayEqualDayTimeInterval(l, r, opt) case *MonthDayNanoInterval: r := right.(*MonthDayNanoInterval) - return arrayEqualMonthDayNanoInterval(l, r) + return arrayEqualMonthDayNanoInterval(l, r, opt) case *Duration: r := right.(*Duration) - return arrayEqualFixedWidth(l, r) + return arrayEqualFixedWidth(l, r, opt) case *Map: r := right.(*Map) if opt.unorderedMapKeys { @@ -706,9 +706,9 @@ func validityBitmapEqual(left, right arrow.Array) bool { return true } -func arrayApproxEqualString(left, right *String) bool { +func arrayApproxEqualString(left, right *String, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if stripNulls(left.Value(i)) != stripNulls(right.Value(i)) { @@ -718,9 +718,9 @@ func arrayApproxEqualString(left, right *String) bool { return true } -func arrayApproxEqualLargeString(left, right *LargeString) bool { +func arrayApproxEqualLargeString(left, right *LargeString, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if stripNulls(left.Value(i)) != stripNulls(right.Value(i)) { @@ -730,9 +730,9 @@ func arrayApproxEqualLargeString(left, right *LargeString) bool { return true } -func arrayApproxEqualStringView(left, right *StringView) bool { +func arrayApproxEqualStringView(left, right *StringView, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if stripNulls(left.Value(i)) != stripNulls(right.Value(i)) { @@ -744,7 +744,7 @@ func arrayApproxEqualStringView(left, right *StringView) bool { func arrayApproxEqualFloat16(left, right *Float16, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if !opt.f16(left.Value(i), right.Value(i)) { @@ -756,7 +756,7 @@ func arrayApproxEqualFloat16(left, right *Float16, opt equalOption) bool { func arrayApproxEqualFloat32(left, right *Float32, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if !opt.f32(left.Value(i), right.Value(i)) { @@ -768,7 +768,7 @@ func arrayApproxEqualFloat32(left, right *Float32, opt equalOption) bool { func arrayApproxEqualFloat64(left, right *Float64, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if !opt.f64(left.Value(i), right.Value(i)) { @@ -780,7 +780,7 @@ func arrayApproxEqualFloat64(left, right *Float64, opt equalOption) bool { func arrayApproxEqualList(left, right *List, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } o := func() bool { @@ -799,7 +799,7 @@ func arrayApproxEqualList(left, right *List, opt equalOption) bool { func arrayApproxEqualLargeList(left, right *LargeList, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } o := func() bool { @@ -818,7 +818,7 @@ func arrayApproxEqualLargeList(left, right *LargeList, opt equalOption) bool { func arrayApproxEqualListView(left, right *ListView, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } o := func() bool { @@ -837,7 +837,7 @@ func arrayApproxEqualListView(left, right *ListView, opt equalOption) bool { func arrayApproxEqualLargeListView(left, right *LargeListView, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } o := func() bool { @@ -856,7 +856,7 @@ func arrayApproxEqualLargeListView(left, right *LargeListView, opt equalOption) func arrayApproxEqualFixedSizeList(left, right *FixedSizeList, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } o := func() bool { @@ -874,11 +874,15 @@ func arrayApproxEqualFixedSizeList(left, right *FixedSizeList, opt equalOption) } func arrayApproxEqualStruct(left, right *Struct, opt equalOption) bool { - return bitutils.VisitSetBitRuns( - left.NullBitmapBytes(), - int64(left.Offset()), int64(left.Len()), - approxEqualStructRun(left, right, opt), - ) == nil + visitFn := approxEqualStructRun(left, right, opt) + if opt.nullable { + return bitutils.VisitSetBitRuns( + left.NullBitmapBytes(), + int64(left.Offset()), int64(left.Len()), + visitFn, + ) == nil + } + return visitFn(0, int64(left.Len())) == nil } func approxEqualStructRun(left, right *Struct, opt equalOption) bitutils.VisitFn { @@ -895,7 +899,7 @@ func approxEqualStructRun(left, right *Struct, opt equalOption) bitutils.VisitFn // arrayApproxEqualMap doesn't care about the order of keys (in Go map traversal order is undefined) func arrayApproxEqualMap(left, right *Map, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if !arrayApproxEqualSingleMapEntry(left.newListValue(i).(*Struct), right.newListValue(i).(*Struct), opt) { @@ -926,7 +930,7 @@ func arrayApproxEqualSingleMapEntry(left, right *Struct, opt equalOption) bool { used := make(map[int]bool, right.Len()) for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } @@ -936,7 +940,7 @@ func arrayApproxEqualSingleMapEntry(left, right *Struct, opt equalOption) bool { if used[j] { continue } - if right.IsNull(j) { + if opt.nullable && right.IsNull(j) { used[j] = true continue } diff --git a/arrow/array/decimal.go b/arrow/array/decimal.go index 704b1d93..b4e24f69 100644 --- a/arrow/array/decimal.go +++ b/arrow/array/decimal.go @@ -110,9 +110,9 @@ func (a *baseDecimal[T]) MarshalJSON() ([]byte, error) { func arrayEqualDecimal[T interface { decimal.DecimalTypes decimal.Num[T] -}](left, right *baseDecimal[T]) bool { +}](left, right *baseDecimal[T], opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } diff --git a/arrow/array/dictionary.go b/arrow/array/dictionary.go index 38a43f77..9645b788 100644 --- a/arrow/array/dictionary.go +++ b/arrow/array/dictionary.go @@ -303,8 +303,8 @@ func (d *Dictionary) MarshalJSON() ([]byte, error) { return json.Marshal(vals) } -func arrayEqualDict(l, r *Dictionary) bool { - return Equal(l.Dictionary(), r.Dictionary()) && Equal(l.indices, r.indices) +func arrayEqualDict(l, r *Dictionary, opt equalOption) bool { + return equal(l.Dictionary(), r.Dictionary(), opt) && equal(l.indices, r.indices, opt) } func arrayApproxEqualDict(l, r *Dictionary, opt equalOption) bool { diff --git a/arrow/array/encoded.go b/arrow/array/encoded.go index 8d628ffc..4cfc346f 100644 --- a/arrow/array/encoded.go +++ b/arrow/array/encoded.go @@ -260,14 +260,14 @@ func (r *RunEndEncoded) MarshalJSON() ([]byte, error) { return buf.Bytes(), nil } -func arrayRunEndEncodedEqual(l, r *RunEndEncoded) bool { +func arrayRunEndEncodedEqual(l, r *RunEndEncoded, opt equalOption) bool { // types were already checked before getting here, so we know // the encoded types are equal mr := encoded.NewMergedRuns([2]arrow.Array{l, r}) for mr.Next() { lIndex := mr.IndexIntoArray(0) rIndex := mr.IndexIntoArray(1) - if !SliceEqual(l.values, lIndex, lIndex+1, r.values, rIndex, rIndex+1) { + if !sliceEqual(l.values, lIndex, lIndex+1, r.values, rIndex, rIndex+1, opt) { return false } } diff --git a/arrow/array/extension.go b/arrow/array/extension.go index e509b5e0..21ed3b24 100644 --- a/arrow/array/extension.go +++ b/arrow/array/extension.go @@ -46,12 +46,12 @@ type ExtensionArray interface { // two extension arrays are equal if their data types are equal and // their underlying storage arrays are equal. -func arrayEqualExtension(l, r ExtensionArray) bool { +func arrayEqualExtension(l, r ExtensionArray, opt equalOption) bool { if !arrow.TypeEqual(l.DataType(), r.DataType()) { return false } - return Equal(l.Storage(), r.Storage()) + return equal(l.Storage(), r.Storage(), opt) } // two extension arrays are approximately equal if their data types are diff --git a/arrow/array/fixed_size_list.go b/arrow/array/fixed_size_list.go index d382ebe9..4b3ff286 100644 --- a/arrow/array/fixed_size_list.go +++ b/arrow/array/fixed_size_list.go @@ -84,9 +84,9 @@ func (a *FixedSizeList) setData(data *Data) { a.values = MakeFromData(data.childData[0]) } -func arrayEqualFixedSizeList(left, right *FixedSizeList) bool { +func arrayEqualFixedSizeList(left, right *FixedSizeList, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } o := func() bool { @@ -94,7 +94,7 @@ func arrayEqualFixedSizeList(left, right *FixedSizeList) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return Equal(l, r) + return equal(l, r, opt) }() if !o { return false diff --git a/arrow/array/fixedsize_binary.go b/arrow/array/fixedsize_binary.go index 31d507c5..44ca4488 100644 --- a/arrow/array/fixedsize_binary.go +++ b/arrow/array/fixedsize_binary.go @@ -106,9 +106,9 @@ func (a *FixedSizeBinary) MarshalJSON() ([]byte, error) { return json.Marshal(vals) } -func arrayEqualFixedSizeBinary(left, right *FixedSizeBinary) bool { +func arrayEqualFixedSizeBinary(left, right *FixedSizeBinary, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if !bytes.Equal(left.Value(i), right.Value(i)) { diff --git a/arrow/array/interval.go b/arrow/array/interval.go index 2c029c25..128cdfad 100644 --- a/arrow/array/interval.go +++ b/arrow/array/interval.go @@ -120,9 +120,9 @@ func (a *MonthInterval) MarshalJSON() ([]byte, error) { return json.Marshal(vals) } -func arrayEqualMonthInterval(left, right *MonthInterval) bool { +func arrayEqualMonthInterval(left, right *MonthInterval, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if left.Value(i) != right.Value(i) { @@ -423,9 +423,9 @@ func (a *DayTimeInterval) MarshalJSON() ([]byte, error) { return json.Marshal(vals) } -func arrayEqualDayTimeInterval(left, right *DayTimeInterval) bool { +func arrayEqualDayTimeInterval(left, right *DayTimeInterval, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if left.Value(i) != right.Value(i) { @@ -727,9 +727,9 @@ func (a *MonthDayNanoInterval) MarshalJSON() ([]byte, error) { return json.Marshal(vals) } -func arrayEqualMonthDayNanoInterval(left, right *MonthDayNanoInterval) bool { +func arrayEqualMonthDayNanoInterval(left, right *MonthDayNanoInterval, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if left.Value(i) != right.Value(i) { diff --git a/arrow/array/list.go b/arrow/array/list.go index d72887bb..df45223f 100644 --- a/arrow/array/list.go +++ b/arrow/array/list.go @@ -129,9 +129,9 @@ func (a *List) MarshalJSON() ([]byte, error) { return buf.Bytes(), nil } -func arrayEqualList(left, right *List) bool { +func arrayEqualList(left, right *List, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } o := func() bool { @@ -139,7 +139,7 @@ func arrayEqualList(left, right *List) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return Equal(l, r) + return equal(l, r, opt) }() if !o { return false @@ -261,9 +261,9 @@ func (a *LargeList) MarshalJSON() ([]byte, error) { return buf.Bytes(), nil } -func arrayEqualLargeList(left, right *LargeList) bool { +func arrayEqualLargeList(left, right *LargeList, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } o := func() bool { @@ -271,7 +271,7 @@ func arrayEqualLargeList(left, right *LargeList) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return Equal(l, r) + return equal(l, r, opt) }() if !o { return false @@ -736,9 +736,9 @@ func (a *ListView) MarshalJSON() ([]byte, error) { return buf.Bytes(), nil } -func arrayEqualListView(left, right *ListView) bool { +func arrayEqualListView(left, right *ListView, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } o := func() bool { @@ -746,7 +746,7 @@ func arrayEqualListView(left, right *ListView) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return Equal(l, r) + return equal(l, r, opt) }() if !o { return false @@ -883,9 +883,9 @@ func (a *LargeListView) MarshalJSON() ([]byte, error) { return buf.Bytes(), nil } -func arrayEqualLargeListView(left, right *LargeListView) bool { +func arrayEqualLargeListView(left, right *LargeListView, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } o := func() bool { @@ -893,7 +893,7 @@ func arrayEqualLargeListView(left, right *LargeListView) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return Equal(l, r) + return equal(l, r, opt) }() if !o { return false diff --git a/arrow/array/map.go b/arrow/array/map.go index 71d4d138..af6c2818 100644 --- a/arrow/array/map.go +++ b/arrow/array/map.go @@ -106,9 +106,9 @@ func (a *Map) Release() { a.items.Release() } -func arrayEqualMap(left, right *Map) bool { +func arrayEqualMap(left, right *Map, opt equalOption) bool { // since Map is implemented using a list, we can just use arrayEqualList - return arrayEqualList(left.List, right.List) + return arrayEqualList(left.List, right.List, opt) } type MapBuilder struct { diff --git a/arrow/array/numeric_generic.go b/arrow/array/numeric_generic.go index 1b671fc7..49a369cd 100644 --- a/arrow/array/numeric_generic.go +++ b/arrow/array/numeric_generic.go @@ -436,9 +436,9 @@ func NewDate64Data(data arrow.ArrayData) *Date64 { func (a *Date64) Date64Values() []arrow.Date64 { return a.Values() } -func arrayEqualFixedWidth[T arrow.FixedWidthType](left, right arrow.TypedArray[T]) bool { +func arrayEqualFixedWidth[T arrow.FixedWidthType](left, right arrow.TypedArray[T], opt equalOption) bool { for i := range left.Len() { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if left.Value(i) != right.Value(i) { diff --git a/arrow/array/string.go b/arrow/array/string.go index 00748576..accc6e95 100644 --- a/arrow/array/string.go +++ b/arrow/array/string.go @@ -231,9 +231,9 @@ func (a *String) ValidateFull() error { return nil } -func arrayEqualString(left, right *String) bool { +func arrayEqualString(left, right *String, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if left.Value(i) != right.Value(i) { @@ -435,9 +435,9 @@ func (a *LargeString) ValidateFull() error { return nil } -func arrayEqualLargeString(left, right *LargeString) bool { +func arrayEqualLargeString(left, right *LargeString, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if left.Value(i) != right.Value(i) { @@ -541,10 +541,10 @@ func (a *StringView) MarshalJSON() ([]byte, error) { return json.Marshal(vals) } -func arrayEqualStringView(left, right *StringView) bool { +func arrayEqualStringView(left, right *StringView, opt equalOption) bool { leftBufs, rightBufs := left.dataBuffers, right.dataBuffers for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if !left.ValueHeader(i).Equals(leftBufs, right.ValueHeader(i), rightBufs) { diff --git a/arrow/array/struct.go b/arrow/array/struct.go index 07505de1..203dc1f7 100644 --- a/arrow/array/struct.go +++ b/arrow/array/struct.go @@ -239,10 +239,11 @@ func (a *Struct) MarshalJSON() ([]byte, error) { return buf.Bytes(), nil } -func arrayEqualStruct(left, right *Struct) bool { +func arrayEqualStruct(left, right *Struct, opt equalOption) bool { for i, lf := range left.fields { rf := right.fields[i] - if !Equal(lf, rf) { + opt.nullable = left.data.dtype.(*arrow.StructType).Field(i).Nullable + if !equal(lf, rf, opt) { return false } } diff --git a/arrow/array/timestamp.go b/arrow/array/timestamp.go index 5ac0ee6c..55e9b535 100644 --- a/arrow/array/timestamp.go +++ b/arrow/array/timestamp.go @@ -125,9 +125,9 @@ func (a *Timestamp) MarshalJSON() ([]byte, error) { return json.Marshal(vals) } -func arrayEqualTimestamp(left, right *Timestamp) bool { +func arrayEqualTimestamp(left, right *Timestamp, opt equalOption) bool { for i := 0; i < left.Len(); i++ { - if left.IsNull(i) { + if opt.nullable && left.IsNull(i) { continue } if left.Value(i) != right.Value(i) { From 48fde398be371f5184736f87a3e9dbafce23f830 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Tue, 28 Apr 2026 15:35:46 +0200 Subject: [PATCH 04/39] Add stricter tests to null JSON in record and struct --- arrow/array/record_test.go | 47 ++++++++++++++++++++++------ arrow/array/struct_test.go | 6 ++++ arrow/array/util_test.go | 63 +++++++++++++++++++++++++------------- 3 files changed, 86 insertions(+), 30 deletions(-) diff --git a/arrow/array/record_test.go b/arrow/array/record_test.go index a3924382..eabb84f3 100644 --- a/arrow/array/record_test.go +++ b/arrow/array/record_test.go @@ -17,6 +17,7 @@ package array_test import ( + "bytes" "fmt" "reflect" "strings" @@ -25,6 +26,7 @@ import ( "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/array" "github.com/apache/arrow-go/v18/arrow/memory" + "github.com/apache/arrow-go/v18/internal/json" "github.com/stretchr/testify/assert" ) @@ -485,9 +487,9 @@ func TestRecordBuilder(t *testing.T) { mapDt.SetItemNullable(false) schema := arrow.NewSchema( []arrow.Field{ - {Name: "f1-i32", Type: arrow.PrimitiveTypes.Int32}, - {Name: "f2-f64", Type: arrow.PrimitiveTypes.Float64}, - {Name: "map", Type: mapDt}, + {Name: "f1-i32", Type: arrow.PrimitiveTypes.Int32, Nullable: true}, + {Name: "f2-f64-notnull", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "map", Type: mapDt, Nullable: true}, }, nil, ) @@ -498,11 +500,14 @@ func TestRecordBuilder(t *testing.T) { b.Retain() b.Release() - b.Field(0).(*array.Int32Builder).AppendValues([]int32{1, 2, 3}, nil) + b.Field(0).(*array.Int32Builder).AppendNull() + b.Field(0).(*array.Int32Builder).AppendValues([]int32{2, 3}, nil) b.Field(0).(*array.Int32Builder).AppendValues([]int32{4, 5}, nil) - b.Field(1).(*array.Float64Builder).AppendValues([]float64{1, 2, 3, 4, 5}, nil) + + b.Field(1).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2, 3.3, 4.4, 5.5}, nil) + mb := b.Field(2).(*array.MapBuilder) - for i := 0; i < 5; i++ { + for i := range 5 { mb.Append(true) if i%3 == 0 { @@ -511,6 +516,12 @@ func TestRecordBuilder(t *testing.T) { } } + err := b.UnmarshalJSON([]byte(`{"f1-i32": 6, "f2-f64-notnull": 6.6, "map": [{"key": "4": "value": "d"}]}`)) + assert.NoError(t, err) + + err = b.UnmarshalJSON([]byte(`{"f1-i32": null, "f2-f64-notnull": null, "map": null}`)) + assert.NoError(t, err) + rec := b.NewRecordBatch() defer rec.Release() @@ -518,7 +529,7 @@ func TestRecordBuilder(t *testing.T) { t.Fatalf("invalid schema: got=%#v, want=%#v", got, want) } - if got, want := rec.NumRows(), int64(5); got != want { + if got, want := rec.NumRows(), int64(7); got != want { t.Fatalf("invalid number of rows: got=%d, want=%d", got, want) } if got, want := rec.NumCols(), int64(3); got != want { @@ -527,9 +538,27 @@ func TestRecordBuilder(t *testing.T) { if got, want := rec.ColumnName(0), schema.Field(0).Name; got != want { t.Fatalf("invalid column name: got=%q, want=%q", got, want) } - if got, want := rec.Column(2).String(), `[{["0" "2" "3"] ["a" "b" "c"]} {[] []} {[] []} {["3" "2" "3"] ["a" "b" "c"]} {[] []}]`; got != want { - t.Fatalf("invalid column name: got=%q, want=%q", got, want) + + if got, want := rec.Column(0).String(), `[(null) 2 3 4 5 6 (null)]`; got != want { + t.Fatalf("invalid column values: got=%q, want=%q", got, want) + } + if got, want := rec.Column(1).String(), `[1.1 2.2 3.3 4.4 5.5 6.6 0]`; got != want { + t.Fatalf("invalid column values: got=%q, want=%q", got, want) } + if got, want := rec.Column(2).String(), `[{["0" "2" "3"] ["a" "b" "c"]} {[] []} {[] []} {["3" "2" "3"] ["a" "b" "c"]} {[] []} {["4"] ["d"]} (null)]`; got != want { + t.Fatalf("invalid column values: got=%q, want=%q", got, want) + } + + // roundtripping from JSON with array.FromJSON should work + arr := array.RecordToStructArray(rec) + defer arr.Release() + jsonStr, err := json.Marshal(arr) + assert.NoError(t, err) + + roundtripped, _, err := array.FromJSON(mem, arr.DataType(), bytes.NewReader(jsonStr)) + defer roundtripped.Release() + assert.NoError(t, err) + assert.Truef(t, array.Equal(arr, roundtripped), "JSON round trip returns different array: got=%q, want=%d", arr, roundtripped) } func TestRecordBuilderResize(t *testing.T) { diff --git a/arrow/array/struct_test.go b/arrow/array/struct_test.go index 216b353a..6027d028 100644 --- a/arrow/array/struct_test.go +++ b/arrow/array/struct_test.go @@ -513,6 +513,12 @@ func TestStructArrayUnmarshalJSONMissingFields(t *testing.T) { panic: false, want: `{[(null)] [3] {[(null)] [(null)] ["test"]}}`, }, + { + name: "explicit null in required field", + jsonInput: `[{"f2": 3, "f3": {"f3_3": null}}]`, + panic: false, + want: `{[(null)] [3] {[(null)] [(null)] [""]}}`, + }, } for _, tc := range tests { diff --git a/arrow/array/util_test.go b/arrow/array/util_test.go index eb3de6a8..84103d45 100644 --- a/arrow/array/util_test.go +++ b/arrow/array/util_test.go @@ -452,29 +452,50 @@ func TestArrRecordsJSONRoundTrip(t *testing.T) { continue } t.Run(k, func(t *testing.T) { - var buf bytes.Buffer - assert.NotPanics(t, func() { - enc := json.NewEncoder(&buf) - for _, r := range v { - if err := enc.Encode(r); err != nil { - panic(err) - } + for _, nullable := range []bool{true, false} { + var name string + if nullable { + name = "nullable" + } else { + name = "non-nullable" } - }) - - rdr := bytes.NewReader(buf.Bytes()) - var cur int64 - - mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) - defer mem.AssertSize(t, 0) - - for _, r := range v { - rec, off, err := array.RecordFromJSON(mem, r.Schema(), rdr, array.WithStartOffset(cur)) - assert.NoError(t, err) - defer rec.Release() - assert.Truef(t, array.RecordApproxEqual(r, rec), "expected: %s\ngot: %s\n", r, rec) - cur += off + t.Run(name, func(t *testing.T) { + fields := v[0].Schema().Fields() + for i := range fields { + fields[i].Nullable = nullable + } + meta := v[0].Schema().Metadata() + schema := arrow.NewSchema(fields, &meta) + + var buf bytes.Buffer + assert.NotPanics(t, func() { + enc := json.NewEncoder(&buf) + for _, rawBatch := range v { + batch := array.NewRecordBatch(schema, rawBatch.Columns(), rawBatch.NumRows()) + if err := enc.Encode(batch); err != nil { + panic(err) + } + } + }) + + rdr := bytes.NewReader(buf.Bytes()) + var cur int64 + + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + for _, rawBatch := range v { + batch := array.NewRecordBatch(schema, rawBatch.Columns(), rawBatch.NumRows()) + + rec, off, err := array.RecordFromJSON(mem, schema, rdr, array.WithStartOffset(cur)) + assert.NoError(t, err) + defer rec.Release() + + assert.Truef(t, array.RecordApproxEqual(batch, rec), "expected: %s\ngot: %s\n", batch, rec) + cur += off + } + }) } }) } From 1a2c73bdeaa70b5e97b0d293528ef6da4a7b4b15 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Tue, 28 Apr 2026 15:36:34 +0200 Subject: [PATCH 05/39] `AppendEmptyValue()` if field is nullable Modified `StructBuilder` and `RecordBuilder` to append the empty default value (usually zero) to the inner field if it's marked as nullable and the consumed value is null. --- arrow/array/record.go | 13 ++++++++++++- arrow/array/struct.go | 12 +++++++++++- internal/json/json.go | 5 +++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/arrow/array/record.go b/arrow/array/record.go index b7f84180..2f8948d4 100644 --- a/arrow/array/record.go +++ b/arrow/array/record.go @@ -455,10 +455,21 @@ func (b *RecordBuilder) UnmarshalOne(dec *json.Decoder) error { } continue } + idx := indices[0] - if err := b.fields[indices[0]].UnmarshalOne(dec); err != nil { + var next json.RawMessage + if err := dec.Decode(&next); err != nil { return err } + + if json.IsNullMessage(next) && !b.schema.Field(idx).Nullable { + b.fields[idx].AppendEmptyValue() + } else { + sub := json.NewDecoder(bytes.NewReader(next)) + if err := b.fields[idx].UnmarshalOne(sub); err != nil { + return err + } + } } // consume the closing '}' diff --git a/arrow/array/struct.go b/arrow/array/struct.go index 203dc1f7..76611dca 100644 --- a/arrow/array/struct.go +++ b/arrow/array/struct.go @@ -488,9 +488,19 @@ func (b *StructBuilder) UnmarshalOne(dec *json.Decoder) error { continue } - if err := b.fields[idx].UnmarshalOne(dec); err != nil { + var next json.RawMessage + if err := dec.Decode(&next); err != nil { return err } + + if json.IsNullMessage(next) && !b.dtype.(*arrow.StructType).Field(idx).Nullable { + b.fields[idx].AppendEmptyValue() + } else { + sub := json.NewDecoder(bytes.NewReader(next)) + if err := b.fields[idx].UnmarshalOne(sub); err != nil { + return err + } + } } // Append null values to all optional fields that were not presented in the json input diff --git a/internal/json/json.go b/internal/json/json.go index b4c4c9f6..7fdd0d86 100644 --- a/internal/json/json.go +++ b/internal/json/json.go @@ -20,6 +20,7 @@ package json import ( + "bytes" "io" "github.com/goccy/go-json" @@ -49,3 +50,7 @@ func NewDecoder(r io.Reader) *Decoder { func NewEncoder(w io.Writer) *Encoder { return json.NewEncoder(w) } + +func IsNullMessage(m RawMessage) bool { + return bytes.Equal(m, []byte("null")) +} From 7a3b5ba2fafcc008a9cbfc7130c2085daac556fe Mon Sep 17 00:00:00 2001 From: serramatutu Date: Wed, 29 Apr 2026 10:50:43 +0200 Subject: [PATCH 06/39] Add `nullable` argument to `GetOneForMarshal()` The array implementations need a way of knowing whether to ignore the valids buffer or not. By default, it shouldn't ignore it if the array is being serialized by itself, like with `MarshalJSON()`. However, if the array is a part of a `Field` in a `RecordBatch` or `Struct`, then the value of the valids buffer might need to be ignored. This will be implemented in the next commit. --- arrow/array.go | 2 +- arrow/array/binary.go | 18 +++++++++--------- arrow/array/boolean.go | 4 ++-- arrow/array/decimal.go | 8 ++++---- arrow/array/decimal128_test.go | 2 +- arrow/array/decimal256_test.go | 2 +- arrow/array/dictionary.go | 8 ++++---- arrow/array/encoded.go | 10 +++++----- arrow/array/extension.go | 4 ++-- arrow/array/fixed_size_list.go | 6 +++--- arrow/array/fixedsize_binary.go | 4 ++-- arrow/array/float16.go | 4 ++-- arrow/array/interval.go | 16 ++++++++-------- arrow/array/list.go | 32 ++++++++++++++++---------------- arrow/array/null.go | 2 +- arrow/array/numeric_generic.go | 32 ++++++++++++++++---------------- arrow/array/string.go | 16 ++++++++-------- arrow/array/struct.go | 10 +++++----- arrow/array/timestamp.go | 6 +++--- arrow/array/union.go | 24 ++++++++++++------------ arrow/array/util.go | 2 +- arrow/extensions/bool8.go | 4 ++-- arrow/extensions/json.go | 14 +++++++++----- arrow/extensions/uuid.go | 6 +++--- arrow/extensions/variant.go | 4 ++-- 25 files changed, 122 insertions(+), 118 deletions(-) diff --git a/arrow/array.go b/arrow/array.go index d42ca6d0..891697e9 100644 --- a/arrow/array.go +++ b/arrow/array.go @@ -111,7 +111,7 @@ type Array interface { ValueStr(i int) string // Get single value to be marshalled with `json.Marshal` - GetOneForMarshal(i int) interface{} + GetOneForMarshal(i int, nullable bool) interface{} Data() ArrayData diff --git a/arrow/array/binary.go b/arrow/array/binary.go index 4ddfdbf6..ba9ce287 100644 --- a/arrow/array/binary.go +++ b/arrow/array/binary.go @@ -152,8 +152,8 @@ func (a *Binary) setData(data *Data) { } } -func (a *Binary) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *Binary) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } return a.Value(i) @@ -162,7 +162,7 @@ func (a *Binary) GetOneForMarshal(i int) interface{} { func (a *Binary) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i) + vals[i] = a.GetOneForMarshal(i, true) } // golang marshal standard says that []byte will be marshalled // as a base64-encoded string @@ -346,8 +346,8 @@ func (a *LargeBinary) setData(data *Data) { } } -func (a *LargeBinary) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *LargeBinary) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } return a.Value(i) @@ -356,7 +356,7 @@ func (a *LargeBinary) GetOneForMarshal(i int) interface{} { func (a *LargeBinary) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i) + vals[i] = a.GetOneForMarshal(i, true) } // golang marshal standard says that []byte will be marshalled // as a base64-encoded string @@ -522,8 +522,8 @@ func (a *BinaryView) ValueStr(i int) string { return base64.StdEncoding.EncodeToString(a.Value(i)) } -func (a *BinaryView) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *BinaryView) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } return a.Value(i) @@ -532,7 +532,7 @@ func (a *BinaryView) GetOneForMarshal(i int) interface{} { func (a *BinaryView) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i) + vals[i] = a.GetOneForMarshal(i, true) } // golang marshal standard says that []byte will be marshalled // as a base64-encoded string diff --git a/arrow/array/boolean.go b/arrow/array/boolean.go index e57dd3e6..c555536f 100644 --- a/arrow/array/boolean.go +++ b/arrow/array/boolean.go @@ -90,8 +90,8 @@ func (a *Boolean) setData(data *Data) { } } -func (a *Boolean) GetOneForMarshal(i int) interface{} { - if a.IsValid(i) { +func (a *Boolean) GetOneForMarshal(i int, nullable bool) interface{} { + if !nullable || a.IsValid(i) { return a.Value(i) } return nil diff --git a/arrow/array/decimal.go b/arrow/array/decimal.go index b4e24f69..3227f66e 100644 --- a/arrow/array/decimal.go +++ b/arrow/array/decimal.go @@ -55,7 +55,7 @@ func (a *baseDecimal[T]) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } - return a.GetOneForMarshal(i).(string) + return a.GetOneForMarshal(i, true).(string) } func (a *baseDecimal[T]) Values() []T { return a.values } @@ -89,8 +89,8 @@ func (a *baseDecimal[T]) setData(data *Data) { } } -func (a *baseDecimal[T]) GetOneForMarshal(i int) any { - if a.IsNull(i) { +func (a *baseDecimal[T]) GetOneForMarshal(i int, nullable bool) any { + if nullable && a.IsNull(i) { return nil } @@ -102,7 +102,7 @@ func (a *baseDecimal[T]) GetOneForMarshal(i int) any { func (a *baseDecimal[T]) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i) + vals[i] = a.GetOneForMarshal(i, true) } return json.Marshal(vals) } diff --git a/arrow/array/decimal128_test.go b/arrow/array/decimal128_test.go index e642d037..dcb234a6 100644 --- a/arrow/array/decimal128_test.go +++ b/arrow/array/decimal128_test.go @@ -279,7 +279,7 @@ func TestDecimal128GetOneForMarshal(t *testing.T) { } for i := range cases { - assert.Equalf(t, cases[i].want, arr.GetOneForMarshal(i), "unexpected value at index %d", i) + assert.Equalf(t, cases[i].want, arr.GetOneForMarshal(i, true), "unexpected value at index %d", i) } } diff --git a/arrow/array/decimal256_test.go b/arrow/array/decimal256_test.go index b5674253..0d6e9f14 100644 --- a/arrow/array/decimal256_test.go +++ b/arrow/array/decimal256_test.go @@ -288,6 +288,6 @@ func TestDecimal256GetOneForMarshal(t *testing.T) { } for i := range cases { - assert.Equalf(t, cases[i].want, arr.GetOneForMarshal(i), "unexpected value at index %d", i) + assert.Equalf(t, cases[i].want, arr.GetOneForMarshal(i, true), "unexpected value at index %d", i) } } diff --git a/arrow/array/dictionary.go b/arrow/array/dictionary.go index 9645b788..96613104 100644 --- a/arrow/array/dictionary.go +++ b/arrow/array/dictionary.go @@ -287,18 +287,18 @@ func (d *Dictionary) GetValueIndex(i int) int { return -1 } -func (d *Dictionary) GetOneForMarshal(i int) interface{} { - if d.IsNull(i) { +func (d *Dictionary) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && d.IsNull(i) { return nil } vidx := d.GetValueIndex(i) - return d.Dictionary().GetOneForMarshal(vidx) + return d.Dictionary().GetOneForMarshal(vidx, nullable) } func (d *Dictionary) MarshalJSON() ([]byte, error) { vals := make([]any, d.Len()) for i := range d.Len() { - vals[i] = d.GetOneForMarshal(i) + vals[i] = d.GetOneForMarshal(i, true) } return json.Marshal(vals) } diff --git a/arrow/array/encoded.go b/arrow/array/encoded.go index 4cfc346f..bcfeded7 100644 --- a/arrow/array/encoded.go +++ b/arrow/array/encoded.go @@ -219,13 +219,13 @@ func (r *RunEndEncoded) String() string { buf.WriteByte(',') } - value := r.values.GetOneForMarshal(i) + value := r.values.GetOneForMarshal(i, true) if byts, ok := value.(json.RawMessage); ok { value = string(byts) } var runEnd int - switch e := r.ends.GetOneForMarshal(i).(type) { + switch e := r.ends.GetOneForMarshal(i, true).(type) { case int16: runEnd = int(e) - r.data.offset case int32: @@ -240,8 +240,8 @@ func (r *RunEndEncoded) String() string { return buf.String() } -func (r *RunEndEncoded) GetOneForMarshal(i int) interface{} { - return r.values.GetOneForMarshal(r.GetPhysicalIndex(i)) +func (r *RunEndEncoded) GetOneForMarshal(i int, nullable bool) interface{} { + return r.values.GetOneForMarshal(r.GetPhysicalIndex(i), nullable) } func (r *RunEndEncoded) MarshalJSON() ([]byte, error) { @@ -252,7 +252,7 @@ func (r *RunEndEncoded) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(r.GetOneForMarshal(i)); err != nil { + if err := enc.Encode(r.GetOneForMarshal(i, true)); err != nil { return nil, err } } diff --git a/arrow/array/extension.go b/arrow/array/extension.go index 21ed3b24..48c4f03a 100644 --- a/arrow/array/extension.go +++ b/arrow/array/extension.go @@ -116,8 +116,8 @@ func (e *ExtensionArrayBase) String() string { return fmt.Sprintf("(%s)%s", e.data.dtype, e.storage) } -func (e *ExtensionArrayBase) GetOneForMarshal(i int) interface{} { - return e.storage.GetOneForMarshal(i) +func (e *ExtensionArrayBase) GetOneForMarshal(i int, nullable bool) interface{} { + return e.storage.GetOneForMarshal(i, nullable) } func (e *ExtensionArrayBase) MarshalJSON() ([]byte, error) { diff --git a/arrow/array/fixed_size_list.go b/arrow/array/fixed_size_list.go index 4b3ff286..10dce4c7 100644 --- a/arrow/array/fixed_size_list.go +++ b/arrow/array/fixed_size_list.go @@ -51,7 +51,7 @@ func (a *FixedSizeList) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } - return string(a.GetOneForMarshal(i).(json.RawMessage)) + return string(a.GetOneForMarshal(i, true).(json.RawMessage)) } func (a *FixedSizeList) String() string { @@ -123,8 +123,8 @@ func (a *FixedSizeList) Release() { a.values.Release() } -func (a *FixedSizeList) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *FixedSizeList) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } slice := a.newListValue(i) diff --git a/arrow/array/fixedsize_binary.go b/arrow/array/fixedsize_binary.go index 44ca4488..b9cf2bfd 100644 --- a/arrow/array/fixedsize_binary.go +++ b/arrow/array/fixedsize_binary.go @@ -86,8 +86,8 @@ func (a *FixedSizeBinary) setData(data *Data) { } } -func (a *FixedSizeBinary) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *FixedSizeBinary) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } diff --git a/arrow/array/float16.go b/arrow/array/float16.go index 41276803..8536df40 100644 --- a/arrow/array/float16.go +++ b/arrow/array/float16.go @@ -77,8 +77,8 @@ func (a *Float16) setData(data *Data) { } } -func (a *Float16) GetOneForMarshal(i int) interface{} { - if a.IsValid(i) { +func (a *Float16) GetOneForMarshal(i int, nullable bool) interface{} { + if !nullable || a.IsValid(i) { return a.values[i].Float32() } return nil diff --git a/arrow/array/interval.go b/arrow/array/interval.go index 128cdfad..b2aad56f 100644 --- a/arrow/array/interval.go +++ b/arrow/array/interval.go @@ -94,8 +94,8 @@ func (a *MonthInterval) setData(data *Data) { } } -func (a *MonthInterval) GetOneForMarshal(i int) interface{} { - if a.IsValid(i) { +func (a *MonthInterval) GetOneForMarshal(i int, nullable bool) interface{} { + if !nullable || a.IsValid(i) { return a.values[i] } return nil @@ -361,7 +361,7 @@ func (a *DayTimeInterval) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } - data, err := json.Marshal(a.GetOneForMarshal(i)) + data, err := json.Marshal(a.GetOneForMarshal(i, true)) if err != nil { panic(err) } @@ -399,8 +399,8 @@ func (a *DayTimeInterval) setData(data *Data) { } } -func (a *DayTimeInterval) GetOneForMarshal(i int) interface{} { - if a.IsValid(i) { +func (a *DayTimeInterval) GetOneForMarshal(i int, nullable bool) interface{} { + if !nullable || a.IsValid(i) { return a.values[i] } return nil @@ -663,7 +663,7 @@ func (a *MonthDayNanoInterval) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } - data, err := json.Marshal(a.GetOneForMarshal(i)) + data, err := json.Marshal(a.GetOneForMarshal(i, true)) if err != nil { panic(err) } @@ -703,8 +703,8 @@ func (a *MonthDayNanoInterval) setData(data *Data) { } } -func (a *MonthDayNanoInterval) GetOneForMarshal(i int) interface{} { - if a.IsValid(i) { +func (a *MonthDayNanoInterval) GetOneForMarshal(i int, nullable bool) interface{} { + if !nullable || a.IsValid(i) { return a.values[i] } return nil diff --git a/arrow/array/list.go b/arrow/array/list.go index df45223f..16eac56f 100644 --- a/arrow/array/list.go +++ b/arrow/array/list.go @@ -61,7 +61,7 @@ func (a *List) ValueStr(i int) string { if !a.IsValid(i) { return NullValueStr } - return string(a.GetOneForMarshal(i).(json.RawMessage)) + return string(a.GetOneForMarshal(i, true).(json.RawMessage)) } func (a *List) String() string { @@ -98,8 +98,8 @@ func (a *List) setData(data *Data) { a.values = MakeFromData(data.childData[0]) } -func (a *List) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *List) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } @@ -121,7 +121,7 @@ func (a *List) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { return nil, err } } @@ -193,7 +193,7 @@ func (a *LargeList) ValueStr(i int) string { if !a.IsValid(i) { return NullValueStr } - return string(a.GetOneForMarshal(i).(json.RawMessage)) + return string(a.GetOneForMarshal(i, true).(json.RawMessage)) } func (a *LargeList) String() string { @@ -230,8 +230,8 @@ func (a *LargeList) setData(data *Data) { a.values = MakeFromData(data.childData[0]) } -func (a *LargeList) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *LargeList) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } @@ -253,7 +253,7 @@ func (a *LargeList) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { return nil, err } } @@ -664,7 +664,7 @@ func (a *ListView) ValueStr(i int) string { if !a.IsValid(i) { return NullValueStr } - return string(a.GetOneForMarshal(i).(json.RawMessage)) + return string(a.GetOneForMarshal(i, true).(json.RawMessage)) } func (a *ListView) String() string { @@ -705,8 +705,8 @@ func (a *ListView) setData(data *Data) { a.values = MakeFromData(data.childData[0]) } -func (a *ListView) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *ListView) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } @@ -728,7 +728,7 @@ func (a *ListView) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { return nil, err } } @@ -811,7 +811,7 @@ func (a *LargeListView) ValueStr(i int) string { if !a.IsValid(i) { return NullValueStr } - return string(a.GetOneForMarshal(i).(json.RawMessage)) + return string(a.GetOneForMarshal(i, true).(json.RawMessage)) } func (a *LargeListView) String() string { @@ -852,8 +852,8 @@ func (a *LargeListView) setData(data *Data) { a.values = MakeFromData(data.childData[0]) } -func (a *LargeListView) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *LargeListView) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } @@ -875,7 +875,7 @@ func (a *LargeListView) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { return nil, err } } diff --git a/arrow/array/null.go b/arrow/array/null.go index 8f8f5805..01001776 100644 --- a/arrow/array/null.go +++ b/arrow/array/null.go @@ -80,7 +80,7 @@ func (a *Null) setData(data *Data) { a.data.nulls = a.data.length } -func (a *Null) GetOneForMarshal(i int) interface{} { +func (a *Null) GetOneForMarshal(i int, nullable bool) interface{} { return nil } diff --git a/arrow/array/numeric_generic.go b/arrow/array/numeric_generic.go index 49a369cd..c7616cf2 100644 --- a/arrow/array/numeric_generic.go +++ b/arrow/array/numeric_generic.go @@ -83,8 +83,8 @@ func (a *numericArray[T]) ValueStr(i int) string { return fmt.Sprintf("%v", a.values[i]) } -func (a *numericArray[T]) GetOneForMarshal(i int) any { - if a.IsNull(i) { +func (a *numericArray[T]) GetOneForMarshal(i int, nullable bool) any { + if nullable && a.IsNull(i) { return nil } @@ -107,8 +107,8 @@ type oneByteArrs[T int8 | uint8] struct { numericArray[T] } -func (a *oneByteArrs[T]) GetOneForMarshal(i int) any { - if a.IsNull(i) { +func (a *oneByteArrs[T]) GetOneForMarshal(i int, nullable bool) any { + if nullable && a.IsNull(i) { return nil } @@ -141,8 +141,8 @@ func (a *floatArray[T]) ValueStr(i int) string { return strconv.FormatFloat(float64(a.Value(i)), 'g', -1, bitWidth) } -func (a *floatArray[T]) GetOneForMarshal(i int) any { - if a.IsNull(i) { +func (a *floatArray[T]) GetOneForMarshal(i int, nullable bool) any { + if nullable && a.IsNull(i) { return nil } @@ -160,7 +160,7 @@ func (a *floatArray[T]) GetOneForMarshal(i int) any { func (a *floatArray[T]) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := range a.values { - vals[i] = a.GetOneForMarshal(i) + vals[i] = a.GetOneForMarshal(i, true) } return json.Marshal(vals) } @@ -176,7 +176,7 @@ type dateArray[T interface { func (d *dateArray[T]) MarshalJSON() ([]byte, error) { vals := make([]any, d.Len()) for i := range d.values { - vals[i] = d.GetOneForMarshal(i) + vals[i] = d.GetOneForMarshal(i, true) } return json.Marshal(vals) } @@ -189,8 +189,8 @@ func (d *dateArray[T]) ValueStr(i int) string { return d.values[i].FormattedString() } -func (d *dateArray[T]) GetOneForMarshal(i int) interface{} { - if d.IsNull(i) { +func (d *dateArray[T]) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && d.IsNull(i) { return nil } @@ -212,7 +212,7 @@ type timeArray[T interface { func (a *timeArray[T]) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := range a.values { - vals[i] = a.GetOneForMarshal(i) + vals[i] = a.GetOneForMarshal(i, true) } return json.Marshal(vals) } @@ -225,8 +225,8 @@ func (a *timeArray[T]) ValueStr(i int) string { return a.values[i].FormattedString(a.DataType().(timeType).TimeUnit()) } -func (a *timeArray[T]) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *timeArray[T]) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } @@ -248,7 +248,7 @@ func (a *Duration) DurationValues() []arrow.Duration { return a.Values() } func (a *Duration) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := range a.values { - vals[i] = a.GetOneForMarshal(i) + vals[i] = a.GetOneForMarshal(i, true) } return json.Marshal(vals) } @@ -261,8 +261,8 @@ func (a *Duration) ValueStr(i int) string { return fmt.Sprintf("%d%s", a.values[i], a.DataType().(timeType).TimeUnit()) } -func (a *Duration) GetOneForMarshal(i int) any { - if a.IsNull(i) { +func (a *Duration) GetOneForMarshal(i int, nullable bool) any { + if nullable && a.IsNull(i) { return nil } return fmt.Sprintf("%d%s", a.values[i], a.DataType().(timeType).TimeUnit()) diff --git a/arrow/array/string.go b/arrow/array/string.go index accc6e95..0133f7a9 100644 --- a/arrow/array/string.go +++ b/arrow/array/string.go @@ -154,8 +154,8 @@ func (a *String) setData(data *Data) { } } -func (a *String) GetOneForMarshal(i int) interface{} { - if a.IsValid(i) { +func (a *String) GetOneForMarshal(i int, nullable bool) interface{} { + if !nullable || a.IsValid(i) { return a.Value(i) } return nil @@ -362,8 +362,8 @@ func (a *LargeString) setData(data *Data) { } } -func (a *LargeString) GetOneForMarshal(i int) interface{} { - if a.IsValid(i) { +func (a *LargeString) GetOneForMarshal(i int, nullable bool) interface{} { + if !nullable || a.IsValid(i) { return a.Value(i) } return nil @@ -372,7 +372,7 @@ func (a *LargeString) GetOneForMarshal(i int) interface{} { func (a *LargeString) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i) + vals[i] = a.GetOneForMarshal(i, true) } return json.Marshal(vals) } @@ -526,8 +526,8 @@ func (a *StringView) ValueStr(i int) string { return a.Value(i) } -func (a *StringView) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *StringView) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } return a.Value(i) @@ -536,7 +536,7 @@ func (a *StringView) GetOneForMarshal(i int) interface{} { func (a *StringView) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i) + vals[i] = a.GetOneForMarshal(i, true) } return json.Marshal(vals) } diff --git a/arrow/array/struct.go b/arrow/array/struct.go index 76611dca..5feb5fae 100644 --- a/arrow/array/struct.go +++ b/arrow/array/struct.go @@ -130,7 +130,7 @@ func (a *Struct) ValueStr(i int) string { return NullValueStr } - data, err := json.Marshal(a.GetOneForMarshal(i)) + data, err := json.Marshal(a.GetOneForMarshal(i, true)) if err != nil { panic(err) } @@ -209,15 +209,15 @@ func (a *Struct) setData(data *Data) { } } -func (a *Struct) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *Struct) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } tmp := make(map[string]interface{}) fieldList := a.data.dtype.(*arrow.StructType).Fields() for j, d := range a.fields { - tmp[fieldList[j].Name] = d.GetOneForMarshal(i) + tmp[fieldList[j].Name] = d.GetOneForMarshal(i, true) } return tmp } @@ -231,7 +231,7 @@ func (a *Struct) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { return nil, err } } diff --git a/arrow/array/timestamp.go b/arrow/array/timestamp.go index 55e9b535..613941f7 100644 --- a/arrow/array/timestamp.go +++ b/arrow/array/timestamp.go @@ -109,8 +109,8 @@ func (a *Timestamp) ValueStr(i int) string { return toTime(a.values[i]).Format(layout) } -func (a *Timestamp) GetOneForMarshal(i int) interface{} { - if val := a.ValueStr(i); val != NullValueStr { +func (a *Timestamp) GetOneForMarshal(i int, nullable bool) interface{} { + if val := a.ValueStr(i); !nullable || val != NullValueStr { return val } return nil @@ -119,7 +119,7 @@ func (a *Timestamp) GetOneForMarshal(i int) interface{} { func (a *Timestamp) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := range a.values { - vals[i] = a.GetOneForMarshal(i) + vals[i] = a.GetOneForMarshal(i, true) } return json.Marshal(vals) diff --git a/arrow/array/union.go b/arrow/array/union.go index 58524a76..1d40ad66 100644 --- a/arrow/array/union.go +++ b/arrow/array/union.go @@ -320,17 +320,17 @@ func (a *SparseUnion) setData(data *Data) { debug.Assert(a.data.buffers[0] == nil, "arrow/array: validity bitmap for sparse unions should be nil") } -func (a *SparseUnion) GetOneForMarshal(i int) interface{} { +func (a *SparseUnion) GetOneForMarshal(i int, nullable bool) interface{} { typeID := a.RawTypeCodes()[i] childID := a.ChildID(i) data := a.Field(childID) - if data.IsNull(i) { + if nullable && data.IsNull(i) { return nil } - return []interface{}{typeID, data.GetOneForMarshal(i)} + return []interface{}{typeID, data.GetOneForMarshal(i, nullable)} } func (a *SparseUnion) MarshalJSON() ([]byte, error) { @@ -342,7 +342,7 @@ func (a *SparseUnion) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { return nil, err } } @@ -355,7 +355,7 @@ func (a *SparseUnion) ValueStr(i int) string { return NullValueStr } - val := a.GetOneForMarshal(i) + val := a.GetOneForMarshal(i, true) if val == nil { // child is nil return NullValueStr @@ -380,7 +380,7 @@ func (a *SparseUnion) String() string { field := fieldList[a.ChildID(i)] f := a.Field(a.ChildID(i)) - fmt.Fprintf(&b, "{%s=%v}", field.Name, f.GetOneForMarshal(i)) + fmt.Fprintf(&b, "{%s=%v}", field.Name, f.GetOneForMarshal(i, true)) } b.WriteByte(']') return b.String() @@ -613,18 +613,18 @@ func (a *DenseUnion) setData(data *Data) { } } -func (a *DenseUnion) GetOneForMarshal(i int) interface{} { +func (a *DenseUnion) GetOneForMarshal(i int, nullable bool) interface{} { typeID := a.RawTypeCodes()[i] childID := a.ChildID(i) data := a.Field(childID) offset := int(a.RawValueOffsets()[i]) - if data.IsNull(offset) { + if nullable && data.IsNull(offset) { return nil } - return []interface{}{typeID, data.GetOneForMarshal(offset)} + return []interface{}{typeID, data.GetOneForMarshal(offset, nullable)} } func (a *DenseUnion) MarshalJSON() ([]byte, error) { @@ -636,7 +636,7 @@ func (a *DenseUnion) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { return nil, err } } @@ -649,7 +649,7 @@ func (a *DenseUnion) ValueStr(i int) string { return NullValueStr } - val := a.GetOneForMarshal(i) + val := a.GetOneForMarshal(i, true) if val == nil { // child in nil return NullValueStr @@ -676,7 +676,7 @@ func (a *DenseUnion) String() string { field := fieldList[a.ChildID(i)] f := a.Field(a.ChildID(i)) - fmt.Fprintf(&b, "{%s=%v}", field.Name, f.GetOneForMarshal(int(offsets[i]))) + fmt.Fprintf(&b, "{%s=%v}", field.Name, f.GetOneForMarshal(int(offsets[i]), true)) } b.WriteByte(']') return b.String() diff --git a/arrow/array/util.go b/arrow/array/util.go index 136ed353..8fc61a6c 100644 --- a/arrow/array/util.go +++ b/arrow/array/util.go @@ -284,7 +284,7 @@ func RecordToJSON(rec arrow.RecordBatch, w io.Writer) error { cols := make(map[string]interface{}) for i := 0; int64(i) < rec.NumRows(); i++ { for j, c := range rec.Columns() { - cols[fields[j].Name] = c.GetOneForMarshal(i) + cols[fields[j].Name] = c.GetOneForMarshal(i, true) } if err := enc.Encode(cols); err != nil { return err diff --git a/arrow/extensions/bool8.go b/arrow/extensions/bool8.go index 97038a1b..aaf51f0c 100644 --- a/arrow/extensions/bool8.go +++ b/arrow/extensions/bool8.go @@ -114,8 +114,8 @@ func (a *Bool8Array) MarshalJSON() ([]byte, error) { return json.Marshal(values) } -func (a *Bool8Array) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { +func (a *Bool8Array) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { return nil } return a.Value(i) diff --git a/arrow/extensions/json.go b/arrow/extensions/json.go index 3f46b50e..b9cc50a7 100644 --- a/arrow/extensions/json.go +++ b/arrow/extensions/json.go @@ -116,9 +116,7 @@ func (a *JSONArray) ValueBytes(i int) []byte { return b } -// ValueJSON wraps the underlying string value as a json.RawMessage, -// or returns nil if the array value is null. -func (a *JSONArray) ValueJSON(i int) json.RawMessage { +func (a *JSONArray) valueJSON(i int, nullable bool) json.RawMessage { var val json.RawMessage if a.IsValid(i) { val = json.RawMessage(a.Storage().(array.StringLike).Value(i)) @@ -126,6 +124,12 @@ func (a *JSONArray) ValueJSON(i int) json.RawMessage { return val } +// ValueJSON wraps the underlying string value as a json.RawMessage, +// or returns nil if the array value is null. +func (a *JSONArray) ValueJSON(i int) json.RawMessage { + return a.valueJSON(i, true) +} + // MarshalJSON implements json.Marshaler. // Marshaling json.RawMessage is a no-op, except that nil values will // be marshaled as a JSON null. @@ -138,8 +142,8 @@ func (a *JSONArray) MarshalJSON() ([]byte, error) { } // GetOneForMarshal implements arrow.Array. -func (a *JSONArray) GetOneForMarshal(i int) interface{} { - return a.ValueJSON(i) +func (a *JSONArray) GetOneForMarshal(i int, nullable bool) interface{} { + return a.valueJSON(i, nullable) } var ( diff --git a/arrow/extensions/uuid.go b/arrow/extensions/uuid.go index 9aac0225..cbe56ecd 100644 --- a/arrow/extensions/uuid.go +++ b/arrow/extensions/uuid.go @@ -190,13 +190,13 @@ func (a *UUIDArray) ValueStr(i int) string { func (a *UUIDArray) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := range vals { - vals[i] = a.GetOneForMarshal(i) + vals[i] = a.GetOneForMarshal(i, true) } return json.Marshal(vals) } -func (a *UUIDArray) GetOneForMarshal(i int) interface{} { - if a.IsValid(i) { +func (a *UUIDArray) GetOneForMarshal(i int, nullable bool) interface{} { + if !nullable || a.IsValid(i) { return a.Value(i) } return nil diff --git a/arrow/extensions/variant.go b/arrow/extensions/variant.go index 379822c4..1f41ef07 100644 --- a/arrow/extensions/variant.go +++ b/arrow/extensions/variant.go @@ -599,8 +599,8 @@ func (v *VariantArray) MarshalJSON() ([]byte, error) { return json.Marshal(values) } -func (v *VariantArray) GetOneForMarshal(i int) any { - if v.IsNull(i) { +func (v *VariantArray) GetOneForMarshal(i int, nullable bool) any { + if nullable && v.IsNull(i) { return nil } From 572e1ade70efbb8c650f21b90d558b07f6cc5c15 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Wed, 29 Apr 2026 18:24:30 +0200 Subject: [PATCH 07/39] Make struct and record use `field.Nullable` when serializing --- arrow/array/struct.go | 5 +++-- arrow/array/util.go | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arrow/array/struct.go b/arrow/array/struct.go index 5feb5fae..63ccbf59 100644 --- a/arrow/array/struct.go +++ b/arrow/array/struct.go @@ -215,9 +215,10 @@ func (a *Struct) GetOneForMarshal(i int, nullable bool) interface{} { } tmp := make(map[string]interface{}) - fieldList := a.data.dtype.(*arrow.StructType).Fields() + dtype := a.data.dtype.(*arrow.StructType) + fieldList := dtype.Fields() for j, d := range a.fields { - tmp[fieldList[j].Name] = d.GetOneForMarshal(i, true) + tmp[fieldList[j].Name] = d.GetOneForMarshal(i, dtype.Field(j).Nullable) } return tmp } diff --git a/arrow/array/util.go b/arrow/array/util.go index 8fc61a6c..b3375152 100644 --- a/arrow/array/util.go +++ b/arrow/array/util.go @@ -284,7 +284,7 @@ func RecordToJSON(rec arrow.RecordBatch, w io.Writer) error { cols := make(map[string]interface{}) for i := 0; int64(i) < rec.NumRows(); i++ { for j, c := range rec.Columns() { - cols[fields[j].Name] = c.GetOneForMarshal(i, true) + cols[fields[j].Name] = c.GetOneForMarshal(i, rec.Schema().Field(j).Nullable) } if err := enc.Encode(cols); err != nil { return err From 0c879e60b0f5a9f657320bc3e1c5584e1636794c Mon Sep 17 00:00:00 2001 From: serramatutu Date: Wed, 29 Apr 2026 18:25:58 +0200 Subject: [PATCH 08/39] Fix tests that were implicitly depending on wrong nullable semantics This commit fixes some tests that were implicitly setting `valid=false` on non-nullable fields, which now causes legitimate test failures when roundtripping to JSON. --- arrow/compute/vector_sort_test.go | 46 +++++++++++++------------- arrow/extensions/uuid_test.go | 2 +- arrow/internal/arrdata/arrdata.go | 46 +++++++++++++------------- arrow/internal/arrjson/arrjson_test.go | 4 +-- arrow/ipc/cmd/arrow-ls/main_test.go | 6 ++-- 5 files changed, 52 insertions(+), 52 deletions(-) diff --git a/arrow/compute/vector_sort_test.go b/arrow/compute/vector_sort_test.go index 39bf5e95..5a15428e 100644 --- a/arrow/compute/vector_sort_test.go +++ b/arrow/compute/vector_sort_test.go @@ -1349,8 +1349,8 @@ func TestVectorSortIndicesCppRecordBatchParity(t *testing.T) { t.Run("NoNull", func(t *testing.T) { schema := arrow.NewSchema([]arrow.Field{ - {Name: "a", Type: arrow.PrimitiveTypes.Uint8}, - {Name: "b", Type: arrow.PrimitiveTypes.Uint32}, + {Name: "a", Type: arrow.PrimitiveTypes.Uint8, Nullable: true}, + {Name: "b", Type: arrow.PrimitiveTypes.Uint32, Nullable: true}, }, nil) jsonRows := `[ {"a": 3, "b": 5}, @@ -1373,8 +1373,8 @@ func TestVectorSortIndicesCppRecordBatchParity(t *testing.T) { t.Run("Null", func(t *testing.T) { schema := arrow.NewSchema([]arrow.Field{ - {Name: "a", Type: arrow.PrimitiveTypes.Uint8}, - {Name: "b", Type: arrow.PrimitiveTypes.Uint32}, + {Name: "a", Type: arrow.PrimitiveTypes.Uint8, Nullable: true}, + {Name: "b", Type: arrow.PrimitiveTypes.Uint32, Nullable: true}, }, nil) jsonRows := `[ {"a": null, "b": 5}, @@ -1396,8 +1396,8 @@ func TestVectorSortIndicesCppRecordBatchParity(t *testing.T) { t.Run("NaN", func(t *testing.T) { schema := arrow.NewSchema([]arrow.Field{ - {Name: "a", Type: arrow.PrimitiveTypes.Float32}, - {Name: "b", Type: arrow.PrimitiveTypes.Float64}, + {Name: "a", Type: arrow.PrimitiveTypes.Float32, Nullable: true}, + {Name: "b", Type: arrow.PrimitiveTypes.Float64, Nullable: true}, }, nil) ba := array.NewFloat32Builder(mem) defer ba.Release() @@ -1426,8 +1426,8 @@ func TestVectorSortIndicesCppRecordBatchParity(t *testing.T) { t.Run("NaNAndNull", func(t *testing.T) { schema := arrow.NewSchema([]arrow.Field{ - {Name: "a", Type: arrow.PrimitiveTypes.Float32}, - {Name: "b", Type: arrow.PrimitiveTypes.Float64}, + {Name: "a", Type: arrow.PrimitiveTypes.Float32, Nullable: true}, + {Name: "b", Type: arrow.PrimitiveTypes.Float64, Nullable: true}, }, nil) ba := array.NewFloat32Builder(mem) defer ba.Release() @@ -1460,8 +1460,8 @@ func TestVectorSortIndicesCppRecordBatchParity(t *testing.T) { t.Run("Boolean", func(t *testing.T) { schema := arrow.NewSchema([]arrow.Field{ - {Name: "a", Type: arrow.FixedWidthTypes.Boolean}, - {Name: "b", Type: arrow.FixedWidthTypes.Boolean}, + {Name: "a", Type: arrow.FixedWidthTypes.Boolean, Nullable: true}, + {Name: "b", Type: arrow.FixedWidthTypes.Boolean, Nullable: true}, }, nil) jsonRows := `[ {"a": true, "b": null}, @@ -1486,9 +1486,9 @@ func TestVectorSortIndicesCppRecordBatchParity(t *testing.T) { ts := &arrow.TimestampType{Unit: arrow.Microsecond} fsb3 := &arrow.FixedSizeBinaryType{ByteWidth: 3} schema := arrow.NewSchema([]arrow.Field{ - {Name: "a", Type: ts}, - {Name: "b", Type: arrow.BinaryTypes.LargeString}, - {Name: "c", Type: fsb3}, + {Name: "a", Type: ts, Nullable: true}, + {Name: "b", Type: arrow.BinaryTypes.LargeString, Nullable: true}, + {Name: "c", Type: fsb3, Nullable: true}, }, nil) ba := array.NewTimestampBuilder(mem, ts) defer ba.Release() @@ -1535,8 +1535,8 @@ func TestVectorSortIndicesCppRecordBatchParity(t *testing.T) { d128 := &arrow.Decimal128Type{Precision: 3, Scale: 1} d256 := &arrow.Decimal256Type{Precision: 4, Scale: 2} schema := arrow.NewSchema([]arrow.Field{ - {Name: "a", Type: d128}, - {Name: "b", Type: d256}, + {Name: "a", Type: d128, Nullable: true}, + {Name: "b", Type: d256, Nullable: true}, }, nil) jsonRows := `[ {"a": "12.3", "b": "12.34"}, @@ -1561,8 +1561,8 @@ func TestVectorSortIndicesCppRecordBatchParity(t *testing.T) { t.Run("DuplicateSortKeys", func(t *testing.T) { schema := arrow.NewSchema([]arrow.Field{ - {Name: "a", Type: arrow.PrimitiveTypes.Float32}, - {Name: "b", Type: arrow.PrimitiveTypes.Float64}, + {Name: "a", Type: arrow.PrimitiveTypes.Float32, Nullable: true}, + {Name: "b", Type: arrow.PrimitiveTypes.Float64, Nullable: true}, }, nil) ba := array.NewFloat32Builder(mem) defer ba.Release() @@ -1610,8 +1610,8 @@ func TestVectorSortIndicesCppTableParity(t *testing.T) { ctx := context.Background() schemaAB := arrow.NewSchema([]arrow.Field{ - {Name: "a", Type: arrow.PrimitiveTypes.Uint8}, - {Name: "b", Type: arrow.PrimitiveTypes.Uint32}, + {Name: "a", Type: arrow.PrimitiveTypes.Uint8, Nullable: true}, + {Name: "b", Type: arrow.PrimitiveTypes.Uint32, Nullable: true}, }, nil) t.Run("EmptyTable", func(t *testing.T) { @@ -1667,8 +1667,8 @@ func TestVectorSortIndicesCppTableParity(t *testing.T) { t.Run("BinaryLikeTwoChunks", func(t *testing.T) { fsb3 := &arrow.FixedSizeBinaryType{ByteWidth: 3} s := arrow.NewSchema([]arrow.Field{ - {Name: "a", Type: arrow.BinaryTypes.LargeString}, - {Name: "b", Type: fsb3}, + {Name: "a", Type: arrow.BinaryTypes.LargeString, Nullable: true}, + {Name: "b", Type: fsb3, Nullable: true}, }, nil) buildBatch := func(a []string, b [][]byte, bNulls []bool) arrow.RecordBatch { ab := array.NewLargeStringBuilder(mem) @@ -1719,8 +1719,8 @@ func TestVectorSortIndicesCppTableParity(t *testing.T) { t.Run("HeterogenousChunking", func(t *testing.T) { s := arrow.NewSchema([]arrow.Field{ - {Name: "a", Type: arrow.PrimitiveTypes.Float32}, - {Name: "b", Type: arrow.PrimitiveTypes.Float64}, + {Name: "a", Type: arrow.PrimitiveTypes.Float32, Nullable: true}, + {Name: "b", Type: arrow.PrimitiveTypes.Float64, Nullable: true}, }, nil) a0, _, err := array.FromJSON(mem, arrow.PrimitiveTypes.Float32, strings.NewReader("[null, 1]")) require.NoError(t, err) diff --git a/arrow/extensions/uuid_test.go b/arrow/extensions/uuid_test.go index a76b77a9..36bb812b 100644 --- a/arrow/extensions/uuid_test.go +++ b/arrow/extensions/uuid_test.go @@ -62,7 +62,7 @@ func TestUUIDExtensionBuilder(t *testing.T) { func TestUUIDExtensionRecordBuilder(t *testing.T) { schema := arrow.NewSchema([]arrow.Field{ - {Name: "uuid", Type: extensions.NewUUIDType()}, + {Name: "uuid", Type: extensions.NewUUIDType(), Nullable: true}, }, nil) builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) builder.Field(0).(*extensions.UUIDBuilder).Append(testUUID) diff --git a/arrow/internal/arrdata/arrdata.go b/arrow/internal/arrdata/arrdata.go index 095571a8..d95ee7cc 100644 --- a/arrow/internal/arrdata/arrdata.go +++ b/arrow/internal/arrdata/arrdata.go @@ -192,59 +192,59 @@ func makeStructsRecords() []arrow.RecordBatch { mem := memory.NewGoAllocator() fields := []arrow.Field{ - {Name: "f1", Type: arrow.PrimitiveTypes.Int32}, - {Name: "f2", Type: arrow.BinaryTypes.String}, + {Name: "f1", Type: arrow.PrimitiveTypes.Int32, Nullable: true}, + {Name: "f2", Type: arrow.BinaryTypes.String, Nullable: true}, } dtype := arrow.StructOf(fields...) schema := arrow.NewSchema([]arrow.Field{{Name: "struct_nullable", Type: dtype, Nullable: true}}, nil) - mask := []bool{true, false, false, true, true, true, false, true} + innerValids := []bool{true, false, false, true, true} chunks := [][]arrow.Array{ { structOf(mem, dtype, [][]arrow.Array{ { - arrayOf(mem, []int32{-1, -2, -3, -4, -5}, mask[:5]), - arrayOf(mem, []string{"111", "222", "333", "444", "555"}, mask[:5]), + arrayOf(mem, []int32{-1, -2, -3, -4, -5}, innerValids), + arrayOf(mem, []string{"111", "222", "333", "444", "555"}, innerValids), }, { - arrayOf(mem, []int32{-11, -12, -13, -14, -15}, mask[:5]), - arrayOf(mem, []string{"1111", "1222", "1333", "1444", "1555"}, mask[:5]), + arrayOf(mem, []int32{-11, -12, -13, -14, -15}, innerValids), + arrayOf(mem, []string{"1111", "1222", "1333", "1444", "1555"}, innerValids), }, { - arrayOf(mem, []int32{-21, -22, -23, -24, -25}, mask[:5]), - arrayOf(mem, []string{"2111", "2222", "2333", "2444", "2555"}, mask[:5]), + arrayOf(mem, []int32{-21, -22, -23, -24, -25}, innerValids), + arrayOf(mem, []string{"2111", "2222", "2333", "2444", "2555"}, innerValids), }, { - arrayOf(mem, []int32{-31, -32, -33, -34, -35}, mask[:5]), - arrayOf(mem, []string{"3111", "3222", "3333", "3444", "3555"}, mask[:5]), + arrayOf(mem, []int32{-31, -32, -33, -34, -35}, innerValids), + arrayOf(mem, []string{"3111", "3222", "3333", "3444", "3555"}, innerValids), }, { - arrayOf(mem, []int32{-41, -42, -43, -44, -45}, mask[:5]), - arrayOf(mem, []string{"4111", "4222", "4333", "4444", "4555"}, mask[:5]), + arrayOf(mem, []int32{-41, -42, -43, -44, -45}, innerValids), + arrayOf(mem, []string{"4111", "4222", "4333", "4444", "4555"}, innerValids), }, }, []bool{true, false, true, true, true}), }, { structOf(mem, dtype, [][]arrow.Array{ { - arrayOf(mem, []int32{1, 2, 3, 4, 5}, mask[:5]), - arrayOf(mem, []string{"-111", "-222", "-333", "-444", "-555"}, mask[:5]), + arrayOf(mem, []int32{1, 2, 3, 4, 5}, innerValids), + arrayOf(mem, []string{"-111", "-222", "-333", "-444", "-555"}, innerValids), }, { - arrayOf(mem, []int32{11, 12, 13, 14, 15}, mask[:5]), - arrayOf(mem, []string{"-1111", "-1222", "-1333", "-1444", "-1555"}, mask[:5]), + arrayOf(mem, []int32{11, 12, 13, 14, 15}, innerValids), + arrayOf(mem, []string{"-1111", "-1222", "-1333", "-1444", "-1555"}, innerValids), }, { - arrayOf(mem, []int32{21, 22, 23, 24, 25}, mask[:5]), - arrayOf(mem, []string{"-2111", "-2222", "-2333", "-2444", "-2555"}, mask[:5]), + arrayOf(mem, []int32{21, 22, 23, 24, 25}, innerValids), + arrayOf(mem, []string{"-2111", "-2222", "-2333", "-2444", "-2555"}, innerValids), }, { - arrayOf(mem, []int32{31, 32, 33, 34, 35}, mask[:5]), - arrayOf(mem, []string{"-3111", "-3222", "-3333", "-3444", "-3555"}, mask[:5]), + arrayOf(mem, []int32{31, 32, 33, 34, 35}, innerValids), + arrayOf(mem, []string{"-3111", "-3222", "-3333", "-3444", "-3555"}, innerValids), }, { - arrayOf(mem, []int32{41, 42, 43, 44, 45}, mask[:5]), - arrayOf(mem, []string{"-4111", "-4222", "-4333", "-4444", "-4555"}, mask[:5]), + arrayOf(mem, []int32{41, 42, 43, 44, 45}, innerValids), + arrayOf(mem, []string{"-4111", "-4222", "-4333", "-4444", "-4555"}, innerValids), }, }, []bool{true, false, false, true, true}), }, diff --git a/arrow/internal/arrjson/arrjson_test.go b/arrow/internal/arrjson/arrjson_test.go index 7e2f386f..faeecfc0 100644 --- a/arrow/internal/arrjson/arrjson_test.go +++ b/arrow/internal/arrjson/arrjson_test.go @@ -948,7 +948,7 @@ func makeStructsWantJSONs() string { "isSigned": true, "bitWidth": 32 }, - "nullable": false, + "nullable": true, "children": [] }, { @@ -956,7 +956,7 @@ func makeStructsWantJSONs() string { "type": { "name": "utf8" }, - "nullable": false, + "nullable": true, "children": [] } ] diff --git a/arrow/ipc/cmd/arrow-ls/main_test.go b/arrow/ipc/cmd/arrow-ls/main_test.go index f90e4a80..0f3b5377 100644 --- a/arrow/ipc/cmd/arrow-ls/main_test.go +++ b/arrow/ipc/cmd/arrow-ls/main_test.go @@ -59,7 +59,7 @@ records: 3 name: "structs", want: `schema: fields: 1 - - struct_nullable: type=struct, nullable + - struct_nullable: type=struct, nullable records: 2 `, }, @@ -221,7 +221,7 @@ records: 3 name: "structs", want: `schema: fields: 1 - - struct_nullable: type=struct, nullable + - struct_nullable: type=struct, nullable records: 2 `, }, @@ -230,7 +230,7 @@ records: 2 want: `version: V5 schema: fields: 1 - - struct_nullable: type=struct, nullable + - struct_nullable: type=struct, nullable records: 2 `, }, From 8ead25e53dd45c2cf9adf7584074710461d37453 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Tue, 28 Oct 2025 13:47:35 +0100 Subject: [PATCH 09/39] Add `TimestampWithOffset` extension type This commit adds a new `TimestampWithOffset` extension type that can be used to represent timestamps with per-row timezone information. It stores information in a `struct` with 2 fields, `timestamp=[T, "UTC"]`, where `T` can be any `arrow.TimeUnit` and `offset_minutes=int16`, which represents the offset in minutes from the UTC timestamp. --- arrow/extensions/extensions.go | 7 +- arrow/extensions/timestamp_with_offset.go | 354 ++++++++++++++++++ .../extensions/timestamp_with_offset_test.go | 209 +++++++++++ 3 files changed, 567 insertions(+), 3 deletions(-) create mode 100644 arrow/extensions/timestamp_with_offset.go create mode 100644 arrow/extensions/timestamp_with_offset_test.go diff --git a/arrow/extensions/extensions.go b/arrow/extensions/extensions.go index 04566c75..6f13aa64 100644 --- a/arrow/extensions/extensions.go +++ b/arrow/extensions/extensions.go @@ -21,11 +21,12 @@ import ( ) var canonicalExtensionTypes = []arrow.ExtensionType{ - NewBool8Type(), - NewUUIDType(), - &OpaqueType{}, &JSONType{}, + &OpaqueType{}, + &TimestampWithOffsetType{}, &VariantType{}, + NewBool8Type(), + NewUUIDType(), } func init() { diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go new file mode 100644 index 00000000..9e452fe6 --- /dev/null +++ b/arrow/extensions/timestamp_with_offset.go @@ -0,0 +1,354 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions + +import ( + _ "bytes" + "fmt" + "reflect" + "strings" + "time" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/memory" + "github.com/apache/arrow-go/v18/internal/json" +) + +// TimestampWithOffsetType represents a timestamp column that stores a timezone offset per row instead of +// applying the same timezone offset to the entire column. +type TimestampWithOffsetType struct { + arrow.ExtensionBase +} + +// Whether the storageType is compatible with TimestampWithOffset. +// +// Returns (time_unit, ok). If ok is false, time unit is garbage. +func isDataTypeCompatible(storageType arrow.DataType) (arrow.TimeUnit, bool) { + timeUnit := arrow.Second + switch t := storageType.(type) { + case *arrow.StructType: + if t.NumFields() != 2 { + return timeUnit, false + } + + maybeTimestamp := t.Field(0); + maybeOffset := t.Field(1); + + timestampOk := false + switch timestampType := maybeTimestamp.Type.(type) { + case *arrow.TimestampType: + if timestampType.TimeZone == "UTC" { + timestampOk = true + timeUnit = timestampType.TimeUnit() + } + default: + } + + ok := maybeTimestamp.Name == "timestamp" && + timestampOk && + !maybeTimestamp.Nullable && + maybeOffset.Name == "offset_minutes" && + arrow.TypeEqual(maybeOffset.Type, arrow.PrimitiveTypes.Int16) && + !maybeOffset.Nullable + + return timeUnit, ok + default: + return timeUnit, false + } +} + + +// NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to +// Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=Int16), where T is any TimeUnit. +func NewTimestampWithOffsetType(unit arrow.TimeUnit) *TimestampWithOffsetType { + return &TimestampWithOffsetType{ + ExtensionBase: arrow.ExtensionBase{ + Storage: arrow.StructOf( + arrow.Field{ + Name: "timestamp", + Type: &arrow.TimestampType{ + Unit: unit, + TimeZone: "UTC", + }, + Nullable: false, + }, + arrow.Field{ + Name: "offset_minutes", + Type: arrow.PrimitiveTypes.Int16, + Nullable: false, + }, + ), + }, + } +} + +func (b *TimestampWithOffsetType) ArrayType() reflect.Type { return reflect.TypeOf(TimestampWithOffsetArray{}) } + +func (b *TimestampWithOffsetType) ExtensionName() string { return "arrow.timestamp_with_offset" } + +func (b *TimestampWithOffsetType) String() string { return fmt.Sprintf("extension<%s>", b.ExtensionName()) } + +func (e *TimestampWithOffsetType) MarshalJSON() ([]byte, error) { + return []byte(fmt.Sprintf(`{"name":"%s","metadata":%s}`, e.ExtensionName(), e.Serialize())), nil +} + +func (b *TimestampWithOffsetType) Serialize() string { return "" } + +func (b *TimestampWithOffsetType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { + timeUnit, ok := isDataTypeCompatible(storageType) + if !ok { + return nil, fmt.Errorf("invalid storage type for TimestampWithOffsetType: %s", storageType.Name()) + } + return NewTimestampWithOffsetType(timeUnit), nil +} + +func (b *TimestampWithOffsetType) ExtensionEquals(other arrow.ExtensionType) bool { + return b.ExtensionName() == other.ExtensionName() +} + +func (b *TimestampWithOffsetType) TimeUnit() arrow.TimeUnit { + return b.ExtensionBase.Storage.(*arrow.StructType).Fields()[0].Type.(*arrow.TimestampType).TimeUnit() +} + +func (b *TimestampWithOffsetType) NewBuilder(mem memory.Allocator) array.Builder { + return NewTimestampWithOffsetBuilder(mem, b.TimeUnit()) +} + +// TimestampWithOffsetArray is a simple array of struct +type TimestampWithOffsetArray struct { + array.ExtensionArrayBase +} + +func (a *TimestampWithOffsetArray) String() string { + var o strings.Builder + o.WriteString("[") + for i := 0; i < a.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + switch { + case a.IsNull(i): + o.WriteString(array.NullValueStr) + default: + fmt.Fprintf(&o, "\"%s\"", a.Value(i)) + } + } + o.WriteString("]") + return o.String() +} + +func timeFromFieldValues(utcTimestamp arrow.Timestamp, offsetMinutes int16, unit arrow.TimeUnit) time.Time { + hours := offsetMinutes / 60; + minutes := offsetMinutes % 60 + if minutes < 0 { + minutes = -minutes + } + + loc := time.FixedZone(fmt.Sprintf("UTC%+03d:%02d", hours, minutes), int(offsetMinutes) * 60) + return utcTimestamp.ToTime(unit).In(loc) +} + +func fieldValuesFromTime(t time.Time, unit arrow.TimeUnit) (arrow.Timestamp, int16) { + // naive "bitwise" conversion to UTC, keeping the underlying date the same + utc := t.UTC() + naiveUtc := time.Date(t.Year(), t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), time.UTC) + offsetMinutes := int16(naiveUtc.Sub(t).Minutes()) + // SAFETY: unit MUST have been validated to a valid arrow.TimeUnit value before + // this function. Otherwise, ignoring this error is not safe. + timestamp, _ := arrow.TimestampFromTime(utc, unit) + return timestamp, offsetMinutes +} + +// Get the raw arrow values at the given index +// +// SAFETY: the value at i must not be nil +func (a *TimestampWithOffsetArray) rawValueUnsafe(i int) (arrow.Timestamp, int16, arrow.TimeUnit) { + structs := a.Storage().(*array.Struct) + + timestampField := structs.Field(0) + timestamps := timestampField.(*array.Timestamp) + offsets := structs.Field(1).(*array.Int16) + + timeUnit := timestampField.DataType().(*arrow.TimestampType).Unit + utcTimestamp := timestamps.Value(i) + offsetMinutes := offsets.Value(i) + + return utcTimestamp, offsetMinutes, timeUnit +} + +func (a *TimestampWithOffsetArray) Value(i int) time.Time { + if a.IsNull(i) { + return time.Unix(0, 0) + } + utcTimestamp, offsetMinutes, timeUnit := a.rawValueUnsafe(i) + return timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) +} + +func (a *TimestampWithOffsetArray) Values() []time.Time { + values := make([]time.Time, a.Len()) + for i := range a.Len() { + val := a.Value(i) + values[i] = val + } + return values +} + +func (a *TimestampWithOffsetArray) ValueStr(i int) string { + switch { + case a.IsNull(i): + return array.NullValueStr + default: + return a.Value(i).String() + } +} + +func (a *TimestampWithOffsetArray) MarshalJSON() ([]byte, error) { + values := make([]interface{}, a.Len()) + for i := 0; i < a.Len(); i++ { + if a.IsValid(i) { + utcTimestamp, offsetMinutes, timeUnit := a.rawValueUnsafe(i) + values[i] = timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) + } else { + values[i] = nil + } + } + return json.Marshal(values) +} + +func (a *TimestampWithOffsetArray) GetOneForMarshal(i int, nullable bool) interface{} { + if nullable && a.IsNull(i) { + return nil + } + return a.Value(i) +} + +// TimestampWithOffsetBuilder is a convenience builder for the TimestampWithOffset extension type, +// allowing arrays to be built with boolean values rather than the underlying storage type. +type TimestampWithOffsetBuilder struct { + *array.ExtensionBuilder + + // The layout used to parse any timestamps from strings. Defaults to time.RFC3339 + Layout string + unit arrow.TimeUnit +} + +// NewTimestampWithOffsetBuilder creates a new TimestampWithOffsetBuilder, exposing a convenient and efficient interface +// for writing time.Time values to the underlying storage array. +func NewTimestampWithOffsetBuilder(mem memory.Allocator, unit arrow.TimeUnit) *TimestampWithOffsetBuilder { + return &TimestampWithOffsetBuilder{ + unit: unit, + Layout: time.RFC3339, + ExtensionBuilder: array.NewExtensionBuilder(mem, NewTimestampWithOffsetType(unit)), + } +} + +func (b *TimestampWithOffsetBuilder) Append(v time.Time) { + timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) + structBuilder := b.ExtensionBuilder.Builder.(*array.StructBuilder) + + structBuilder.Append(true) + structBuilder.FieldBuilder(0).(*array.TimestampBuilder).Append(timestamp) + structBuilder.FieldBuilder(1).(*array.Int16Builder).Append(int16(offsetMinutes)) +} + +func (b *TimestampWithOffsetBuilder) UnsafeAppend(v time.Time) { + timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) + structBuilder := b.ExtensionBuilder.Builder.(*array.StructBuilder) + + structBuilder.Append(true) + structBuilder.FieldBuilder(0).(*array.TimestampBuilder).UnsafeAppend(timestamp) + structBuilder.FieldBuilder(1).(*array.Int16Builder).UnsafeAppend(int16(offsetMinutes)) +} + +// By default, this will try to parse the string using the RFC3339 layout. +// +// You can change the default layout by using builder.SetLayout() +func (b *TimestampWithOffsetBuilder) AppendValueFromString(s string) error { + if s == array.NullValueStr { + b.AppendNull() + return nil + } + + parsed, err := time.Parse(b.Layout, s) + if err != nil { + return err + } + + b.Append(parsed) + return nil +} + +func (b *TimestampWithOffsetBuilder) AppendValues(values []time.Time, valids []bool) { + structBuilder := b.ExtensionBuilder.Builder.(*array.StructBuilder) + timestamps := structBuilder.FieldBuilder(0).(*array.TimestampBuilder); + offsets := structBuilder.FieldBuilder(1).(*array.Int16Builder); + + structBuilder.AppendValues(valids) + + for i, v := range values { + timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) + + // SAFETY: by this point we know all buffers have available space given the earlier + // call to structBuilder.AppendValues which calls Reserve internally + if valids[i] { + timestamps.UnsafeAppend(timestamp) + offsets.UnsafeAppend(offsetMinutes) + } else { + timestamps.UnsafeAppendBoolToBitmap(false) + offsets.UnsafeAppendBoolToBitmap(false) + } + } +} + +func (b *TimestampWithOffsetBuilder) UnmarshalOne(dec *json.Decoder) error { + tok, err := dec.Token() + if err != nil { + return fmt.Errorf("failed to decode json: %w", err) + } + + switch raw := tok.(type) { + case string: + t, err := time.Parse(b.Layout, raw) + if err != nil { + return fmt.Errorf("failed to parse string \"%s\" as time.Time using layout \"%s\"", raw, b.Layout) + } + b.Append(t) + case nil: + b.AppendNull() + default: + return fmt.Errorf("expected date string") + } + + return nil +} + +func (b *TimestampWithOffsetBuilder) Unmarshal(dec *json.Decoder) error { + for dec.More() { + if err := b.UnmarshalOne(dec); err != nil { + return err + } + } + return nil +} + +var ( + _ arrow.ExtensionType = (*TimestampWithOffsetType)(nil) + _ array.CustomExtensionBuilder = (*TimestampWithOffsetType)(nil) + _ array.ExtensionArray = (*TimestampWithOffsetArray)(nil) + _ array.Builder = (*TimestampWithOffsetBuilder)(nil) +) diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go new file mode 100644 index 00000000..9c198afd --- /dev/null +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions_test + +import ( + "bytes" + _ "bytes" + "fmt" + "strings" + _ "strings" + "testing" + "time" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/extensions" + "github.com/apache/arrow-go/v18/arrow/ipc" + _ "github.com/apache/arrow-go/v18/arrow/ipc" + "github.com/apache/arrow-go/v18/arrow/memory" + "github.com/apache/arrow-go/v18/internal/json" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var testTimeUnit = arrow.Microsecond + +var testDate1 = time.Date(2025, 01, 01, 00, 00, 00, 00, time.FixedZone("UTC+00:00", 0)) + +var testZone1 = time.FixedZone("UTC-08:00", -8*60*60) +var testDate2 = testDate1.In(testZone1) + +var testZone2 = time.FixedZone("UTC+06:00", +6*60*60) +var testDate3 = testDate1.In(testZone2) + +func TestTimestampWithOffsetTypeBasics(t *testing.T) { + typ := extensions.NewTimestampWithOffsetType(testTimeUnit) + + assert.Equal(t, "arrow.timestamp_with_offset", typ.ExtensionName()) + assert.True(t, typ.ExtensionEquals(typ)) + + assert.True(t, arrow.TypeEqual(typ, typ)) + assert.True(t, arrow.TypeEqual( + arrow.StructOf( + arrow.Field{ + Name: "timestamp", + Type: &arrow.TimestampType{ + Unit: testTimeUnit, + TimeZone: "UTC", + }, + Nullable: false, + }, + arrow.Field{ + Name: "offset_minutes", + Type: arrow.PrimitiveTypes.Int16, + Nullable: false, + }, + ), + typ.StorageType())) + + assert.Equal(t, "extension", typ.String()) +} + +func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + builder := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit) + builder.Append(testDate1) + builder.AppendNull() + builder.Append(testDate2) + builder.Append(testDate3) + + // it should build the array with the correct size + arr := builder.NewArray() + typedArr := arr.(*extensions.TimestampWithOffsetArray) + assert.Equal(t, 4, arr.Data().Len()) + defer arr.Release() + + // typedArr.Value(i) should return values adjusted for their original timezone + assert.Equal(t, testDate1, typedArr.Value(0)) + assert.Equal(t, testDate2, typedArr.Value(2)) + assert.Equal(t, testDate3, typedArr.Value(3)) + + // storage TimeUnit should be the same as we pass in to the builder, and storage timezone should be UTC + timestampStructField := typedArr.Storage().(*array.Struct).Field(0) + timestampStructDataType := timestampStructField.DataType().(*arrow.TimestampType) + assert.Equal(t, timestampStructDataType.Unit, testTimeUnit) + assert.Equal(t, timestampStructDataType.TimeZone, "UTC") + + // stored values should be equivalent to the raw values in UTC + timestampsArr := timestampStructField.(*array.Timestamp) + assert.Equal(t, testDate1.In(time.UTC), timestampsArr.Value(0).ToTime(testTimeUnit)) + assert.Equal(t, testDate2.In(time.UTC), timestampsArr.Value(2).ToTime(testTimeUnit)) + assert.Equal(t, testDate3.In(time.UTC), timestampsArr.Value(3).ToTime(testTimeUnit)) + + // the array should encode itself as JSON and string + arrStr := arr.String() + assert.Equal(t, fmt.Sprintf(`["%[1]s" (null) "%[2]s" "%[3]s"]`, testDate1, testDate2, testDate3), arrStr) + jsonStr, err := json.Marshal(arr) + assert.NoError(t, err) + + // roundtripping from JSON with array.FromJSON should work + roundtripped, _, err := array.FromJSON(mem, extensions.NewTimestampWithOffsetType(testTimeUnit), bytes.NewReader(jsonStr)) + defer roundtripped.Release() + assert.NoError(t, err) + assert.Truef(t, array.Equal(arr, roundtripped), "expected %s got %s", arr, roundtripped) +} + +func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + { + Name: "timestamp_with_offset", + Nullable: true, + Type: extensions.NewTimestampWithOffsetType(testTimeUnit), + }, + }, nil) + builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) + defer builder.Release() + + fieldBuilder := builder.Field(0).(*extensions.TimestampWithOffsetBuilder) + + // append a simple time.Time + fieldBuilder.Append(testDate1) + + // append a null and 2 time.Time all at once + values := []time.Time{ + time.Unix(0, 0).In(time.UTC), + testDate2, + testDate3, + } + valids := []bool{false, true, true} + fieldBuilder.AppendValues(values, valids) + + // append a value from RFC3339 string + fieldBuilder.AppendValueFromString(testDate1.Format(time.RFC3339)) + + // append value formatted in a different string layout + fieldBuilder.Layout = time.RFC3339Nano + fieldBuilder.AppendValueFromString(testDate2.Format(time.RFC3339Nano)) + + record := builder.NewRecordBatch() + + // Record batch should JSON-encode values containing per-row timezone info + json, err := record.MarshalJSON() + require.NoError(t, err) + expect := `[{"timestamp_with_offset":"2025-01-01T00:00:00Z"} +,{"timestamp_with_offset":null} +,{"timestamp_with_offset":"2024-12-31T16:00:00-08:00"} +,{"timestamp_with_offset":"2025-01-01T06:00:00+06:00"} +,{"timestamp_with_offset":"2025-01-01T00:00:00Z"} +,{"timestamp_with_offset":"2024-12-31T16:00:00-08:00"} +]` + require.Equal(t, expect, string(json)) + + // Record batch roundtrip to JSON should work + roundtripped, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(json)) + require.NoError(t, err) + defer roundtripped.Release() + require.Equal(t, schema, roundtripped.Schema()) + assert.Truef(t, array.RecordEqual(record, roundtripped), "expected %s\n\ngot %s", record, roundtripped) +} + +func TestTimestampWithOffsetTypeBatchIPCRoundTrip(t *testing.T) { + raw := `["2025-01-01T00:00:00Z",null,"2024-12-31T16:00:00-08:00","2025-01-01T06:00:00+06:00","2025-01-01T00:00:00Z","2024-12-31T16:00:00-08:00"]` + typ := extensions.NewTimestampWithOffsetType(testTimeUnit) + + arr, _, err := array.FromJSON(memory.DefaultAllocator, typ, strings.NewReader(raw)) + require.NoError(t, err) + defer arr.Release() + + batch := array.NewRecordBatch(arrow.NewSchema([]arrow.Field{{Name: "timestamp_with_offset", Type: typ, Nullable: true}}, nil), []arrow.Array{arr}, -1) + defer batch.Release() + + var written arrow.RecordBatch + { + var buf bytes.Buffer + wr := ipc.NewWriter(&buf, ipc.WithSchema(batch.Schema())) + require.NoError(t, wr.Write(batch)) + require.NoError(t, wr.Close()) + + rdr, err := ipc.NewReader(&buf) + require.NoError(t, err) + written, err = rdr.Read() + require.NoError(t, err) + written.Retain() + defer written.Release() + rdr.Release() + } + + assert.Truef(t, batch.Schema().Equal(written.Schema()), "expected: %s\n\ngot: %s", + batch.Schema(), written.Schema()) + + assert.Truef(t, array.RecordEqual(batch, written), "expected: %s\n\ngot: %s", + batch, written) +} From 47cc4563eb7ba423432b5ba8c0be13f1b3c86fe5 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Mon, 22 Dec 2025 11:44:08 +0200 Subject: [PATCH 10/39] Allow dictionary encoding of `TimestampWithOffset` This commit allows `TimestampWithOffset` to be dict-encoded. - I made `NewTimestampWithOffsetType` take in an input `offsetType arrow.DataType`. It returns an error if the data type is not valid. - I added a new infallible `NewTimestampWithOffsetTypePrimitiveEncoded` to make the encoding explicit. - I added `NewTimestampWithOffsetTypeDictionaryEncoded` which returns an error in case the given type is not a valid dictionary key type. - I made all tests run in a for loop with all possible allowed encoding types, ensuring all encodings work. --- arrow/extensions/timestamp_with_offset.go | 196 ++++++++--- .../extensions/timestamp_with_offset_test.go | 317 +++++++++++------- 2 files changed, 350 insertions(+), 163 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 9e452fe6..6d512b8d 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -17,7 +17,7 @@ package extensions import ( - _ "bytes" + "errors" "fmt" "reflect" "strings" @@ -35,19 +35,31 @@ type TimestampWithOffsetType struct { arrow.ExtensionBase } +func isOffsetTypeOk(offsetType arrow.DataType) bool { + switch offsetType := offsetType.(type) { + case *arrow.Int16Type: + return true + case *arrow.DictionaryType: + return arrow.IsInteger(offsetType.IndexType.ID()) && arrow.TypeEqual(offsetType.ValueType, arrow.PrimitiveTypes.Int16) + default: + return false + } +} + // Whether the storageType is compatible with TimestampWithOffset. // -// Returns (time_unit, ok). If ok is false, time unit is garbage. -func isDataTypeCompatible(storageType arrow.DataType) (arrow.TimeUnit, bool) { +// Returns (time_unit, offset_type, ok). If ok is false, time_unit and offset_type are garbage. +func isDataTypeCompatible(storageType arrow.DataType) (arrow.TimeUnit, arrow.DataType, bool) { timeUnit := arrow.Second + offsetType := arrow.PrimitiveTypes.Int16 switch t := storageType.(type) { case *arrow.StructType: if t.NumFields() != 2 { - return timeUnit, false + return timeUnit, offsetType, false } - maybeTimestamp := t.Field(0); - maybeOffset := t.Field(1); + maybeTimestamp := t.Field(0) + maybeOffset := t.Field(1) timestampOk := false switch timestampType := maybeTimestamp.Type.(type) { @@ -59,49 +71,84 @@ func isDataTypeCompatible(storageType arrow.DataType) (arrow.TimeUnit, bool) { default: } + offsetOk := isOffsetTypeOk(maybeOffset.Type) + ok := maybeTimestamp.Name == "timestamp" && timestampOk && !maybeTimestamp.Nullable && maybeOffset.Name == "offset_minutes" && - arrow.TypeEqual(maybeOffset.Type, arrow.PrimitiveTypes.Int16) && + offsetOk && !maybeOffset.Nullable - return timeUnit, ok + return timeUnit, maybeOffset.Type, ok default: - return timeUnit, false + return timeUnit, offsetType, false } } - // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to -// Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=Int16), where T is any TimeUnit. -func NewTimestampWithOffsetType(unit arrow.TimeUnit) *TimestampWithOffsetType { +// Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=O), where T is any TimeUnit and O is a valid offset type. +// +// The error will be populated if the data type is not a valid encoding of the offsets field. +func NewTimestampWithOffsetType(unit arrow.TimeUnit, offsetType arrow.DataType) (*TimestampWithOffsetType, error) { + if !isOffsetTypeOk(offsetType) { + return nil, errors.New(fmt.Sprintf("Invalid offset type %s", offsetType)) + } + return &TimestampWithOffsetType{ ExtensionBase: arrow.ExtensionBase{ Storage: arrow.StructOf( arrow.Field{ Name: "timestamp", Type: &arrow.TimestampType{ - Unit: unit, + Unit: unit, TimeZone: "UTC", }, Nullable: false, }, arrow.Field{ - Name: "offset_minutes", - Type: arrow.PrimitiveTypes.Int16, + Name: "offset_minutes", + Type: offsetType, Nullable: false, }, ), }, + }, nil +} + + +// NewTimestampWithOffsetTypePrimitiveEncoded creates a new TimestampWithOffsetType with the underlying storage type set correctly to +// Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=Int16), where T is any TimeUnit. +func NewTimestampWithOffsetTypePrimitiveEncoded(unit arrow.TimeUnit) *TimestampWithOffsetType { + v, _ := NewTimestampWithOffsetType(unit, arrow.PrimitiveTypes.Int16) + // SAFETY: This should never error as Int16 is always a valid offset type + + return v +} + +// NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to +// Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=Dictionary(I, Int16)), where T is any TimeUnit and I is a +// valid Dictionary index type. +// +// The error will be populated if the index is not a valid dictionary-encoding index type. +func NewTimestampWithOffsetTypeDictionaryEncoded(unit arrow.TimeUnit, index arrow.DataType) (*TimestampWithOffsetType, error) { + offsetType := arrow.DictionaryType{ + IndexType: index, + ValueType: arrow.PrimitiveTypes.Int16, + Ordered: false, } + return NewTimestampWithOffsetType(unit, &offsetType) } -func (b *TimestampWithOffsetType) ArrayType() reflect.Type { return reflect.TypeOf(TimestampWithOffsetArray{}) } +func (b *TimestampWithOffsetType) ArrayType() reflect.Type { + return reflect.TypeOf(TimestampWithOffsetArray{}) +} func (b *TimestampWithOffsetType) ExtensionName() string { return "arrow.timestamp_with_offset" } -func (b *TimestampWithOffsetType) String() string { return fmt.Sprintf("extension<%s>", b.ExtensionName()) } +func (b *TimestampWithOffsetType) String() string { + return fmt.Sprintf("extension<%s>", b.ExtensionName()) +} func (e *TimestampWithOffsetType) MarshalJSON() ([]byte, error) { return []byte(fmt.Sprintf(`{"name":"%s","metadata":%s}`, e.ExtensionName(), e.Serialize())), nil @@ -110,23 +157,30 @@ func (e *TimestampWithOffsetType) MarshalJSON() ([]byte, error) { func (b *TimestampWithOffsetType) Serialize() string { return "" } func (b *TimestampWithOffsetType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { - timeUnit, ok := isDataTypeCompatible(storageType) + timeUnit, offsetType, ok := isDataTypeCompatible(storageType) if !ok { return nil, fmt.Errorf("invalid storage type for TimestampWithOffsetType: %s", storageType.Name()) } - return NewTimestampWithOffsetType(timeUnit), nil + + v, _ := NewTimestampWithOffsetType(timeUnit, offsetType) + // SAFETY: the offsetType has already been checked by isDataTypeCompatible, so we can ignore the error + + return v, nil } func (b *TimestampWithOffsetType) ExtensionEquals(other arrow.ExtensionType) bool { return b.ExtensionName() == other.ExtensionName() } -func (b *TimestampWithOffsetType) TimeUnit() arrow.TimeUnit { +func (b *TimestampWithOffsetType) TimeUnit() arrow.TimeUnit { return b.ExtensionBase.Storage.(*arrow.StructType).Fields()[0].Type.(*arrow.TimestampType).TimeUnit() } func (b *TimestampWithOffsetType) NewBuilder(mem memory.Allocator) array.Builder { - return NewTimestampWithOffsetBuilder(mem, b.TimeUnit()) + v, _ := NewTimestampWithOffsetBuilder(mem, b.TimeUnit(), arrow.PrimitiveTypes.Int16) + // SAFETY: This will never error as Int16 is always a valid type for the offset field + + return v } // TimestampWithOffsetArray is a simple array of struct @@ -153,13 +207,13 @@ func (a *TimestampWithOffsetArray) String() string { } func timeFromFieldValues(utcTimestamp arrow.Timestamp, offsetMinutes int16, unit arrow.TimeUnit) time.Time { - hours := offsetMinutes / 60; + hours := offsetMinutes / 60 minutes := offsetMinutes % 60 if minutes < 0 { minutes = -minutes } - loc := time.FixedZone(fmt.Sprintf("UTC%+03d:%02d", hours, minutes), int(offsetMinutes) * 60) + loc := time.FixedZone(fmt.Sprintf("UTC%+03d:%02d", hours, minutes), int(offsetMinutes)*60) return utcTimestamp.ToTime(unit).In(loc) } @@ -182,11 +236,18 @@ func (a *TimestampWithOffsetArray) rawValueUnsafe(i int) (arrow.Timestamp, int16 timestampField := structs.Field(0) timestamps := timestampField.(*array.Timestamp) - offsets := structs.Field(1).(*array.Int16) timeUnit := timestampField.DataType().(*arrow.TimestampType).Unit - utcTimestamp := timestamps.Value(i) - offsetMinutes := offsets.Value(i) + utcTimestamp := timestamps.Value(i) + + var offsetMinutes int16 + + switch offsets := structs.Field(1).(type) { + case *array.Int16: + offsetMinutes = offsets.Value(i) + case *array.Dictionary: + offsetMinutes = offsets.Dictionary().(*array.Int16).Value(offsets.GetValueIndex(i)) + } return utcTimestamp, offsetMinutes, timeUnit } @@ -243,18 +304,25 @@ type TimestampWithOffsetBuilder struct { *array.ExtensionBuilder // The layout used to parse any timestamps from strings. Defaults to time.RFC3339 - Layout string - unit arrow.TimeUnit + Layout string + unit arrow.TimeUnit + offsetType arrow.DataType } // NewTimestampWithOffsetBuilder creates a new TimestampWithOffsetBuilder, exposing a convenient and efficient interface // for writing time.Time values to the underlying storage array. -func NewTimestampWithOffsetBuilder(mem memory.Allocator, unit arrow.TimeUnit) *TimestampWithOffsetBuilder { - return &TimestampWithOffsetBuilder{ - unit: unit, - Layout: time.RFC3339, - ExtensionBuilder: array.NewExtensionBuilder(mem, NewTimestampWithOffsetType(unit)), +func NewTimestampWithOffsetBuilder(mem memory.Allocator, unit arrow.TimeUnit, offsetType arrow.DataType) (*TimestampWithOffsetBuilder, error) { + dataType, err := NewTimestampWithOffsetType(unit, offsetType) + if err != nil { + return nil, err } + + return &TimestampWithOffsetBuilder{ + unit: unit, + offsetType: offsetType, + Layout: time.RFC3339, + ExtensionBuilder: array.NewExtensionBuilder(mem, dataType), + }, nil } func (b *TimestampWithOffsetBuilder) Append(v time.Time) { @@ -263,7 +331,13 @@ func (b *TimestampWithOffsetBuilder) Append(v time.Time) { structBuilder.Append(true) structBuilder.FieldBuilder(0).(*array.TimestampBuilder).Append(timestamp) - structBuilder.FieldBuilder(1).(*array.Int16Builder).Append(int16(offsetMinutes)) + + switch offsets := structBuilder.FieldBuilder(1).(type) { + case *array.Int16Builder: + offsets.Append(int16(offsetMinutes)) + case *array.Int16DictionaryBuilder: + offsets.Append(int16(offsetMinutes)) + } } func (b *TimestampWithOffsetBuilder) UnsafeAppend(v time.Time) { @@ -272,10 +346,16 @@ func (b *TimestampWithOffsetBuilder) UnsafeAppend(v time.Time) { structBuilder.Append(true) structBuilder.FieldBuilder(0).(*array.TimestampBuilder).UnsafeAppend(timestamp) - structBuilder.FieldBuilder(1).(*array.Int16Builder).UnsafeAppend(int16(offsetMinutes)) + + switch offsets := structBuilder.FieldBuilder(1).(type) { + case *array.Int16Builder: + offsets.UnsafeAppend(int16(offsetMinutes)) + case *array.Int16DictionaryBuilder: + offsets.Append(int16(offsetMinutes)) + } } -// By default, this will try to parse the string using the RFC3339 layout. +// By default, this will try to parse the string using the RFC3339 layout. // // You can change the default layout by using builder.SetLayout() func (b *TimestampWithOffsetBuilder) AppendValueFromString(s string) error { @@ -295,22 +375,36 @@ func (b *TimestampWithOffsetBuilder) AppendValueFromString(s string) error { func (b *TimestampWithOffsetBuilder) AppendValues(values []time.Time, valids []bool) { structBuilder := b.ExtensionBuilder.Builder.(*array.StructBuilder) - timestamps := structBuilder.FieldBuilder(0).(*array.TimestampBuilder); - offsets := structBuilder.FieldBuilder(1).(*array.Int16Builder); + timestamps := structBuilder.FieldBuilder(0).(*array.TimestampBuilder) structBuilder.AppendValues(valids) - - for i, v := range values { - timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) - - // SAFETY: by this point we know all buffers have available space given the earlier - // call to structBuilder.AppendValues which calls Reserve internally - if valids[i] { - timestamps.UnsafeAppend(timestamp) - offsets.UnsafeAppend(offsetMinutes) - } else { - timestamps.UnsafeAppendBoolToBitmap(false) - offsets.UnsafeAppendBoolToBitmap(false) + // SAFETY: by this point we know all buffers have available space given the earlier + // call to structBuilder.AppendValues which calls Reserve internally, so it's OK to + // call UnsafeAppend on inner builders + + switch offsets := structBuilder.FieldBuilder(1).(type) { + case *array.Int16Builder: + for i, v := range values { + timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) + if valids[i] { + timestamps.UnsafeAppend(timestamp) + offsets.UnsafeAppend(offsetMinutes) + } else { + timestamps.UnsafeAppendBoolToBitmap(false) + offsets.UnsafeAppendBoolToBitmap(false) + } + } + case *array.Int16DictionaryBuilder: + for i, v := range values { + timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) + if valids[i] { + timestamps.UnsafeAppend(timestamp) + // TODO: I was here, this needs to be equivalent to UnsafeAppend + offsets.Append(offsetMinutes) + } else { + timestamps.UnsafeAppendBoolToBitmap(false) + offsets.UnsafeAppendBoolToBitmap(false) + } } } } @@ -350,5 +444,5 @@ var ( _ arrow.ExtensionType = (*TimestampWithOffsetType)(nil) _ array.CustomExtensionBuilder = (*TimestampWithOffsetType)(nil) _ array.ExtensionArray = (*TimestampWithOffsetArray)(nil) - _ array.Builder = (*TimestampWithOffsetBuilder)(nil) + _ array.Builder = (*TimestampWithOffsetBuilder)(nil) ) diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index 9c198afd..fc08375c 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -18,10 +18,7 @@ package extensions_test import ( "bytes" - _ "bytes" "fmt" - "strings" - _ "strings" "testing" "time" @@ -29,7 +26,6 @@ import ( "github.com/apache/arrow-go/v18/arrow/array" "github.com/apache/arrow-go/v18/arrow/extensions" "github.com/apache/arrow-go/v18/arrow/ipc" - _ "github.com/apache/arrow-go/v18/arrow/ipc" "github.com/apache/arrow-go/v18/arrow/memory" "github.com/apache/arrow-go/v18/internal/json" "github.com/stretchr/testify/assert" @@ -46,8 +42,33 @@ var testDate2 = testDate1.In(testZone1) var testZone2 = time.FixedZone("UTC+06:00", +6*60*60) var testDate3 = testDate1.In(testZone2) -func TestTimestampWithOffsetTypeBasics(t *testing.T) { - typ := extensions.NewTimestampWithOffsetType(testTimeUnit) +func dict(index arrow.DataType) arrow.DataType { + return &arrow.DictionaryType{ + IndexType: index, + ValueType: arrow.PrimitiveTypes.Int16, + Ordered: false, + } +} + +// All tests use this in a for loop to make sure everything works for every possible +// encoding of offsets (primitive, dictionary, run-end) +var allAllowedOffsetTypes = []arrow.DataType{ + // primitive offsetType + arrow.PrimitiveTypes.Int16, + + // dict-encoded offsetType + dict(arrow.PrimitiveTypes.Uint8), + dict(arrow.PrimitiveTypes.Uint16), + dict(arrow.PrimitiveTypes.Uint32), + dict(arrow.PrimitiveTypes.Uint64), + dict(arrow.PrimitiveTypes.Int8), + dict(arrow.PrimitiveTypes.Int16), + dict(arrow.PrimitiveTypes.Int32), + dict(arrow.PrimitiveTypes.Int64), +} + +func TestTimestampWithOffsetTypePrimitiveBasics(t *testing.T) { + typ := extensions.NewTimestampWithOffsetTypePrimitiveEncoded(testTimeUnit) assert.Equal(t, "arrow.timestamp_with_offset", typ.ExtensionName()) assert.True(t, typ.ExtensionEquals(typ)) @@ -74,136 +95,208 @@ func TestTimestampWithOffsetTypeBasics(t *testing.T) { assert.Equal(t, "extension", typ.String()) } +func TestTimestampWithOffsetTypeDictionaryEncodedBasics(t *testing.T) { + invalidIndexType := arrow.PrimitiveTypes.Float32 + _, err := extensions.NewTimestampWithOffsetTypeDictionaryEncoded(testTimeUnit, invalidIndexType) + assert.True(t, err != nil, "Err should not be nil if index type is invalid dict key") + + indexTypes := []arrow.DataType{ + arrow.PrimitiveTypes.Uint8, + arrow.PrimitiveTypes.Uint16, + arrow.PrimitiveTypes.Uint32, + arrow.PrimitiveTypes.Uint64, + arrow.PrimitiveTypes.Int8, + arrow.PrimitiveTypes.Int16, + arrow.PrimitiveTypes.Int32, + arrow.PrimitiveTypes.Int64, + }; + + for _, indexType := range indexTypes { + typ, err := extensions.NewTimestampWithOffsetTypeDictionaryEncoded(testTimeUnit, indexType) + assert.True(t, err == nil, "Err should be nil") + + assert.Equal(t, "arrow.timestamp_with_offset", typ.ExtensionName()) + assert.True(t, typ.ExtensionEquals(typ)) + + assert.True(t, arrow.TypeEqual(typ, typ)) + assert.True(t, arrow.TypeEqual( + arrow.StructOf( + arrow.Field{ + Name: "timestamp", + Type: &arrow.TimestampType{ + Unit: testTimeUnit, + TimeZone: "UTC", + }, + Nullable: false, + }, + arrow.Field{ + Name: "offset_minutes", + Type: dict(indexType), + Nullable: false, + }, + ), + typ.StorageType())) + + assert.Equal(t, "extension", typ.String()) + } +} + func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - builder := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit) - builder.Append(testDate1) - builder.AppendNull() - builder.Append(testDate2) - builder.Append(testDate3) - - // it should build the array with the correct size - arr := builder.NewArray() - typedArr := arr.(*extensions.TimestampWithOffsetArray) - assert.Equal(t, 4, arr.Data().Len()) - defer arr.Release() - - // typedArr.Value(i) should return values adjusted for their original timezone - assert.Equal(t, testDate1, typedArr.Value(0)) - assert.Equal(t, testDate2, typedArr.Value(2)) - assert.Equal(t, testDate3, typedArr.Value(3)) - - // storage TimeUnit should be the same as we pass in to the builder, and storage timezone should be UTC - timestampStructField := typedArr.Storage().(*array.Struct).Field(0) - timestampStructDataType := timestampStructField.DataType().(*arrow.TimestampType) - assert.Equal(t, timestampStructDataType.Unit, testTimeUnit) - assert.Equal(t, timestampStructDataType.TimeZone, "UTC") - - // stored values should be equivalent to the raw values in UTC - timestampsArr := timestampStructField.(*array.Timestamp) - assert.Equal(t, testDate1.In(time.UTC), timestampsArr.Value(0).ToTime(testTimeUnit)) - assert.Equal(t, testDate2.In(time.UTC), timestampsArr.Value(2).ToTime(testTimeUnit)) - assert.Equal(t, testDate3.In(time.UTC), timestampsArr.Value(3).ToTime(testTimeUnit)) - - // the array should encode itself as JSON and string - arrStr := arr.String() - assert.Equal(t, fmt.Sprintf(`["%[1]s" (null) "%[2]s" "%[3]s"]`, testDate1, testDate2, testDate3), arrStr) - jsonStr, err := json.Marshal(arr) - assert.NoError(t, err) - - // roundtripping from JSON with array.FromJSON should work - roundtripped, _, err := array.FromJSON(mem, extensions.NewTimestampWithOffsetType(testTimeUnit), bytes.NewReader(jsonStr)) - defer roundtripped.Release() + // NOTE: we need to compare the arrays parsed from JSON with a primitive-encoded array, since that will always + // use that encoding (there is no way to pass a flag to array.FromJSON to say explicitly what storage type you want) + primitiveBuilder, err := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, arrow.PrimitiveTypes.Int16) assert.NoError(t, err) - assert.Truef(t, array.Equal(arr, roundtripped), "expected %s got %s", arr, roundtripped) + primitiveBuilder.Append(testDate1) + primitiveBuilder.AppendNull() + primitiveBuilder.Append(testDate2) + primitiveBuilder.Append(testDate3) + jsonComparisonArr := primitiveBuilder.NewArray() + defer jsonComparisonArr.Release() + + for _, offsetType := range allAllowedOffsetTypes { + builder, _ := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) + + builder.Append(testDate1) + builder.AppendNull() + builder.Append(testDate2) + builder.Append(testDate3) + + // it should build the array with the correct size + arr := builder.NewArray() + typedArr := arr.(*extensions.TimestampWithOffsetArray) + assert.Equal(t, 4, arr.Data().Len()) + defer arr.Release() + + // typedArr.Value(i) should return values adjusted for their original timezone + assert.Equal(t, testDate1, typedArr.Value(0)) + assert.Equal(t, testDate2, typedArr.Value(2)) + assert.Equal(t, testDate3, typedArr.Value(3)) + + // storage TimeUnit should be the same as we pass in to the builder, and storage timezone should be UTC + timestampStructField := typedArr.Storage().(*array.Struct).Field(0) + timestampStructDataType := timestampStructField.DataType().(*arrow.TimestampType) + assert.Equal(t, timestampStructDataType.Unit, testTimeUnit) + assert.Equal(t, timestampStructDataType.TimeZone, "UTC") + + // stored values should be equivalent to the raw values in UTC + timestampsArr := timestampStructField.(*array.Timestamp) + assert.Equal(t, testDate1.In(time.UTC), timestampsArr.Value(0).ToTime(testTimeUnit)) + assert.Equal(t, testDate2.In(time.UTC), timestampsArr.Value(2).ToTime(testTimeUnit)) + assert.Equal(t, testDate3.In(time.UTC), timestampsArr.Value(3).ToTime(testTimeUnit)) + + // the array should encode itself as JSON and string + arrStr := arr.String() + assert.Equal(t, fmt.Sprintf(`["%[1]s" (null) "%[2]s" "%[3]s"]`, testDate1, testDate2, testDate3), arrStr) + jsonStr, err := json.Marshal(arr) + assert.NoError(t, err) + + // roundtripping from JSON with array.FromJSON should work + expectedDataType, _ := extensions.NewTimestampWithOffsetType(testTimeUnit, offsetType) + roundtripped, _, err := array.FromJSON(mem, expectedDataType, bytes.NewReader(jsonStr)) + defer roundtripped.Release() + assert.NoError(t, err) + assert.Truef(t, array.Equal(jsonComparisonArr, roundtripped), "expected %s\n\ngot %s", jsonComparisonArr, roundtripped) + } } func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { - schema := arrow.NewSchema([]arrow.Field{ - { - Name: "timestamp_with_offset", - Nullable: true, - Type: extensions.NewTimestampWithOffsetType(testTimeUnit), - }, - }, nil) - builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) - defer builder.Release() - - fieldBuilder := builder.Field(0).(*extensions.TimestampWithOffsetBuilder) - - // append a simple time.Time - fieldBuilder.Append(testDate1) - - // append a null and 2 time.Time all at once - values := []time.Time{ - time.Unix(0, 0).In(time.UTC), - testDate2, - testDate3, - } - valids := []bool{false, true, true} - fieldBuilder.AppendValues(values, valids) + for _, offsetType := range allAllowedOffsetTypes { + dataType, _ := extensions.NewTimestampWithOffsetType(testTimeUnit, offsetType) + schema := arrow.NewSchema([]arrow.Field{ + { + Name: "timestamp_with_offset", + Nullable: true, + Type: dataType, + }, + }, nil) + builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) + defer builder.Release() + + fieldBuilder := builder.Field(0).(*extensions.TimestampWithOffsetBuilder) + + // append a simple time.Time + fieldBuilder.Append(testDate1) + + // append a null and 2 time.Time all at once + values := []time.Time{ + time.Unix(0, 0).In(time.UTC), + testDate2, + testDate3, + } + valids := []bool{false, true, true} + fieldBuilder.AppendValues(values, valids) - // append a value from RFC3339 string - fieldBuilder.AppendValueFromString(testDate1.Format(time.RFC3339)) + // append a value from RFC3339 string + fieldBuilder.AppendValueFromString(testDate1.Format(time.RFC3339)) - // append value formatted in a different string layout - fieldBuilder.Layout = time.RFC3339Nano - fieldBuilder.AppendValueFromString(testDate2.Format(time.RFC3339Nano)) + // append value formatted in a different string layout + fieldBuilder.Layout = time.RFC3339Nano + fieldBuilder.AppendValueFromString(testDate2.Format(time.RFC3339Nano)) - record := builder.NewRecordBatch() + record := builder.NewRecordBatch() - // Record batch should JSON-encode values containing per-row timezone info - json, err := record.MarshalJSON() - require.NoError(t, err) - expect := `[{"timestamp_with_offset":"2025-01-01T00:00:00Z"} + // Record batch should JSON-encode values containing per-row timezone info + json, err := record.MarshalJSON() + require.NoError(t, err) + expect := `[{"timestamp_with_offset":"2025-01-01T00:00:00Z"} ,{"timestamp_with_offset":null} ,{"timestamp_with_offset":"2024-12-31T16:00:00-08:00"} ,{"timestamp_with_offset":"2025-01-01T06:00:00+06:00"} ,{"timestamp_with_offset":"2025-01-01T00:00:00Z"} ,{"timestamp_with_offset":"2024-12-31T16:00:00-08:00"} ]` - require.Equal(t, expect, string(json)) - - // Record batch roundtrip to JSON should work - roundtripped, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(json)) - require.NoError(t, err) - defer roundtripped.Release() - require.Equal(t, schema, roundtripped.Schema()) - assert.Truef(t, array.RecordEqual(record, roundtripped), "expected %s\n\ngot %s", record, roundtripped) + require.Equal(t, expect, string(json)) + + // Record batch roundtrip to JSON should work + roundtripped, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(json)) + require.NoError(t, err) + defer roundtripped.Release() + require.Equal(t, schema, roundtripped.Schema()) + assert.Truef(t, array.RecordEqual(record, roundtripped), "expected %s\n\ngot %s", record, roundtripped) + } } func TestTimestampWithOffsetTypeBatchIPCRoundTrip(t *testing.T) { - raw := `["2025-01-01T00:00:00Z",null,"2024-12-31T16:00:00-08:00","2025-01-01T06:00:00+06:00","2025-01-01T00:00:00Z","2024-12-31T16:00:00-08:00"]` - typ := extensions.NewTimestampWithOffsetType(testTimeUnit) + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) - arr, _, err := array.FromJSON(memory.DefaultAllocator, typ, strings.NewReader(raw)) - require.NoError(t, err) - defer arr.Release() + for _, offsetType := range allAllowedOffsetTypes { + builder, _ := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) + builder.Append(testDate1) + builder.AppendNull() + builder.Append(testDate2) + builder.Append(testDate3) + arr := builder.NewArray() + defer arr.Release() - batch := array.NewRecordBatch(arrow.NewSchema([]arrow.Field{{Name: "timestamp_with_offset", Type: typ, Nullable: true}}, nil), []arrow.Array{arr}, -1) - defer batch.Release() + typ, _ := extensions.NewTimestampWithOffsetType(testTimeUnit, offsetType) - var written arrow.RecordBatch - { - var buf bytes.Buffer - wr := ipc.NewWriter(&buf, ipc.WithSchema(batch.Schema())) - require.NoError(t, wr.Write(batch)) - require.NoError(t, wr.Close()) + batch := array.NewRecordBatch(arrow.NewSchema([]arrow.Field{{Name: "timestamp_with_offset", Type: typ, Nullable: true}}, nil), []arrow.Array{arr}, -1) + defer batch.Release() - rdr, err := ipc.NewReader(&buf) - require.NoError(t, err) - written, err = rdr.Read() - require.NoError(t, err) - written.Retain() - defer written.Release() - rdr.Release() + var written arrow.RecordBatch + { + var buf bytes.Buffer + wr := ipc.NewWriter(&buf, ipc.WithSchema(batch.Schema())) + require.NoError(t, wr.Write(batch)) + require.NoError(t, wr.Close()) + + rdr, err := ipc.NewReader(&buf) + require.NoError(t, err) + written, err = rdr.Read() + require.NoError(t, err) + written.Retain() + defer written.Release() + rdr.Release() + } + + assert.Truef(t, batch.Schema().Equal(written.Schema()), "expected: %s\n\ngot: %s", + batch.Schema(), written.Schema()) + + assert.Truef(t, array.RecordEqual(batch, written), "expected: %s\n\ngot: %s", + batch, written) } - - assert.Truef(t, batch.Schema().Equal(written.Schema()), "expected: %s\n\ngot: %s", - batch.Schema(), written.Schema()) - - assert.Truef(t, array.RecordEqual(batch, written), "expected: %s\n\ngot: %s", - batch, written) } From 8c4aeb55365a904eff33a716961f8e8424b0bd07 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Mon, 22 Dec 2025 14:13:57 +0200 Subject: [PATCH 11/39] Allow run-end encoding of `TimestampWithOffset` --- arrow/extensions/timestamp_with_offset.go | 79 +++++++++++++++++-- .../extensions/timestamp_with_offset_test.go | 52 ++++++++++++ 2 files changed, 126 insertions(+), 5 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 6d512b8d..f5d7b3f3 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -19,6 +19,7 @@ package extensions import ( "errors" "fmt" + "math" "reflect" "strings" "time" @@ -41,6 +42,13 @@ func isOffsetTypeOk(offsetType arrow.DataType) bool { return true case *arrow.DictionaryType: return arrow.IsInteger(offsetType.IndexType.ID()) && arrow.TypeEqual(offsetType.ValueType, arrow.PrimitiveTypes.Int16) + case *arrow.RunEndEncodedType: + return offsetType.ValidRunEndsType(offsetType.RunEnds()) && + arrow.TypeEqual(offsetType.Encoded(), arrow.PrimitiveTypes.Int16) + // FIXME: Technically this should be non-nullable, but a Arrow IPC does not deserialize + // ValueNullable properly, so enforcing this here would always fail when reading from an IPC + // stream + // !offsetType.ValueNullable default: return false } @@ -140,6 +148,21 @@ func NewTimestampWithOffsetTypeDictionaryEncoded(unit arrow.TimeUnit, index arro return NewTimestampWithOffsetType(unit, &offsetType) } +// NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to +// Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=RunEndEncoded(E, Int16)), where T is any TimeUnit and E is a +// valid run-ends type. +// +// The error will be populated if runEnds is not a valid run-end encoding run-ends type. +func NewTimestampWithOffsetTypeRunEndEncoded(unit arrow.TimeUnit, runEnds arrow.DataType) (*TimestampWithOffsetType, error) { + offsetType := arrow.RunEndEncodedOf(runEnds, arrow.PrimitiveTypes.Int16) + if !offsetType.ValidRunEndsType(runEnds) { + return nil, errors.New(fmt.Sprintf("Invalid run-ends type %s", runEnds)) + } + + return NewTimestampWithOffsetType(unit, offsetType) +} + + func (b *TimestampWithOffsetType) ArrayType() reflect.Type { return reflect.TypeOf(TimestampWithOffsetArray{}) } @@ -247,6 +270,8 @@ func (a *TimestampWithOffsetArray) rawValueUnsafe(i int) (arrow.Timestamp, int16 offsetMinutes = offsets.Value(i) case *array.Dictionary: offsetMinutes = offsets.Dictionary().(*array.Int16).Value(offsets.GetValueIndex(i)) + case *array.RunEndEncoded: + offsetMinutes = offsets.Values().(*array.Int16).Value(offsets.GetPhysicalIndex(i)) } return utcTimestamp, offsetMinutes, timeUnit @@ -262,6 +287,7 @@ func (a *TimestampWithOffsetArray) Value(i int) time.Time { func (a *TimestampWithOffsetArray) Values() []time.Time { values := make([]time.Time, a.Len()) + // TODO: optimize for run-end encoding for i := range a.Len() { val := a.Value(i) values[i] = val @@ -280,6 +306,7 @@ func (a *TimestampWithOffsetArray) ValueStr(i int) string { func (a *TimestampWithOffsetArray) MarshalJSON() ([]byte, error) { values := make([]interface{}, a.Len()) + // TODO: optimize for run-end encoding for i := 0; i < a.Len(); i++ { if a.IsValid(i) { utcTimestamp, offsetMinutes, timeUnit := a.rawValueUnsafe(i) @@ -307,6 +334,8 @@ type TimestampWithOffsetBuilder struct { Layout string unit arrow.TimeUnit offsetType arrow.DataType + // lastOffset is only used to determine when to start new runs with run-end encoded offsets + lastOffset int16 } // NewTimestampWithOffsetBuilder creates a new TimestampWithOffsetBuilder, exposing a convenient and efficient interface @@ -320,6 +349,7 @@ func NewTimestampWithOffsetBuilder(mem memory.Allocator, unit arrow.TimeUnit, of return &TimestampWithOffsetBuilder{ unit: unit, offsetType: offsetType, + lastOffset: math.MaxInt16, Layout: time.RFC3339, ExtensionBuilder: array.NewExtensionBuilder(mem, dataType), }, nil @@ -327,6 +357,7 @@ func NewTimestampWithOffsetBuilder(mem memory.Allocator, unit arrow.TimeUnit, of func (b *TimestampWithOffsetBuilder) Append(v time.Time) { timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) + offsetMinutes16 := int16(offsetMinutes) structBuilder := b.ExtensionBuilder.Builder.(*array.StructBuilder) structBuilder.Append(true) @@ -334,14 +365,25 @@ func (b *TimestampWithOffsetBuilder) Append(v time.Time) { switch offsets := structBuilder.FieldBuilder(1).(type) { case *array.Int16Builder: - offsets.Append(int16(offsetMinutes)) + offsets.Append(offsetMinutes16) case *array.Int16DictionaryBuilder: - offsets.Append(int16(offsetMinutes)) + offsets.Append(offsetMinutes16) + case *array.RunEndEncodedBuilder: + if offsetMinutes != b.lastOffset { + offsets.Append(1) + offsets.ValueBuilder().(*array.Int16Builder).Append(offsetMinutes16) + } else { + offsets.ContinueRun(1) + } + + b.lastOffset = offsetMinutes16 } + } func (b *TimestampWithOffsetBuilder) UnsafeAppend(v time.Time) { timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) + offsetMinutes16 := int16(offsetMinutes) structBuilder := b.ExtensionBuilder.Builder.(*array.StructBuilder) structBuilder.Append(true) @@ -349,9 +391,18 @@ func (b *TimestampWithOffsetBuilder) UnsafeAppend(v time.Time) { switch offsets := structBuilder.FieldBuilder(1).(type) { case *array.Int16Builder: - offsets.UnsafeAppend(int16(offsetMinutes)) + offsets.UnsafeAppend(offsetMinutes16) case *array.Int16DictionaryBuilder: - offsets.Append(int16(offsetMinutes)) + offsets.Append(offsetMinutes16) + case *array.RunEndEncodedBuilder: + if offsetMinutes != b.lastOffset { + offsets.Append(1) + offsets.ValueBuilder().(*array.Int16Builder).Append(offsetMinutes16) + } else { + offsets.ContinueRun(1) + } + + b.lastOffset = offsetMinutes16 } } @@ -399,13 +450,31 @@ func (b *TimestampWithOffsetBuilder) AppendValues(values []time.Time, valids []b timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) if valids[i] { timestamps.UnsafeAppend(timestamp) - // TODO: I was here, this needs to be equivalent to UnsafeAppend offsets.Append(offsetMinutes) } else { timestamps.UnsafeAppendBoolToBitmap(false) offsets.UnsafeAppendBoolToBitmap(false) } } + case *array.RunEndEncodedBuilder: + offsetValuesBuilder := offsets.ValueBuilder().(*array.Int16Builder) + for i, v := range values { + timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) + if valids[i] { + timestamps.UnsafeAppend(timestamp) + offsetMinutes16 := int16(offsetMinutes) + if offsetMinutes != b.lastOffset { + offsets.Append(1) + offsetValuesBuilder.Append(offsetMinutes16) + } else { + offsets.ContinueRun(1) + } + b.lastOffset = offsetMinutes16 + } else { + timestamps.UnsafeAppendBoolToBitmap(false) + offsets.UnsafeAppendBoolToBitmap(false) + } + } } } diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index fc08375c..f7e6c278 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -50,6 +50,12 @@ func dict(index arrow.DataType) arrow.DataType { } } +func ree(runEnds arrow.DataType) arrow.DataType { + v := arrow.RunEndEncodedOf(runEnds, arrow.PrimitiveTypes.Int16) + v.ValueNullable = false + return v +} + // All tests use this in a for loop to make sure everything works for every possible // encoding of offsets (primitive, dictionary, run-end) var allAllowedOffsetTypes = []arrow.DataType{ @@ -65,6 +71,11 @@ var allAllowedOffsetTypes = []arrow.DataType{ dict(arrow.PrimitiveTypes.Int16), dict(arrow.PrimitiveTypes.Int32), dict(arrow.PrimitiveTypes.Int64), + + // run-end encoded offsetType + ree(arrow.PrimitiveTypes.Int16), + ree(arrow.PrimitiveTypes.Int32), + ree(arrow.PrimitiveTypes.Int64), } func TestTimestampWithOffsetTypePrimitiveBasics(t *testing.T) { @@ -141,6 +152,47 @@ func TestTimestampWithOffsetTypeDictionaryEncodedBasics(t *testing.T) { } } +func TestTimestampWithOffsetTypeRunEndEncodedBasics(t *testing.T) { + invalidRunEndsType := arrow.PrimitiveTypes.Float32 + _, err := extensions.NewTimestampWithOffsetTypeRunEndEncoded(testTimeUnit, invalidRunEndsType) + assert.True(t, err != nil, "Err should not be nil if run ends type is invalid") + + runEndsTypes := []arrow.DataType{ + arrow.PrimitiveTypes.Int16, + arrow.PrimitiveTypes.Int32, + arrow.PrimitiveTypes.Int64, + }; + + for _, indexType := range runEndsTypes { + typ, err := extensions.NewTimestampWithOffsetTypeRunEndEncoded(testTimeUnit, indexType) + assert.True(t, err == nil, "Err should be nil") + + assert.Equal(t, "arrow.timestamp_with_offset", typ.ExtensionName()) + assert.True(t, typ.ExtensionEquals(typ)) + + assert.True(t, arrow.TypeEqual(typ, typ)) + assert.True(t, arrow.TypeEqual( + arrow.StructOf( + arrow.Field{ + Name: "timestamp", + Type: &arrow.TimestampType{ + Unit: testTimeUnit, + TimeZone: "UTC", + }, + Nullable: false, + }, + arrow.Field{ + Name: "offset_minutes", + Type: ree(indexType), + Nullable: false, + }, + ), + typ.StorageType())) + + assert.Equal(t, "extension", typ.String()) + } +} + func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) From 215dbd04aef0421ddb46aaa3c4bb05ced5079c54 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Mon, 22 Dec 2025 16:31:55 +0200 Subject: [PATCH 12/39] Optimize `Values()` and `MarshalJSON()` for REE Smartly iterate over offsets if they're run-end encoded instead of doing a binary search at every iteration. This makes the loops O(n) instead of O(n*logn). --- arrow/extensions/timestamp_with_offset.go | 79 +++++++++++++++++++---- 1 file changed, 65 insertions(+), 14 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index f5d7b3f3..6d282ea3 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -285,13 +285,70 @@ func (a *TimestampWithOffsetArray) Value(i int) time.Time { return timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) } +// Iterates over the array and calls the callback with the timestamp at each position. If it's null, +// the timestamp will be nil. +// +// This will iterate using the fastest method given the underlying storage array +func (a* TimestampWithOffsetArray) iterValues(callback func(i int, utcTimestamp *time.Time)) { + structs := a.Storage().(*array.Struct) + offsets := structs.Field(1) + if reeOffsets, isRee := offsets.(*array.RunEndEncoded); isRee { + timestampField := structs.Field(0) + timeUnit := timestampField.DataType().(*arrow.TimestampType).Unit + timestamps := timestampField.(*array.Timestamp) + + offsetValues := reeOffsets.Values().(*array.Int16) + offsetPhysicalIdx := 0 + + var getRunEnd (func(int) int) + switch arr := reeOffsets.RunEndsArr().(type) { + case *array.Int16: + getRunEnd = func(idx int) int { return int(arr.Value(idx)) } + case *array.Int32: + getRunEnd = func(idx int) int { return int(arr.Value(idx)) } + case *array.Int64: + getRunEnd = func(idx int) int { return int(arr.Value(idx)) } + } + + for i := 0; i < a.Len(); i++ { + if i >= getRunEnd(offsetPhysicalIdx) { + offsetPhysicalIdx += 1 + } + + timestamp := (*time.Time)(nil) + if a.IsValid(i) { + utcTimestamp := timestamps.Value(i) + offsetMinutes := offsetValues.Value(offsetPhysicalIdx) + v := timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) + timestamp = &v + } + + callback(i, timestamp) + } + } else { + for i := 0; i < a.Len(); i++ { + timestamp := (*time.Time)(nil) + if a.IsValid(i) { + utcTimestamp, offsetMinutes, timeUnit := a.rawValueUnsafe(i) + v := timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) + timestamp = &v + } + + callback(i, timestamp) + } + } +} + + func (a *TimestampWithOffsetArray) Values() []time.Time { values := make([]time.Time, a.Len()) - // TODO: optimize for run-end encoding - for i := range a.Len() { - val := a.Value(i) - values[i] = val - } + a.iterValues(func(i int, timestamp *time.Time) { + if timestamp == nil { + values[i] = time.Unix(0, 0) + } else { + values[i] = *timestamp + } + }) return values } @@ -306,15 +363,9 @@ func (a *TimestampWithOffsetArray) ValueStr(i int) string { func (a *TimestampWithOffsetArray) MarshalJSON() ([]byte, error) { values := make([]interface{}, a.Len()) - // TODO: optimize for run-end encoding - for i := 0; i < a.Len(); i++ { - if a.IsValid(i) { - utcTimestamp, offsetMinutes, timeUnit := a.rawValueUnsafe(i) - values[i] = timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) - } else { - values[i] = nil - } - } + a.iterValues(func(i int, timestamp *time.Time) { + values[i] = timestamp + }) return json.Marshal(values) } From faffeba0a7db6de2030739460c8ba6d8295fcb02 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Fri, 30 Jan 2026 12:43:22 +0100 Subject: [PATCH 13/39] General code smell patches Changed a lot of things based on Matt's suggestions. --- arrow/extensions/timestamp_with_offset.go | 183 +++++++++++----------- 1 file changed, 89 insertions(+), 94 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 6d282ea3..9cbe49d3 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -19,8 +19,10 @@ package extensions import ( "errors" "fmt" + "iter" "math" "reflect" + "slices" "strings" "time" @@ -57,41 +59,31 @@ func isOffsetTypeOk(offsetType arrow.DataType) bool { // Whether the storageType is compatible with TimestampWithOffset. // // Returns (time_unit, offset_type, ok). If ok is false, time_unit and offset_type are garbage. -func isDataTypeCompatible(storageType arrow.DataType) (arrow.TimeUnit, arrow.DataType, bool) { - timeUnit := arrow.Second - offsetType := arrow.PrimitiveTypes.Int16 - switch t := storageType.(type) { - case *arrow.StructType: - if t.NumFields() != 2 { - return timeUnit, offsetType, false - } - - maybeTimestamp := t.Field(0) - maybeOffset := t.Field(1) - - timestampOk := false - switch timestampType := maybeTimestamp.Type.(type) { - case *arrow.TimestampType: - if timestampType.TimeZone == "UTC" { - timestampOk = true - timeUnit = timestampType.TimeUnit() - } - default: - } +func isDataTypeCompatible(storageType arrow.DataType) (unit arrow.TimeUnit, offsetType arrow.DataType, ok bool) { + unit = arrow.Second + offsetType = arrow.PrimitiveTypes.Int16 + ok = false + + st, compat := storageType.(*arrow.StructType) + if !compat || st.NumFields() != 2 { + return + } - offsetOk := isOffsetTypeOk(maybeOffset.Type) + if ts, compat := st.Field(0).Type.(*arrow.TimestampType); compat && ts.TimeZone == "UTC" { + unit = ts.TimeUnit() + } else { + return + } - ok := maybeTimestamp.Name == "timestamp" && - timestampOk && - !maybeTimestamp.Nullable && - maybeOffset.Name == "offset_minutes" && - offsetOk && - !maybeOffset.Nullable + maybeOffset := st.Field(1) + offsetType = maybeOffset.Type - return timeUnit, maybeOffset.Type, ok - default: - return timeUnit, offsetType, false - } + ok = st.Field(0).Name == "timestamp" && + !st.Field(0).Nullable && + maybeOffset.Name == "offset_minutes" && + isOffsetTypeOk(offsetType) && + !maybeOffset.Nullable + return } // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to @@ -185,10 +177,7 @@ func (b *TimestampWithOffsetType) Deserialize(storageType arrow.DataType, data s return nil, fmt.Errorf("invalid storage type for TimestampWithOffsetType: %s", storageType.Name()) } - v, _ := NewTimestampWithOffsetType(timeUnit, offsetType) - // SAFETY: the offsetType has already been checked by isDataTypeCompatible, so we can ignore the error - - return v, nil + return NewTimestampWithOffsetType(timeUnit, offsetType) } func (b *TimestampWithOffsetType) ExtensionEquals(other arrow.ExtensionType) bool { @@ -196,7 +185,7 @@ func (b *TimestampWithOffsetType) ExtensionEquals(other arrow.ExtensionType) boo } func (b *TimestampWithOffsetType) TimeUnit() arrow.TimeUnit { - return b.ExtensionBase.Storage.(*arrow.StructType).Fields()[0].Type.(*arrow.TimestampType).TimeUnit() + return b.ExtensionBase.Storage.(*arrow.StructType).Field(0).Type.(*arrow.TimestampType).TimeUnit() } func (b *TimestampWithOffsetType) NewBuilder(mem memory.Allocator) array.Builder { @@ -245,6 +234,7 @@ func fieldValuesFromTime(t time.Time, unit arrow.TimeUnit) (arrow.Timestamp, int utc := t.UTC() naiveUtc := time.Date(t.Year(), t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), time.UTC) offsetMinutes := int16(naiveUtc.Sub(t).Minutes()) + // SAFETY: unit MUST have been validated to a valid arrow.TimeUnit value before // this function. Otherwise, ignoring this error is not safe. timestamp, _ := arrow.TimestampFromTime(utc, unit) @@ -285,71 +275,70 @@ func (a *TimestampWithOffsetArray) Value(i int) time.Time { return timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) } -// Iterates over the array and calls the callback with the timestamp at each position. If it's null, -// the timestamp will be nil. +// Iterates over the array returning the timestamp at each position. +// +// If the timestamp is null, the returned time will be the unix epoch. // // This will iterate using the fastest method given the underlying storage array -func (a* TimestampWithOffsetArray) iterValues(callback func(i int, utcTimestamp *time.Time)) { - structs := a.Storage().(*array.Struct) - offsets := structs.Field(1) - if reeOffsets, isRee := offsets.(*array.RunEndEncoded); isRee { - timestampField := structs.Field(0) - timeUnit := timestampField.DataType().(*arrow.TimestampType).Unit - timestamps := timestampField.(*array.Timestamp) - - offsetValues := reeOffsets.Values().(*array.Int16) - offsetPhysicalIdx := 0 - - var getRunEnd (func(int) int) - switch arr := reeOffsets.RunEndsArr().(type) { - case *array.Int16: - getRunEnd = func(idx int) int { return int(arr.Value(idx)) } - case *array.Int32: - getRunEnd = func(idx int) int { return int(arr.Value(idx)) } - case *array.Int64: - getRunEnd = func(idx int) int { return int(arr.Value(idx)) } - } - - for i := 0; i < a.Len(); i++ { - if i >= getRunEnd(offsetPhysicalIdx) { - offsetPhysicalIdx += 1 +func (a* TimestampWithOffsetArray) iterValues() iter.Seq[time.Time] { + return func(yield func(time.Time) bool) { + structs := a.Storage().(*array.Struct) + offsets := structs.Field(1) + if reeOffsets, isRee := offsets.(*array.RunEndEncoded); isRee { + timestampField := structs.Field(0) + timeUnit := timestampField.DataType().(*arrow.TimestampType).Unit + timestamps := timestampField.(*array.Timestamp) + + offsetValues := reeOffsets.Values().(*array.Int16) + offsetPhysicalIdx := 0 + + var getRunEnd (func(int) int) + switch arr := reeOffsets.RunEndsArr().(type) { + case *array.Int16: + getRunEnd = func(idx int) int { return int(arr.Value(idx)) } + case *array.Int32: + getRunEnd = func(idx int) int { return int(arr.Value(idx)) } + case *array.Int64: + getRunEnd = func(idx int) int { return int(arr.Value(idx)) } } - timestamp := (*time.Time)(nil) - if a.IsValid(i) { - utcTimestamp := timestamps.Value(i) - offsetMinutes := offsetValues.Value(offsetPhysicalIdx) - v := timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) - timestamp = &v - } + for i := 0; i < a.Len(); i++ { + if i >= getRunEnd(offsetPhysicalIdx) { + offsetPhysicalIdx += 1 + } - callback(i, timestamp) - } - } else { - for i := 0; i < a.Len(); i++ { - timestamp := (*time.Time)(nil) - if a.IsValid(i) { - utcTimestamp, offsetMinutes, timeUnit := a.rawValueUnsafe(i) - v := timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) - timestamp = &v - } - - callback(i, timestamp) + ts:= time.Unix(0, 0) + if a.IsValid(i) { + utcTimestamp := timestamps.Value(i) + offsetMinutes := offsetValues.Value(offsetPhysicalIdx) + v := timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) + ts = v + } + + if !yield(ts) { + return + } + } + } else { + for i := 0; i < a.Len(); i++ { + ts:= time.Unix(0, 0) + if a.IsValid(i) { + utcTimestamp, offsetMinutes, timeUnit := a.rawValueUnsafe(i) + v := timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) + ts = v + } + + if !yield(ts) { + return + } + } } } } func (a *TimestampWithOffsetArray) Values() []time.Time { - values := make([]time.Time, a.Len()) - a.iterValues(func(i int, timestamp *time.Time) { - if timestamp == nil { - values[i] = time.Unix(0, 0) - } else { - values[i] = *timestamp - } - }) - return values + return slices.Collect(a.iterValues()) } func (a *TimestampWithOffsetArray) ValueStr(i int) string { @@ -363,9 +352,15 @@ func (a *TimestampWithOffsetArray) ValueStr(i int) string { func (a *TimestampWithOffsetArray) MarshalJSON() ([]byte, error) { values := make([]interface{}, a.Len()) - a.iterValues(func(i int, timestamp *time.Time) { - values[i] = timestamp - }) + i := 0 + for ts := range a.iterValues() { + if ts.Unix() == 0 { + values[i] = nil + } else { + values[i] = &ts + } + i += 1 + } return json.Marshal(values) } From 961057732004331c8fe4db97e7b4c3a9701d58da Mon Sep 17 00:00:00 2001 From: serramatutu Date: Fri, 30 Jan 2026 13:02:40 +0100 Subject: [PATCH 14/39] Test with `-08:30` and `11:00` timezone --- .../extensions/timestamp_with_offset_test.go | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index f7e6c278..05d68e97 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -34,13 +34,13 @@ import ( var testTimeUnit = arrow.Microsecond -var testDate1 = time.Date(2025, 01, 01, 00, 00, 00, 00, time.FixedZone("UTC+00:00", 0)) +var testDate0 = time.Date(2025, 01, 01, 00, 00, 00, 00, time.FixedZone("UTC+00:00", 0)) -var testZone1 = time.FixedZone("UTC-08:00", -8*60*60) -var testDate2 = testDate1.In(testZone1) +var testZone1 = time.FixedZone("UTC-08:30", -8*60*60 -30*60) +var testDate1 = testDate0.In(testZone1) -var testZone2 = time.FixedZone("UTC+06:00", +6*60*60) -var testDate3 = testDate1.In(testZone2) +var testZone2 = time.FixedZone("UTC+11:00", +11*60*60) +var testDate2 = testDate0.In(testZone2) func dict(index arrow.DataType) arrow.DataType { return &arrow.DictionaryType{ @@ -201,20 +201,20 @@ func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { // use that encoding (there is no way to pass a flag to array.FromJSON to say explicitly what storage type you want) primitiveBuilder, err := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, arrow.PrimitiveTypes.Int16) assert.NoError(t, err) - primitiveBuilder.Append(testDate1) + primitiveBuilder.Append(testDate0) primitiveBuilder.AppendNull() + primitiveBuilder.Append(testDate1) primitiveBuilder.Append(testDate2) - primitiveBuilder.Append(testDate3) jsonComparisonArr := primitiveBuilder.NewArray() defer jsonComparisonArr.Release() for _, offsetType := range allAllowedOffsetTypes { builder, _ := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) - builder.Append(testDate1) + builder.Append(testDate0) builder.AppendNull() + builder.Append(testDate1) builder.Append(testDate2) - builder.Append(testDate3) // it should build the array with the correct size arr := builder.NewArray() @@ -223,9 +223,9 @@ func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { defer arr.Release() // typedArr.Value(i) should return values adjusted for their original timezone - assert.Equal(t, testDate1, typedArr.Value(0)) - assert.Equal(t, testDate2, typedArr.Value(2)) - assert.Equal(t, testDate3, typedArr.Value(3)) + assert.Equal(t, testDate0, typedArr.Value(0)) + assert.Equal(t, testDate1, typedArr.Value(2)) + assert.Equal(t, testDate2, typedArr.Value(3)) // storage TimeUnit should be the same as we pass in to the builder, and storage timezone should be UTC timestampStructField := typedArr.Storage().(*array.Struct).Field(0) @@ -235,13 +235,13 @@ func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { // stored values should be equivalent to the raw values in UTC timestampsArr := timestampStructField.(*array.Timestamp) - assert.Equal(t, testDate1.In(time.UTC), timestampsArr.Value(0).ToTime(testTimeUnit)) - assert.Equal(t, testDate2.In(time.UTC), timestampsArr.Value(2).ToTime(testTimeUnit)) - assert.Equal(t, testDate3.In(time.UTC), timestampsArr.Value(3).ToTime(testTimeUnit)) + assert.Equal(t, testDate0.In(time.UTC), timestampsArr.Value(0).ToTime(testTimeUnit)) + assert.Equal(t, testDate1.In(time.UTC), timestampsArr.Value(2).ToTime(testTimeUnit)) + assert.Equal(t, testDate2.In(time.UTC), timestampsArr.Value(3).ToTime(testTimeUnit)) // the array should encode itself as JSON and string arrStr := arr.String() - assert.Equal(t, fmt.Sprintf(`["%[1]s" (null) "%[2]s" "%[3]s"]`, testDate1, testDate2, testDate3), arrStr) + assert.Equal(t, fmt.Sprintf(`["%[1]s" (null) "%[2]s" "%[3]s"]`, testDate0, testDate1, testDate2), arrStr) jsonStr, err := json.Marshal(arr) assert.NoError(t, err) @@ -270,23 +270,23 @@ func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { fieldBuilder := builder.Field(0).(*extensions.TimestampWithOffsetBuilder) // append a simple time.Time - fieldBuilder.Append(testDate1) + fieldBuilder.Append(testDate0) // append a null and 2 time.Time all at once values := []time.Time{ time.Unix(0, 0).In(time.UTC), + testDate1, testDate2, - testDate3, } valids := []bool{false, true, true} fieldBuilder.AppendValues(values, valids) // append a value from RFC3339 string - fieldBuilder.AppendValueFromString(testDate1.Format(time.RFC3339)) + fieldBuilder.AppendValueFromString(testDate0.Format(time.RFC3339)) // append value formatted in a different string layout fieldBuilder.Layout = time.RFC3339Nano - fieldBuilder.AppendValueFromString(testDate2.Format(time.RFC3339Nano)) + fieldBuilder.AppendValueFromString(testDate1.Format(time.RFC3339Nano)) record := builder.NewRecordBatch() @@ -295,10 +295,10 @@ func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { require.NoError(t, err) expect := `[{"timestamp_with_offset":"2025-01-01T00:00:00Z"} ,{"timestamp_with_offset":null} -,{"timestamp_with_offset":"2024-12-31T16:00:00-08:00"} -,{"timestamp_with_offset":"2025-01-01T06:00:00+06:00"} +,{"timestamp_with_offset":"2024-12-31T15:30:00-08:30"} +,{"timestamp_with_offset":"2025-01-01T11:00:00+11:00"} ,{"timestamp_with_offset":"2025-01-01T00:00:00Z"} -,{"timestamp_with_offset":"2024-12-31T16:00:00-08:00"} +,{"timestamp_with_offset":"2024-12-31T15:30:00-08:30"} ]` require.Equal(t, expect, string(json)) @@ -317,10 +317,10 @@ func TestTimestampWithOffsetTypeBatchIPCRoundTrip(t *testing.T) { for _, offsetType := range allAllowedOffsetTypes { builder, _ := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) - builder.Append(testDate1) + builder.Append(testDate0) builder.AppendNull() + builder.Append(testDate1) builder.Append(testDate2) - builder.Append(testDate3) arr := builder.NewArray() defer arr.Release() From 8265d65ea503955db686f78316c37d4e098a7e59 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Fri, 30 Jan 2026 16:45:28 +0100 Subject: [PATCH 15/39] Rename constructors --- arrow/extensions/timestamp_with_offset.go | 29 +++++++++---------- .../extensions/timestamp_with_offset_test.go | 8 ++--- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 9cbe49d3..cea2f1ed 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -86,11 +86,20 @@ func isDataTypeCompatible(storageType arrow.DataType) (unit arrow.TimeUnit, offs return } +// NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to +// Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=Int16), where T is any TimeUnit. +func NewTimestampWithOffsetType(unit arrow.TimeUnit) *TimestampWithOffsetType { + v, _ := NewTimestampWithOffsetTypeCustomOffset(unit, arrow.PrimitiveTypes.Int16) + // SAFETY: This should never error as Int16 is always a valid offset type + + return v +} + // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=O), where T is any TimeUnit and O is a valid offset type. // // The error will be populated if the data type is not a valid encoding of the offsets field. -func NewTimestampWithOffsetType(unit arrow.TimeUnit, offsetType arrow.DataType) (*TimestampWithOffsetType, error) { +func NewTimestampWithOffsetTypeCustomOffset(unit arrow.TimeUnit, offsetType arrow.DataType) (*TimestampWithOffsetType, error) { if !isOffsetTypeOk(offsetType) { return nil, errors.New(fmt.Sprintf("Invalid offset type %s", offsetType)) } @@ -116,16 +125,6 @@ func NewTimestampWithOffsetType(unit arrow.TimeUnit, offsetType arrow.DataType) }, nil } - -// NewTimestampWithOffsetTypePrimitiveEncoded creates a new TimestampWithOffsetType with the underlying storage type set correctly to -// Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=Int16), where T is any TimeUnit. -func NewTimestampWithOffsetTypePrimitiveEncoded(unit arrow.TimeUnit) *TimestampWithOffsetType { - v, _ := NewTimestampWithOffsetType(unit, arrow.PrimitiveTypes.Int16) - // SAFETY: This should never error as Int16 is always a valid offset type - - return v -} - // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=Dictionary(I, Int16)), where T is any TimeUnit and I is a // valid Dictionary index type. @@ -137,7 +136,7 @@ func NewTimestampWithOffsetTypeDictionaryEncoded(unit arrow.TimeUnit, index arro ValueType: arrow.PrimitiveTypes.Int16, Ordered: false, } - return NewTimestampWithOffsetType(unit, &offsetType) + return NewTimestampWithOffsetTypeCustomOffset(unit, &offsetType) } // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to @@ -151,7 +150,7 @@ func NewTimestampWithOffsetTypeRunEndEncoded(unit arrow.TimeUnit, runEnds arrow. return nil, errors.New(fmt.Sprintf("Invalid run-ends type %s", runEnds)) } - return NewTimestampWithOffsetType(unit, offsetType) + return NewTimestampWithOffsetTypeCustomOffset(unit, offsetType) } @@ -177,7 +176,7 @@ func (b *TimestampWithOffsetType) Deserialize(storageType arrow.DataType, data s return nil, fmt.Errorf("invalid storage type for TimestampWithOffsetType: %s", storageType.Name()) } - return NewTimestampWithOffsetType(timeUnit, offsetType) + return NewTimestampWithOffsetTypeCustomOffset(timeUnit, offsetType) } func (b *TimestampWithOffsetType) ExtensionEquals(other arrow.ExtensionType) bool { @@ -387,7 +386,7 @@ type TimestampWithOffsetBuilder struct { // NewTimestampWithOffsetBuilder creates a new TimestampWithOffsetBuilder, exposing a convenient and efficient interface // for writing time.Time values to the underlying storage array. func NewTimestampWithOffsetBuilder(mem memory.Allocator, unit arrow.TimeUnit, offsetType arrow.DataType) (*TimestampWithOffsetBuilder, error) { - dataType, err := NewTimestampWithOffsetType(unit, offsetType) + dataType, err := NewTimestampWithOffsetTypeCustomOffset(unit, offsetType) if err != nil { return nil, err } diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index 05d68e97..917ce9eb 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -79,7 +79,7 @@ var allAllowedOffsetTypes = []arrow.DataType{ } func TestTimestampWithOffsetTypePrimitiveBasics(t *testing.T) { - typ := extensions.NewTimestampWithOffsetTypePrimitiveEncoded(testTimeUnit) + typ := extensions.NewTimestampWithOffsetType(testTimeUnit) assert.Equal(t, "arrow.timestamp_with_offset", typ.ExtensionName()) assert.True(t, typ.ExtensionEquals(typ)) @@ -246,7 +246,7 @@ func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { assert.NoError(t, err) // roundtripping from JSON with array.FromJSON should work - expectedDataType, _ := extensions.NewTimestampWithOffsetType(testTimeUnit, offsetType) + expectedDataType, _ := extensions.NewTimestampWithOffsetTypeCustomOffset(testTimeUnit, offsetType) roundtripped, _, err := array.FromJSON(mem, expectedDataType, bytes.NewReader(jsonStr)) defer roundtripped.Release() assert.NoError(t, err) @@ -256,7 +256,7 @@ func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { for _, offsetType := range allAllowedOffsetTypes { - dataType, _ := extensions.NewTimestampWithOffsetType(testTimeUnit, offsetType) + dataType, _ := extensions.NewTimestampWithOffsetTypeCustomOffset(testTimeUnit, offsetType) schema := arrow.NewSchema([]arrow.Field{ { Name: "timestamp_with_offset", @@ -324,7 +324,7 @@ func TestTimestampWithOffsetTypeBatchIPCRoundTrip(t *testing.T) { arr := builder.NewArray() defer arr.Release() - typ, _ := extensions.NewTimestampWithOffsetType(testTimeUnit, offsetType) + typ, _ := extensions.NewTimestampWithOffsetTypeCustomOffset(testTimeUnit, offsetType) batch := array.NewRecordBatch(arrow.NewSchema([]arrow.Field{{Name: "timestamp_with_offset", Type: typ, Nullable: true}}, nil), []arrow.Array{arr}, -1) defer batch.Release() From a4e0baf20fe49a2adc826aa9156fdefead3fd28d Mon Sep 17 00:00:00 2001 From: serramatutu Date: Mon, 2 Feb 2026 14:27:04 +0100 Subject: [PATCH 16/39] Leverage generics for REE and dict-encoding --- arrow/extensions/timestamp_with_offset.go | 34 +++-- .../extensions/timestamp_with_offset_test.go | 144 ++++++++---------- 2 files changed, 90 insertions(+), 88 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index cea2f1ed..76eef4b4 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -125,32 +125,48 @@ func NewTimestampWithOffsetTypeCustomOffset(unit arrow.TimeUnit, offsetType arro }, nil } +type DictIndexType interface { + *arrow.Int8Type | *arrow.Int16Type | *arrow.Int32Type | *arrow.Int64Type | + *arrow.Uint8Type | *arrow.Uint16Type | *arrow.Uint32Type | *arrow.Uint64Type +} + // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=Dictionary(I, Int16)), where T is any TimeUnit and I is a // valid Dictionary index type. // // The error will be populated if the index is not a valid dictionary-encoding index type. -func NewTimestampWithOffsetTypeDictionaryEncoded(unit arrow.TimeUnit, index arrow.DataType) (*TimestampWithOffsetType, error) { +func NewTimestampWithOffsetTypeDictionaryEncoded[I DictIndexType](unit arrow.TimeUnit, index I) *TimestampWithOffsetType { offsetType := arrow.DictionaryType{ - IndexType: index, + IndexType: arrow.DataType(index), ValueType: arrow.PrimitiveTypes.Int16, Ordered: false, } - return NewTimestampWithOffsetTypeCustomOffset(unit, &offsetType) + v, _ := NewTimestampWithOffsetTypeCustomOffset(unit, &offsetType) + // SAFETY: This should never error as DictIndexType is always a valid index type + + return v +} + + +type TimestampWithOffsetRunEndsType interface { + *arrow.Int8Type | *arrow.Int16Type | *arrow.Int32Type | *arrow.Int64Type | + *arrow.Uint8Type | *arrow.Uint16Type | *arrow.Uint32Type | *arrow.Uint64Type } + // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=RunEndEncoded(E, Int16)), where T is any TimeUnit and E is a // valid run-ends type. // // The error will be populated if runEnds is not a valid run-end encoding run-ends type. -func NewTimestampWithOffsetTypeRunEndEncoded(unit arrow.TimeUnit, runEnds arrow.DataType) (*TimestampWithOffsetType, error) { - offsetType := arrow.RunEndEncodedOf(runEnds, arrow.PrimitiveTypes.Int16) - if !offsetType.ValidRunEndsType(runEnds) { - return nil, errors.New(fmt.Sprintf("Invalid run-ends type %s", runEnds)) - } +func NewTimestampWithOffsetTypeRunEndEncoded[E TimestampWithOffsetRunEndsType](unit arrow.TimeUnit, runEnds E) *TimestampWithOffsetType { + offsetType := arrow.RunEndEncodedOf(arrow.DataType(runEnds), arrow.PrimitiveTypes.Int16) + + v, _ := NewTimestampWithOffsetTypeCustomOffset(unit, offsetType) + // SAFETY: This should never error as TimestampWithOffsetRunEndsType is always a valid run ends type + + return v - return NewTimestampWithOffsetTypeCustomOffset(unit, offsetType) } diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index 917ce9eb..76eb3c8f 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -106,91 +106,77 @@ func TestTimestampWithOffsetTypePrimitiveBasics(t *testing.T) { assert.Equal(t, "extension", typ.String()) } -func TestTimestampWithOffsetTypeDictionaryEncodedBasics(t *testing.T) { - invalidIndexType := arrow.PrimitiveTypes.Float32 - _, err := extensions.NewTimestampWithOffsetTypeDictionaryEncoded(testTimeUnit, invalidIndexType) - assert.True(t, err != nil, "Err should not be nil if index type is invalid dict key") - - indexTypes := []arrow.DataType{ - arrow.PrimitiveTypes.Uint8, - arrow.PrimitiveTypes.Uint16, - arrow.PrimitiveTypes.Uint32, - arrow.PrimitiveTypes.Uint64, - arrow.PrimitiveTypes.Int8, - arrow.PrimitiveTypes.Int16, - arrow.PrimitiveTypes.Int32, - arrow.PrimitiveTypes.Int64, - }; - - for _, indexType := range indexTypes { - typ, err := extensions.NewTimestampWithOffsetTypeDictionaryEncoded(testTimeUnit, indexType) - assert.True(t, err == nil, "Err should be nil") - - assert.Equal(t, "arrow.timestamp_with_offset", typ.ExtensionName()) - assert.True(t, typ.ExtensionEquals(typ)) - - assert.True(t, arrow.TypeEqual(typ, typ)) - assert.True(t, arrow.TypeEqual( - arrow.StructOf( - arrow.Field{ - Name: "timestamp", - Type: &arrow.TimestampType{ - Unit: testTimeUnit, - TimeZone: "UTC", - }, - Nullable: false, - }, - arrow.Field{ - Name: "offset_minutes", - Type: dict(indexType), - Nullable: false, +func assertDictBasics[I extensions.DictIndexType](t *testing.T, indexType I) { + typ := extensions.NewTimestampWithOffsetTypeDictionaryEncoded(testTimeUnit, indexType) + + assert.Equal(t, "arrow.timestamp_with_offset", typ.ExtensionName()) + assert.True(t, typ.ExtensionEquals(typ)) + + assert.True(t, arrow.TypeEqual(typ, typ)) + assert.True(t, arrow.TypeEqual( + arrow.StructOf( + arrow.Field{ + Name: "timestamp", + Type: &arrow.TimestampType{ + Unit: testTimeUnit, + TimeZone: "UTC", }, - ), - typ.StorageType())) + Nullable: false, + }, + arrow.Field{ + Name: "offset_minutes", + Type: dict(arrow.DataType(indexType)), + Nullable: false, + }, + ), + typ.StorageType())) - assert.Equal(t, "extension", typ.String()) - } + assert.Equal(t, "extension", typ.String()) } -func TestTimestampWithOffsetTypeRunEndEncodedBasics(t *testing.T) { - invalidRunEndsType := arrow.PrimitiveTypes.Float32 - _, err := extensions.NewTimestampWithOffsetTypeRunEndEncoded(testTimeUnit, invalidRunEndsType) - assert.True(t, err != nil, "Err should not be nil if run ends type is invalid") - - runEndsTypes := []arrow.DataType{ - arrow.PrimitiveTypes.Int16, - arrow.PrimitiveTypes.Int32, - arrow.PrimitiveTypes.Int64, - }; - - for _, indexType := range runEndsTypes { - typ, err := extensions.NewTimestampWithOffsetTypeRunEndEncoded(testTimeUnit, indexType) - assert.True(t, err == nil, "Err should be nil") - - assert.Equal(t, "arrow.timestamp_with_offset", typ.ExtensionName()) - assert.True(t, typ.ExtensionEquals(typ)) - - assert.True(t, arrow.TypeEqual(typ, typ)) - assert.True(t, arrow.TypeEqual( - arrow.StructOf( - arrow.Field{ - Name: "timestamp", - Type: &arrow.TimestampType{ - Unit: testTimeUnit, - TimeZone: "UTC", - }, - Nullable: false, - }, - arrow.Field{ - Name: "offset_minutes", - Type: ree(indexType), - Nullable: false, +func TestTimestampWithOffsetTypeDictionaryEncodedBasics(t *testing.T) { + assertDictBasics(t, &arrow.Uint8Type{}) + assertDictBasics(t, &arrow.Uint16Type{}) + assertDictBasics(t, &arrow.Uint32Type{}) + assertDictBasics(t, &arrow.Uint64Type{}) + assertDictBasics(t, &arrow.Int8Type{}) + assertDictBasics(t, &arrow.Int16Type{}) + assertDictBasics(t, &arrow.Int32Type{}) + assertDictBasics(t, &arrow.Int64Type{}) +} + +func assertReeBasics[E extensions.TimestampWithOffsetRunEndsType](t *testing.T, runEndsType E) { + typ := extensions.NewTimestampWithOffsetTypeRunEndEncoded(testTimeUnit, runEndsType) + + assert.Equal(t, "arrow.timestamp_with_offset", typ.ExtensionName()) + assert.True(t, typ.ExtensionEquals(typ)) + + assert.True(t, arrow.TypeEqual(typ, typ)) + assert.True(t, arrow.TypeEqual( + arrow.StructOf( + arrow.Field{ + Name: "timestamp", + Type: &arrow.TimestampType{ + Unit: testTimeUnit, + TimeZone: "UTC", }, - ), - typ.StorageType())) + Nullable: false, + }, + arrow.Field{ + Name: "offset_minutes", + Type: ree(arrow.DataType(runEndsType)), + Nullable: false, + }, + ), + typ.StorageType())) - assert.Equal(t, "extension", typ.String()) - } + assert.Equal(t, "extension", typ.String()) +} + +func TestTimestampWithOffsetTypeRunEndEncodedBasics(t *testing.T) { + assertReeBasics(t, &arrow.Int16Type{}) + assertReeBasics(t, &arrow.Int32Type{}) + assertReeBasics(t, &arrow.Int64Type{}) } func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { From 708e46ee0c65acdf1411bbdcd22a6550076b60bc Mon Sep 17 00:00:00 2001 From: serramatutu Date: Tue, 3 Feb 2026 13:06:24 +0100 Subject: [PATCH 17/39] Make equals also check for data type --- arrow/extensions/timestamp_with_offset.go | 49 +++++++++---------- .../extensions/timestamp_with_offset_test.go | 42 +++++++++++----- 2 files changed, 54 insertions(+), 37 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 76eef4b4..86b593a3 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -45,12 +45,12 @@ func isOffsetTypeOk(offsetType arrow.DataType) bool { case *arrow.DictionaryType: return arrow.IsInteger(offsetType.IndexType.ID()) && arrow.TypeEqual(offsetType.ValueType, arrow.PrimitiveTypes.Int16) case *arrow.RunEndEncodedType: - return offsetType.ValidRunEndsType(offsetType.RunEnds()) && + return offsetType.ValidRunEndsType(offsetType.RunEnds()) && arrow.TypeEqual(offsetType.Encoded(), arrow.PrimitiveTypes.Int16) - // FIXME: Technically this should be non-nullable, but a Arrow IPC does not deserialize - // ValueNullable properly, so enforcing this here would always fail when reading from an IPC - // stream - // !offsetType.ValueNullable + // FIXME: Technically this should be non-nullable, but a Arrow IPC does not deserialize + // ValueNullable properly, so enforcing this here would always fail when reading from an IPC + // stream + // !offsetType.ValueNullable default: return false } @@ -66,10 +66,10 @@ func isDataTypeCompatible(storageType arrow.DataType) (unit arrow.TimeUnit, offs st, compat := storageType.(*arrow.StructType) if !compat || st.NumFields() != 2 { - return + return } - if ts, compat := st.Field(0).Type.(*arrow.TimestampType); compat && ts.TimeZone == "UTC" { + if ts, compat := st.Field(0).Type.(*arrow.TimestampType); compat && ts.TimeZone == "UTC" { unit = ts.TimeUnit() } else { return @@ -126,8 +126,8 @@ func NewTimestampWithOffsetTypeCustomOffset(unit arrow.TimeUnit, offsetType arro } type DictIndexType interface { - *arrow.Int8Type | *arrow.Int16Type | *arrow.Int32Type | *arrow.Int64Type | - *arrow.Uint8Type | *arrow.Uint16Type | *arrow.Uint32Type | *arrow.Uint64Type + *arrow.Int8Type | *arrow.Int16Type | *arrow.Int32Type | *arrow.Int64Type | + *arrow.Uint8Type | *arrow.Uint16Type | *arrow.Uint32Type | *arrow.Uint64Type } // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to @@ -147,13 +147,11 @@ func NewTimestampWithOffsetTypeDictionaryEncoded[I DictIndexType](unit arrow.Tim return v } - type TimestampWithOffsetRunEndsType interface { - *arrow.Int8Type | *arrow.Int16Type | *arrow.Int32Type | *arrow.Int64Type | - *arrow.Uint8Type | *arrow.Uint16Type | *arrow.Uint32Type | *arrow.Uint64Type + *arrow.Int8Type | *arrow.Int16Type | *arrow.Int32Type | *arrow.Int64Type | + *arrow.Uint8Type | *arrow.Uint16Type | *arrow.Uint32Type | *arrow.Uint64Type } - // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=RunEndEncoded(E, Int16)), where T is any TimeUnit and E is a // valid run-ends type. @@ -169,7 +167,6 @@ func NewTimestampWithOffsetTypeRunEndEncoded[E TimestampWithOffsetRunEndsType](u } - func (b *TimestampWithOffsetType) ArrayType() reflect.Type { return reflect.TypeOf(TimestampWithOffsetArray{}) } @@ -196,7 +193,12 @@ func (b *TimestampWithOffsetType) Deserialize(storageType arrow.DataType, data s } func (b *TimestampWithOffsetType) ExtensionEquals(other arrow.ExtensionType) bool { - return b.ExtensionName() == other.ExtensionName() + return b.ExtensionName() == other.ExtensionName() && + arrow.TypeEqual(b.StorageType(), other.StorageType()) +} + +func (b *TimestampWithOffsetType) OffsetType() arrow.DataType { + return b.ExtensionBase.Storage.(*arrow.StructType).Field(1).Type } func (b *TimestampWithOffsetType) TimeUnit() arrow.TimeUnit { @@ -204,9 +206,7 @@ func (b *TimestampWithOffsetType) TimeUnit() arrow.TimeUnit { } func (b *TimestampWithOffsetType) NewBuilder(mem memory.Allocator) array.Builder { - v, _ := NewTimestampWithOffsetBuilder(mem, b.TimeUnit(), arrow.PrimitiveTypes.Int16) - // SAFETY: This will never error as Int16 is always a valid type for the offset field - + v, _ := NewTimestampWithOffsetBuilder(mem, b.TimeUnit(), b.OffsetType()) return v } @@ -295,7 +295,7 @@ func (a *TimestampWithOffsetArray) Value(i int) time.Time { // If the timestamp is null, the returned time will be the unix epoch. // // This will iterate using the fastest method given the underlying storage array -func (a* TimestampWithOffsetArray) iterValues() iter.Seq[time.Time] { +func (a *TimestampWithOffsetArray) iterValues() iter.Seq[time.Time] { return func(yield func(time.Time) bool) { structs := a.Storage().(*array.Struct) offsets := structs.Field(1) @@ -322,13 +322,13 @@ func (a* TimestampWithOffsetArray) iterValues() iter.Seq[time.Time] { offsetPhysicalIdx += 1 } - ts:= time.Unix(0, 0) + ts := time.Unix(0, 0) if a.IsValid(i) { utcTimestamp := timestamps.Value(i) offsetMinutes := offsetValues.Value(offsetPhysicalIdx) v := timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) ts = v - } + } if !yield(ts) { return @@ -336,12 +336,12 @@ func (a* TimestampWithOffsetArray) iterValues() iter.Seq[time.Time] { } } else { for i := 0; i < a.Len(); i++ { - ts:= time.Unix(0, 0) + ts := time.Unix(0, 0) if a.IsValid(i) { utcTimestamp, offsetMinutes, timeUnit := a.rawValueUnsafe(i) v := timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) ts = v - } + } if !yield(ts) { return @@ -351,7 +351,6 @@ func (a* TimestampWithOffsetArray) iterValues() iter.Seq[time.Time] { } } - func (a *TimestampWithOffsetArray) Values() []time.Time { return slices.Collect(a.iterValues()) } @@ -409,7 +408,7 @@ func NewTimestampWithOffsetBuilder(mem memory.Allocator, unit arrow.TimeUnit, of return &TimestampWithOffsetBuilder{ unit: unit, - offsetType: offsetType, + offsetType: offsetType, lastOffset: math.MaxInt16, Layout: time.RFC3339, ExtensionBuilder: array.NewExtensionBuilder(mem, dataType), diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index 76eb3c8f..891a35aa 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -179,21 +179,39 @@ func TestTimestampWithOffsetTypeRunEndEncodedBasics(t *testing.T) { assertReeBasics(t, &arrow.Int64Type{}) } +func TestTimestampWithOffsetEquals(t *testing.T) { + // Completely different types are not equal + assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewBool8Type())) + + // Different time units are not equal + // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Microsecond))) + // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Second))) + // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Second))) + // + // // Different underlying storage type is not equal + // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + // assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + // + // // Dict-encoding key type is not equal + // assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) + // + // // REE index type is not equal + // assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) + // + // // Equals OK + // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Nanosecond))) + // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Microsecond))) + // assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + // assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) + // assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + // assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) +} + func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - // NOTE: we need to compare the arrays parsed from JSON with a primitive-encoded array, since that will always - // use that encoding (there is no way to pass a flag to array.FromJSON to say explicitly what storage type you want) - primitiveBuilder, err := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, arrow.PrimitiveTypes.Int16) - assert.NoError(t, err) - primitiveBuilder.Append(testDate0) - primitiveBuilder.AppendNull() - primitiveBuilder.Append(testDate1) - primitiveBuilder.Append(testDate2) - jsonComparisonArr := primitiveBuilder.NewArray() - defer jsonComparisonArr.Release() - for _, offsetType := range allAllowedOffsetTypes { builder, _ := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) @@ -236,7 +254,7 @@ func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { roundtripped, _, err := array.FromJSON(mem, expectedDataType, bytes.NewReader(jsonStr)) defer roundtripped.Release() assert.NoError(t, err) - assert.Truef(t, array.Equal(jsonComparisonArr, roundtripped), "expected %s\n\ngot %s", jsonComparisonArr, roundtripped) + assert.Truef(t, array.Equal(arr, roundtripped), "expected %s\n\ngot %s", arr, roundtripped) } } From d5725e1c3e42247f0c9ae537d17693ccccaf95d2 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Fri, 10 Apr 2026 14:44:21 -0700 Subject: [PATCH 18/39] Remove outdated docs --- arrow/extensions/timestamp_with_offset.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 86b593a3..cb0bde88 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -133,8 +133,6 @@ type DictIndexType interface { // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=Dictionary(I, Int16)), where T is any TimeUnit and I is a // valid Dictionary index type. -// -// The error will be populated if the index is not a valid dictionary-encoding index type. func NewTimestampWithOffsetTypeDictionaryEncoded[I DictIndexType](unit arrow.TimeUnit, index I) *TimestampWithOffsetType { offsetType := arrow.DictionaryType{ IndexType: arrow.DataType(index), @@ -155,8 +153,6 @@ type TimestampWithOffsetRunEndsType interface { // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=RunEndEncoded(E, Int16)), where T is any TimeUnit and E is a // valid run-ends type. -// -// The error will be populated if runEnds is not a valid run-end encoding run-ends type. func NewTimestampWithOffsetTypeRunEndEncoded[E TimestampWithOffsetRunEndsType](unit arrow.TimeUnit, runEnds E) *TimestampWithOffsetType { offsetType := arrow.RunEndEncodedOf(arrow.DataType(runEnds), arrow.PrimitiveTypes.Int16) From 26c628c662d91c676df94caf5395d02538f46da7 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Fri, 10 Apr 2026 14:47:32 -0700 Subject: [PATCH 19/39] Remove unnecessary IsInteger call --- arrow/extensions/timestamp_with_offset.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index cb0bde88..6102c2e5 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -43,7 +43,7 @@ func isOffsetTypeOk(offsetType arrow.DataType) bool { case *arrow.Int16Type: return true case *arrow.DictionaryType: - return arrow.IsInteger(offsetType.IndexType.ID()) && arrow.TypeEqual(offsetType.ValueType, arrow.PrimitiveTypes.Int16) + return arrow.TypeEqual(offsetType.ValueType, arrow.PrimitiveTypes.Int16) case *arrow.RunEndEncodedType: return offsetType.ValidRunEndsType(offsetType.RunEnds()) && arrow.TypeEqual(offsetType.Encoded(), arrow.PrimitiveTypes.Int16) From 591763df1c74cfad14ddefb223843d2424c39f5b Mon Sep 17 00:00:00 2001 From: serramatutu Date: Fri, 10 Apr 2026 14:55:43 -0700 Subject: [PATCH 20/39] Remove specific type for `TimestampWithOffsetRunEndsType` --- arrow/extensions/timestamp_with_offset.go | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 6102c2e5..f0d43a79 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -145,19 +145,14 @@ func NewTimestampWithOffsetTypeDictionaryEncoded[I DictIndexType](unit arrow.Tim return v } -type TimestampWithOffsetRunEndsType interface { - *arrow.Int8Type | *arrow.Int16Type | *arrow.Int32Type | *arrow.Int64Type | - *arrow.Uint8Type | *arrow.Uint16Type | *arrow.Uint32Type | *arrow.Uint64Type -} - // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=RunEndEncoded(E, Int16)), where T is any TimeUnit and E is a // valid run-ends type. -func NewTimestampWithOffsetTypeRunEndEncoded[E TimestampWithOffsetRunEndsType](unit arrow.TimeUnit, runEnds E) *TimestampWithOffsetType { +func NewTimestampWithOffsetTypeRunEndEncoded[E DictIndexType](unit arrow.TimeUnit, runEnds E) *TimestampWithOffsetType { offsetType := arrow.RunEndEncodedOf(arrow.DataType(runEnds), arrow.PrimitiveTypes.Int16) v, _ := NewTimestampWithOffsetTypeCustomOffset(unit, offsetType) - // SAFETY: This should never error as TimestampWithOffsetRunEndsType is always a valid run ends type + // SAFETY: This should never error as DictIndexType always a valid run ends type return v From f63340976d742d46247f398bc758cf2ccd9f27ae Mon Sep 17 00:00:00 2001 From: serramatutu Date: Fri, 10 Apr 2026 14:56:06 -0700 Subject: [PATCH 21/39] Uncomment test --- .../extensions/timestamp_with_offset_test.go | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index 891a35aa..72b3be49 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -145,7 +145,7 @@ func TestTimestampWithOffsetTypeDictionaryEncodedBasics(t *testing.T) { assertDictBasics(t, &arrow.Int64Type{}) } -func assertReeBasics[E extensions.TimestampWithOffsetRunEndsType](t *testing.T, runEndsType E) { +func assertReeBasics[E extensions.DictIndexType](t *testing.T, runEndsType E) { typ := extensions.NewTimestampWithOffsetTypeRunEndEncoded(testTimeUnit, runEndsType) assert.Equal(t, "arrow.timestamp_with_offset", typ.ExtensionName()) @@ -184,28 +184,28 @@ func TestTimestampWithOffsetEquals(t *testing.T) { assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewBool8Type())) // Different time units are not equal - // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Microsecond))) - // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Second))) - // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Second))) - // - // // Different underlying storage type is not equal - // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}))) - // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}))) - // assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}))) - // - // // Dict-encoding key type is not equal - // assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) - // - // // REE index type is not equal - // assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) - // - // // Equals OK - // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Nanosecond))) - // assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Microsecond))) - // assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}))) - // assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) - // assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}))) - // assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) + assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Microsecond))) + assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Second))) + assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Second))) + + // Different underlying storage type is not equal + assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + + // Dict-encoding key type is not equal + assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) + + // REE index type is not equal + assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) + + // Equals OK + assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Nanosecond))) + assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Microsecond))) + assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) + assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) } func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { From 3ec66aca1845cd3ecc1942b59058398e14bbfef4 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Wed, 15 Apr 2026 11:53:32 +0200 Subject: [PATCH 22/39] Fix REE ends type --- arrow/extensions/timestamp_with_offset.go | 9 +++++-- .../extensions/timestamp_with_offset_test.go | 26 +++++++++---------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index f0d43a79..b566f7d1 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -130,6 +130,11 @@ type DictIndexType interface { *arrow.Uint8Type | *arrow.Uint16Type | *arrow.Uint32Type | *arrow.Uint64Type } + +type RunEndsType interface { + *arrow.Int16Type | *arrow.Int32Type | *arrow.Int64Type +} + // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=Dictionary(I, Int16)), where T is any TimeUnit and I is a // valid Dictionary index type. @@ -148,11 +153,11 @@ func NewTimestampWithOffsetTypeDictionaryEncoded[I DictIndexType](unit arrow.Tim // NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=RunEndEncoded(E, Int16)), where T is any TimeUnit and E is a // valid run-ends type. -func NewTimestampWithOffsetTypeRunEndEncoded[E DictIndexType](unit arrow.TimeUnit, runEnds E) *TimestampWithOffsetType { +func NewTimestampWithOffsetTypeRunEndEncoded[E RunEndsType](unit arrow.TimeUnit, runEnds E) *TimestampWithOffsetType { offsetType := arrow.RunEndEncodedOf(arrow.DataType(runEnds), arrow.PrimitiveTypes.Int16) v, _ := NewTimestampWithOffsetTypeCustomOffset(unit, offsetType) - // SAFETY: This should never error as DictIndexType always a valid run ends type + // SAFETY: This should never error as RunEndsType always a valid run ends type return v diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index 72b3be49..188d01fa 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -46,7 +46,7 @@ func dict(index arrow.DataType) arrow.DataType { return &arrow.DictionaryType{ IndexType: index, ValueType: arrow.PrimitiveTypes.Int16, - Ordered: false, + Ordered: false, } } @@ -124,8 +124,8 @@ func assertDictBasics[I extensions.DictIndexType](t *testing.T, indexType I) { Nullable: false, }, arrow.Field{ - Name: "offset_minutes", - Type: dict(arrow.DataType(indexType)), + Name: "offset_minutes", + Type: dict(arrow.DataType(indexType)), Nullable: false, }, ), @@ -145,7 +145,7 @@ func TestTimestampWithOffsetTypeDictionaryEncodedBasics(t *testing.T) { assertDictBasics(t, &arrow.Int64Type{}) } -func assertReeBasics[E extensions.DictIndexType](t *testing.T, runEndsType E) { +func assertReeBasics[E extensions.RunEndsType](t *testing.T, runEndsType E) { typ := extensions.NewTimestampWithOffsetTypeRunEndEncoded(testTimeUnit, runEndsType) assert.Equal(t, "arrow.timestamp_with_offset", typ.ExtensionName()) @@ -163,8 +163,8 @@ func assertReeBasics[E extensions.DictIndexType](t *testing.T, runEndsType E) { Nullable: false, }, arrow.Field{ - Name: "offset_minutes", - Type: ree(arrow.DataType(runEndsType)), + Name: "offset_minutes", + Type: ree(arrow.DataType(runEndsType)), Nullable: false, }, ), @@ -197,15 +197,15 @@ func TestTimestampWithOffsetEquals(t *testing.T) { assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) // REE index type is not equal - assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) + assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int32Type{}))) // Equals OK - assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Nanosecond))) - assert.False(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Microsecond))) - assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}))) - assert.False(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) - assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}))) - assert.False(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) + assert.True(t, extensions.NewTimestampWithOffsetType(arrow.Nanosecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Nanosecond))) + assert.True(t, extensions.NewTimestampWithOffsetType(arrow.Microsecond).ExtensionEquals(extensions.NewTimestampWithOffsetType(arrow.Microsecond))) + assert.True(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + assert.True(t, extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeDictionaryEncoded(arrow.Microsecond, &arrow.Uint16Type{}))) + assert.True(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int16Type{}))) + assert.True(t, extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int32Type{}).ExtensionEquals(extensions.NewTimestampWithOffsetTypeRunEndEncoded(arrow.Microsecond, &arrow.Int32Type{}))) } func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { From e427a2e7390a89463ffbecec29eec146fa18a871 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Wed, 15 Apr 2026 12:12:35 +0200 Subject: [PATCH 23/39] Support unix epoch Stop using `time=0` as a sentinel for null in `iterValues()`. Instead, return a tuple of `(time.Time, bool)` where the bool indicates whether the value is valid or not. Added a test for it. --- arrow/extensions/timestamp_with_offset.go | 34 ++++++++++++------- .../extensions/timestamp_with_offset_test.go | 16 +++++++-- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index b566f7d1..97135f16 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -280,7 +280,7 @@ func (a *TimestampWithOffsetArray) rawValueUnsafe(i int) (arrow.Timestamp, int16 func (a *TimestampWithOffsetArray) Value(i int) time.Time { if a.IsNull(i) { - return time.Unix(0, 0) + return time.Time{} } utcTimestamp, offsetMinutes, timeUnit := a.rawValueUnsafe(i) return timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) @@ -288,11 +288,11 @@ func (a *TimestampWithOffsetArray) Value(i int) time.Time { // Iterates over the array returning the timestamp at each position. // -// If the timestamp is null, the returned time will be the unix epoch. +// The second parameter indicates whether the timestamp is valid or not. // // This will iterate using the fastest method given the underlying storage array -func (a *TimestampWithOffsetArray) iterValues() iter.Seq[time.Time] { - return func(yield func(time.Time) bool) { +func (a *TimestampWithOffsetArray) iterValues() iter.Seq2[time.Time, bool] { + return func(yield func(time.Time, bool) bool) { structs := a.Storage().(*array.Struct) offsets := structs.Field(1) if reeOffsets, isRee := offsets.(*array.RunEndEncoded); isRee { @@ -318,28 +318,30 @@ func (a *TimestampWithOffsetArray) iterValues() iter.Seq[time.Time] { offsetPhysicalIdx += 1 } - ts := time.Unix(0, 0) - if a.IsValid(i) { + var ts time.Time + valid := a.IsValid(i) + if valid { utcTimestamp := timestamps.Value(i) offsetMinutes := offsetValues.Value(offsetPhysicalIdx) v := timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) ts = v } - if !yield(ts) { + if !yield(ts, valid) { return } } } else { for i := 0; i < a.Len(); i++ { - ts := time.Unix(0, 0) - if a.IsValid(i) { + var ts time.Time + valid := a.IsValid(i) + if valid { utcTimestamp, offsetMinutes, timeUnit := a.rawValueUnsafe(i) v := timeFromFieldValues(utcTimestamp, offsetMinutes, timeUnit) ts = v } - if !yield(ts) { + if !yield(ts, valid) { return } } @@ -348,7 +350,13 @@ func (a *TimestampWithOffsetArray) iterValues() iter.Seq[time.Time] { } func (a *TimestampWithOffsetArray) Values() []time.Time { - return slices.Collect(a.iterValues()) + return slices.Collect(func (yield func (time.Time) bool) { + for time := range a.iterValues() { + if !yield(time) { + return + } + } + }) } func (a *TimestampWithOffsetArray) ValueStr(i int) string { @@ -363,8 +371,8 @@ func (a *TimestampWithOffsetArray) ValueStr(i int) string { func (a *TimestampWithOffsetArray) MarshalJSON() ([]byte, error) { values := make([]interface{}, a.Len()) i := 0 - for ts := range a.iterValues() { - if ts.Unix() == 0 { + for ts, valid := range a.iterValues() { + if !valid { values[i] = nil } else { values[i] = &ts diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index 188d01fa..1af73a94 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -34,9 +34,11 @@ import ( var testTimeUnit = arrow.Microsecond +var epoch = time.Unix(0, 0).In(time.FixedZone("UTC+00:00", 0)) + var testDate0 = time.Date(2025, 01, 01, 00, 00, 00, 00, time.FixedZone("UTC+00:00", 0)) -var testZone1 = time.FixedZone("UTC-08:30", -8*60*60 -30*60) +var testZone1 = time.FixedZone("UTC-08:30", -8*60*60-30*60) var testDate1 = testDate0.In(testZone1) var testZone2 = time.FixedZone("UTC+11:00", +11*60*60) @@ -219,17 +221,19 @@ func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { builder.AppendNull() builder.Append(testDate1) builder.Append(testDate2) + builder.Append(epoch) // it should build the array with the correct size arr := builder.NewArray() typedArr := arr.(*extensions.TimestampWithOffsetArray) - assert.Equal(t, 4, arr.Data().Len()) + assert.Equal(t, 5, arr.Data().Len()) defer arr.Release() // typedArr.Value(i) should return values adjusted for their original timezone assert.Equal(t, testDate0, typedArr.Value(0)) assert.Equal(t, testDate1, typedArr.Value(2)) assert.Equal(t, testDate2, typedArr.Value(3)) + assert.Equal(t, epoch, typedArr.Value(4)) // storage TimeUnit should be the same as we pass in to the builder, and storage timezone should be UTC timestampStructField := typedArr.Storage().(*array.Struct).Field(0) @@ -242,10 +246,11 @@ func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { assert.Equal(t, testDate0.In(time.UTC), timestampsArr.Value(0).ToTime(testTimeUnit)) assert.Equal(t, testDate1.In(time.UTC), timestampsArr.Value(2).ToTime(testTimeUnit)) assert.Equal(t, testDate2.In(time.UTC), timestampsArr.Value(3).ToTime(testTimeUnit)) + assert.Equal(t, epoch.In(time.UTC), timestampsArr.Value(4).ToTime(testTimeUnit)) // the array should encode itself as JSON and string arrStr := arr.String() - assert.Equal(t, fmt.Sprintf(`["%[1]s" (null) "%[2]s" "%[3]s"]`, testDate0, testDate1, testDate2), arrStr) + assert.Equal(t, fmt.Sprintf(`["%[1]s" (null) "%[2]s" "%[3]s" "%[4]s"]`, testDate0, testDate1, testDate2, epoch), arrStr) jsonStr, err := json.Marshal(arr) assert.NoError(t, err) @@ -276,6 +281,9 @@ func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { // append a simple time.Time fieldBuilder.Append(testDate0) + // append the epoch + fieldBuilder.Append(epoch) + // append a null and 2 time.Time all at once values := []time.Time{ time.Unix(0, 0).In(time.UTC), @@ -298,6 +306,7 @@ func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { json, err := record.MarshalJSON() require.NoError(t, err) expect := `[{"timestamp_with_offset":"2025-01-01T00:00:00Z"} +,{"timestamp_with_offset":"1970-01-01T00:00:00Z"} ,{"timestamp_with_offset":null} ,{"timestamp_with_offset":"2024-12-31T15:30:00-08:30"} ,{"timestamp_with_offset":"2025-01-01T11:00:00+11:00"} @@ -325,6 +334,7 @@ func TestTimestampWithOffsetTypeBatchIPCRoundTrip(t *testing.T) { builder.AppendNull() builder.Append(testDate1) builder.Append(testDate2) + builder.Append(epoch) arr := builder.NewArray() defer arr.Release() From 3dffed25199dcdb95264924a214a859da05f6c8d Mon Sep 17 00:00:00 2001 From: serramatutu Date: Wed, 15 Apr 2026 12:33:43 +0200 Subject: [PATCH 24/39] Fix inconsistency with `UnsafeAppend()` --- arrow/extensions/timestamp_with_offset.go | 29 ++--------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 97135f16..327e91d3 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -445,31 +445,6 @@ func (b *TimestampWithOffsetBuilder) Append(v time.Time) { } -func (b *TimestampWithOffsetBuilder) UnsafeAppend(v time.Time) { - timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) - offsetMinutes16 := int16(offsetMinutes) - structBuilder := b.ExtensionBuilder.Builder.(*array.StructBuilder) - - structBuilder.Append(true) - structBuilder.FieldBuilder(0).(*array.TimestampBuilder).UnsafeAppend(timestamp) - - switch offsets := structBuilder.FieldBuilder(1).(type) { - case *array.Int16Builder: - offsets.UnsafeAppend(offsetMinutes16) - case *array.Int16DictionaryBuilder: - offsets.Append(offsetMinutes16) - case *array.RunEndEncodedBuilder: - if offsetMinutes != b.lastOffset { - offsets.Append(1) - offsets.ValueBuilder().(*array.Int16Builder).Append(offsetMinutes16) - } else { - offsets.ContinueRun(1) - } - - b.lastOffset = offsetMinutes16 - } -} - // By default, this will try to parse the string using the RFC3339 layout. // // You can change the default layout by using builder.SetLayout() @@ -514,7 +489,7 @@ func (b *TimestampWithOffsetBuilder) AppendValues(values []time.Time, valids []b timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) if valids[i] { timestamps.UnsafeAppend(timestamp) - offsets.Append(offsetMinutes) + offsets.UnsafeAppend(offsetMinutes) } else { timestamps.UnsafeAppendBoolToBitmap(false) offsets.UnsafeAppendBoolToBitmap(false) @@ -536,7 +511,7 @@ func (b *TimestampWithOffsetBuilder) AppendValues(values []time.Time, valids []b b.lastOffset = offsetMinutes16 } else { timestamps.UnsafeAppendBoolToBitmap(false) - offsets.UnsafeAppendBoolToBitmap(false) + offsets.AppendNull() } } } From 257793ea9d44d3307fe499e0a5b81915ede1b28a Mon Sep 17 00:00:00 2001 From: serramatutu Date: Wed, 15 Apr 2026 13:02:51 +0200 Subject: [PATCH 25/39] Stop appending nulls to child arrays --- arrow/extensions/timestamp_with_offset.go | 42 ++++++++--------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 327e91d3..b0a5899d 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -474,45 +474,31 @@ func (b *TimestampWithOffsetBuilder) AppendValues(values []time.Time, valids []b switch offsets := structBuilder.FieldBuilder(1).(type) { case *array.Int16Builder: - for i, v := range values { + for _, v := range values { timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) - if valids[i] { - timestamps.UnsafeAppend(timestamp) - offsets.UnsafeAppend(offsetMinutes) - } else { - timestamps.UnsafeAppendBoolToBitmap(false) - offsets.UnsafeAppendBoolToBitmap(false) - } + timestamps.UnsafeAppend(timestamp) + offsets.UnsafeAppend(offsetMinutes) } case *array.Int16DictionaryBuilder: - for i, v := range values { + for _, v := range values { timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) - if valids[i] { - timestamps.UnsafeAppend(timestamp) - offsets.UnsafeAppend(offsetMinutes) - } else { - timestamps.UnsafeAppendBoolToBitmap(false) - offsets.UnsafeAppendBoolToBitmap(false) - } + timestamps.UnsafeAppend(timestamp) + offsets.UnsafeAppend(offsetMinutes) } case *array.RunEndEncodedBuilder: offsetValuesBuilder := offsets.ValueBuilder().(*array.Int16Builder) for i, v := range values { timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) - if valids[i] { - timestamps.UnsafeAppend(timestamp) - offsetMinutes16 := int16(offsetMinutes) - if offsetMinutes != b.lastOffset { - offsets.Append(1) - offsetValuesBuilder.Append(offsetMinutes16) - } else { - offsets.ContinueRun(1) - } - b.lastOffset = offsetMinutes16 + timestamps.UnsafeAppend(timestamp) + offsetMinutes16 := int16(offsetMinutes) + // If value at i is null, simply continue the run to maximize compression + if valids[i] && offsetMinutes != b.lastOffset { + offsets.Append(1) + offsetValuesBuilder.Append(offsetMinutes16) } else { - timestamps.UnsafeAppendBoolToBitmap(false) - offsets.AppendNull() + offsets.ContinueRun(1) } + b.lastOffset = offsetMinutes16 } } } From f225db38b5e726a8aec3f9a24249a18cc04b6425 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Wed, 15 Apr 2026 12:35:21 +0200 Subject: [PATCH 26/39] Fix lint --- arrow/extensions/timestamp_with_offset.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index b0a5899d..43779e41 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -17,7 +17,6 @@ package extensions import ( - "errors" "fmt" "iter" "math" @@ -95,13 +94,13 @@ func NewTimestampWithOffsetType(unit arrow.TimeUnit) *TimestampWithOffsetType { return v } -// NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to +// NewTimestampWithOffsetTypeCustomOffset creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=O), where T is any TimeUnit and O is a valid offset type. // // The error will be populated if the data type is not a valid encoding of the offsets field. func NewTimestampWithOffsetTypeCustomOffset(unit arrow.TimeUnit, offsetType arrow.DataType) (*TimestampWithOffsetType, error) { if !isOffsetTypeOk(offsetType) { - return nil, errors.New(fmt.Sprintf("Invalid offset type %s", offsetType)) + return nil, fmt.Errorf("invalid offset type %s", offsetType) } return &TimestampWithOffsetType{ @@ -130,12 +129,11 @@ type DictIndexType interface { *arrow.Uint8Type | *arrow.Uint16Type | *arrow.Uint32Type | *arrow.Uint64Type } - type RunEndsType interface { *arrow.Int16Type | *arrow.Int32Type | *arrow.Int64Type } -// NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to +// NewTimestampWithOffsetTypeDictionaryEncoded creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=Dictionary(I, Int16)), where T is any TimeUnit and I is a // valid Dictionary index type. func NewTimestampWithOffsetTypeDictionaryEncoded[I DictIndexType](unit arrow.TimeUnit, index I) *TimestampWithOffsetType { @@ -150,7 +148,7 @@ func NewTimestampWithOffsetTypeDictionaryEncoded[I DictIndexType](unit arrow.Tim return v } -// NewTimestampWithOffsetType creates a new TimestampWithOffsetType with the underlying storage type set correctly to +// NewTimestampWithOffsetTypeRunEndEncoded creates a new TimestampWithOffsetType with the underlying storage type set correctly to // Struct(timestamp=Timestamp(T, "UTC"), offset_minutes=RunEndEncoded(E, Int16)), where T is any TimeUnit and E is a // valid run-ends type. func NewTimestampWithOffsetTypeRunEndEncoded[E RunEndsType](unit arrow.TimeUnit, runEnds E) *TimestampWithOffsetType { @@ -350,7 +348,7 @@ func (a *TimestampWithOffsetArray) iterValues() iter.Seq2[time.Time, bool] { } func (a *TimestampWithOffsetArray) Values() []time.Time { - return slices.Collect(func (yield func (time.Time) bool) { + return slices.Collect(func(yield func(time.Time) bool) { for time := range a.iterValues() { if !yield(time) { return @@ -422,7 +420,7 @@ func NewTimestampWithOffsetBuilder(mem memory.Allocator, unit arrow.TimeUnit, of func (b *TimestampWithOffsetBuilder) Append(v time.Time) { timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) offsetMinutes16 := int16(offsetMinutes) - structBuilder := b.ExtensionBuilder.Builder.(*array.StructBuilder) + structBuilder := b.Builder.(*array.StructBuilder) structBuilder.Append(true) structBuilder.FieldBuilder(0).(*array.TimestampBuilder).Append(timestamp) @@ -464,7 +462,7 @@ func (b *TimestampWithOffsetBuilder) AppendValueFromString(s string) error { } func (b *TimestampWithOffsetBuilder) AppendValues(values []time.Time, valids []bool) { - structBuilder := b.ExtensionBuilder.Builder.(*array.StructBuilder) + structBuilder := b.Builder.(*array.StructBuilder) timestamps := structBuilder.FieldBuilder(0).(*array.TimestampBuilder) structBuilder.AppendValues(valids) From 1a7412ebeb60b9b0ff41669da7f63e34d19eb55e Mon Sep 17 00:00:00 2001 From: serramatutu Date: Wed, 15 Apr 2026 13:10:38 +0200 Subject: [PATCH 27/39] Simplify `fieldValuesFromTime()` implementation --- arrow/extensions/timestamp_with_offset.go | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 43779e41..471a3117 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -239,15 +239,10 @@ func timeFromFieldValues(utcTimestamp arrow.Timestamp, offsetMinutes int16, unit } func fieldValuesFromTime(t time.Time, unit arrow.TimeUnit) (arrow.Timestamp, int16) { - // naive "bitwise" conversion to UTC, keeping the underlying date the same - utc := t.UTC() - naiveUtc := time.Date(t.Year(), t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), time.UTC) - offsetMinutes := int16(naiveUtc.Sub(t).Minutes()) - - // SAFETY: unit MUST have been validated to a valid arrow.TimeUnit value before - // this function. Otherwise, ignoring this error is not safe. - timestamp, _ := arrow.TimestampFromTime(utc, unit) - return timestamp, offsetMinutes + _, offsetSeconds := t.Zone() + offsetMinutes := int16(offsetSeconds / 60) + ts, _ := arrow.TimestampFromTime(t.UTC(), unit) + return ts, offsetMinutes } // Get the raw arrow values at the given index From f58d662e0fd31b72c7551999ffd1e868ea3689db Mon Sep 17 00:00:00 2001 From: serramatutu Date: Thu, 30 Apr 2026 11:53:53 +0200 Subject: [PATCH 28/39] Add subtests --- .../extensions/timestamp_with_offset_test.go | 278 +++++++++--------- 1 file changed, 143 insertions(+), 135 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index 1af73a94..897a641e 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -60,24 +60,26 @@ func ree(runEnds arrow.DataType) arrow.DataType { // All tests use this in a for loop to make sure everything works for every possible // encoding of offsets (primitive, dictionary, run-end) -var allAllowedOffsetTypes = []arrow.DataType{ +var allAllowedOffsetTypes = make(map[string]arrow.DataType) + +func init() { // primitive offsetType - arrow.PrimitiveTypes.Int16, + allAllowedOffsetTypes["primitive-int16"] = arrow.PrimitiveTypes.Int16 // dict-encoded offsetType - dict(arrow.PrimitiveTypes.Uint8), - dict(arrow.PrimitiveTypes.Uint16), - dict(arrow.PrimitiveTypes.Uint32), - dict(arrow.PrimitiveTypes.Uint64), - dict(arrow.PrimitiveTypes.Int8), - dict(arrow.PrimitiveTypes.Int16), - dict(arrow.PrimitiveTypes.Int32), - dict(arrow.PrimitiveTypes.Int64), + allAllowedOffsetTypes["dict-Uint8"] = dict(arrow.PrimitiveTypes.Uint8) + allAllowedOffsetTypes["dict-Uint16"] = dict(arrow.PrimitiveTypes.Uint16) + allAllowedOffsetTypes["dict-Uint32"] = dict(arrow.PrimitiveTypes.Uint32) + allAllowedOffsetTypes["dict-Uint64"] = dict(arrow.PrimitiveTypes.Uint64) + allAllowedOffsetTypes["dict-Int8"] = dict(arrow.PrimitiveTypes.Int8) + allAllowedOffsetTypes["dict-Int16"] = dict(arrow.PrimitiveTypes.Int16) + allAllowedOffsetTypes["dict-Int32"] = dict(arrow.PrimitiveTypes.Int32) + allAllowedOffsetTypes["dict-Int64"] = dict(arrow.PrimitiveTypes.Int64) // run-end encoded offsetType - ree(arrow.PrimitiveTypes.Int16), - ree(arrow.PrimitiveTypes.Int32), - ree(arrow.PrimitiveTypes.Int64), + allAllowedOffsetTypes["ree-Int16"] = ree(arrow.PrimitiveTypes.Int16) + allAllowedOffsetTypes["ree-Int32"] = ree(arrow.PrimitiveTypes.Int32) + allAllowedOffsetTypes["ree-Int64"] = ree(arrow.PrimitiveTypes.Int64) } func TestTimestampWithOffsetTypePrimitiveBasics(t *testing.T) { @@ -214,98 +216,101 @@ func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - for _, offsetType := range allAllowedOffsetTypes { - builder, _ := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) - - builder.Append(testDate0) - builder.AppendNull() - builder.Append(testDate1) - builder.Append(testDate2) - builder.Append(epoch) - - // it should build the array with the correct size - arr := builder.NewArray() - typedArr := arr.(*extensions.TimestampWithOffsetArray) - assert.Equal(t, 5, arr.Data().Len()) - defer arr.Release() - - // typedArr.Value(i) should return values adjusted for their original timezone - assert.Equal(t, testDate0, typedArr.Value(0)) - assert.Equal(t, testDate1, typedArr.Value(2)) - assert.Equal(t, testDate2, typedArr.Value(3)) - assert.Equal(t, epoch, typedArr.Value(4)) - - // storage TimeUnit should be the same as we pass in to the builder, and storage timezone should be UTC - timestampStructField := typedArr.Storage().(*array.Struct).Field(0) - timestampStructDataType := timestampStructField.DataType().(*arrow.TimestampType) - assert.Equal(t, timestampStructDataType.Unit, testTimeUnit) - assert.Equal(t, timestampStructDataType.TimeZone, "UTC") - - // stored values should be equivalent to the raw values in UTC - timestampsArr := timestampStructField.(*array.Timestamp) - assert.Equal(t, testDate0.In(time.UTC), timestampsArr.Value(0).ToTime(testTimeUnit)) - assert.Equal(t, testDate1.In(time.UTC), timestampsArr.Value(2).ToTime(testTimeUnit)) - assert.Equal(t, testDate2.In(time.UTC), timestampsArr.Value(3).ToTime(testTimeUnit)) - assert.Equal(t, epoch.In(time.UTC), timestampsArr.Value(4).ToTime(testTimeUnit)) - - // the array should encode itself as JSON and string - arrStr := arr.String() - assert.Equal(t, fmt.Sprintf(`["%[1]s" (null) "%[2]s" "%[3]s" "%[4]s"]`, testDate0, testDate1, testDate2, epoch), arrStr) - jsonStr, err := json.Marshal(arr) - assert.NoError(t, err) - - // roundtripping from JSON with array.FromJSON should work - expectedDataType, _ := extensions.NewTimestampWithOffsetTypeCustomOffset(testTimeUnit, offsetType) - roundtripped, _, err := array.FromJSON(mem, expectedDataType, bytes.NewReader(jsonStr)) - defer roundtripped.Release() - assert.NoError(t, err) - assert.Truef(t, array.Equal(arr, roundtripped), "expected %s\n\ngot %s", arr, roundtripped) + for name, offsetType := range allAllowedOffsetTypes { + t.Run(name, func(t *testing.T) { + builder, _ := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) + + builder.Append(testDate0) + builder.AppendNull() + builder.Append(testDate1) + builder.Append(testDate2) + builder.Append(epoch) + + // it should build the array with the correct size + arr := builder.NewArray() + typedArr := arr.(*extensions.TimestampWithOffsetArray) + assert.Equal(t, 5, arr.Data().Len()) + defer arr.Release() + + // typedArr.Value(i) should return values adjusted for their original timezone + assert.Equal(t, testDate0, typedArr.Value(0)) + assert.Equal(t, testDate1, typedArr.Value(2)) + assert.Equal(t, testDate2, typedArr.Value(3)) + assert.Equal(t, epoch, typedArr.Value(4)) + + // storage TimeUnit should be the same as we pass in to the builder, and storage timezone should be UTC + timestampStructField := typedArr.Storage().(*array.Struct).Field(0) + timestampStructDataType := timestampStructField.DataType().(*arrow.TimestampType) + assert.Equal(t, timestampStructDataType.Unit, testTimeUnit) + assert.Equal(t, timestampStructDataType.TimeZone, "UTC") + + // stored values should be equivalent to the raw values in UTC + timestampsArr := timestampStructField.(*array.Timestamp) + assert.Equal(t, testDate0.In(time.UTC), timestampsArr.Value(0).ToTime(testTimeUnit)) + assert.Equal(t, testDate1.In(time.UTC), timestampsArr.Value(2).ToTime(testTimeUnit)) + assert.Equal(t, testDate2.In(time.UTC), timestampsArr.Value(3).ToTime(testTimeUnit)) + assert.Equal(t, epoch.In(time.UTC), timestampsArr.Value(4).ToTime(testTimeUnit)) + + // the array should encode itself as JSON and string + arrStr := arr.String() + assert.Equal(t, fmt.Sprintf(`["%[1]s" (null) "%[2]s" "%[3]s" "%[4]s"]`, testDate0, testDate1, testDate2, epoch), arrStr) + jsonStr, err := json.Marshal(arr) + assert.NoError(t, err) + + // roundtripping from JSON with array.FromJSON should work + expectedDataType, _ := extensions.NewTimestampWithOffsetTypeCustomOffset(testTimeUnit, offsetType) + roundtripped, _, err := array.FromJSON(mem, expectedDataType, bytes.NewReader(jsonStr)) + defer roundtripped.Release() + assert.NoError(t, err) + assert.Truef(t, array.Equal(arr, roundtripped), "expected %s\n\ngot %s", arr, roundtripped) + }) } } func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { - for _, offsetType := range allAllowedOffsetTypes { - dataType, _ := extensions.NewTimestampWithOffsetTypeCustomOffset(testTimeUnit, offsetType) - schema := arrow.NewSchema([]arrow.Field{ - { - Name: "timestamp_with_offset", - Nullable: true, - Type: dataType, - }, - }, nil) - builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) - defer builder.Release() + for name, offsetType := range allAllowedOffsetTypes { + t.Run(name, func(t *testing.T) { + dataType, _ := extensions.NewTimestampWithOffsetTypeCustomOffset(testTimeUnit, offsetType) + schema := arrow.NewSchema([]arrow.Field{ + { + Name: "timestamp_with_offset", + Nullable: true, + Type: dataType, + }, + }, nil) + builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) + defer builder.Release() - fieldBuilder := builder.Field(0).(*extensions.TimestampWithOffsetBuilder) + fieldBuilder := builder.Field(0).(*extensions.TimestampWithOffsetBuilder) - // append a simple time.Time - fieldBuilder.Append(testDate0) + // append a simple time.Time + fieldBuilder.Append(testDate0) - // append the epoch - fieldBuilder.Append(epoch) + // append the epoch + fieldBuilder.Append(epoch) - // append a null and 2 time.Time all at once - values := []time.Time{ - time.Unix(0, 0).In(time.UTC), - testDate1, - testDate2, - } - valids := []bool{false, true, true} - fieldBuilder.AppendValues(values, valids) + // append a null and 2 time.Time all at once + values := []time.Time{ + time.Unix(0, 0).In(time.UTC), + testDate1, + testDate2, + } + valids := []bool{false, true, true} + fieldBuilder.AppendValues(values, valids) - // append a value from RFC3339 string - fieldBuilder.AppendValueFromString(testDate0.Format(time.RFC3339)) + // append a value from RFC3339 string + fieldBuilder.AppendValueFromString(testDate0.Format(time.RFC3339)) - // append value formatted in a different string layout - fieldBuilder.Layout = time.RFC3339Nano - fieldBuilder.AppendValueFromString(testDate1.Format(time.RFC3339Nano)) + // append value formatted in a different string layout + fieldBuilder.Layout = time.RFC3339Nano + fieldBuilder.AppendValueFromString(testDate1.Format(time.RFC3339Nano)) - record := builder.NewRecordBatch() + record := builder.NewRecordBatch() - // Record batch should JSON-encode values containing per-row timezone info - json, err := record.MarshalJSON() - require.NoError(t, err) - expect := `[{"timestamp_with_offset":"2025-01-01T00:00:00Z"} + // Record batch should JSON-encode values containing per-row timezone info + json, err := record.MarshalJSON() + require.NoError(t, err) + expect := `[{"timestamp_with_offset":"2025-01-01T00:00:00Z"} ,{"timestamp_with_offset":"1970-01-01T00:00:00Z"} ,{"timestamp_with_offset":null} ,{"timestamp_with_offset":"2024-12-31T15:30:00-08:30"} @@ -313,14 +318,15 @@ func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { ,{"timestamp_with_offset":"2025-01-01T00:00:00Z"} ,{"timestamp_with_offset":"2024-12-31T15:30:00-08:30"} ]` - require.Equal(t, expect, string(json)) - - // Record batch roundtrip to JSON should work - roundtripped, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(json)) - require.NoError(t, err) - defer roundtripped.Release() - require.Equal(t, schema, roundtripped.Schema()) - assert.Truef(t, array.RecordEqual(record, roundtripped), "expected %s\n\ngot %s", record, roundtripped) + require.Equal(t, expect, string(json)) + + // Record batch roundtrip to JSON should work + roundtripped, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(json)) + require.NoError(t, err) + defer roundtripped.Release() + require.Equal(t, schema, roundtripped.Schema()) + assert.Truef(t, array.RecordEqual(record, roundtripped), "expected %s\n\ngot %s", record, roundtripped) + }) } } @@ -328,41 +334,43 @@ func TestTimestampWithOffsetTypeBatchIPCRoundTrip(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - for _, offsetType := range allAllowedOffsetTypes { - builder, _ := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) - builder.Append(testDate0) - builder.AppendNull() - builder.Append(testDate1) - builder.Append(testDate2) - builder.Append(epoch) - arr := builder.NewArray() - defer arr.Release() - - typ, _ := extensions.NewTimestampWithOffsetTypeCustomOffset(testTimeUnit, offsetType) - - batch := array.NewRecordBatch(arrow.NewSchema([]arrow.Field{{Name: "timestamp_with_offset", Type: typ, Nullable: true}}, nil), []arrow.Array{arr}, -1) - defer batch.Release() - - var written arrow.RecordBatch - { - var buf bytes.Buffer - wr := ipc.NewWriter(&buf, ipc.WithSchema(batch.Schema())) - require.NoError(t, wr.Write(batch)) - require.NoError(t, wr.Close()) - - rdr, err := ipc.NewReader(&buf) - require.NoError(t, err) - written, err = rdr.Read() - require.NoError(t, err) - written.Retain() - defer written.Release() - rdr.Release() - } + for name, offsetType := range allAllowedOffsetTypes { + t.Run(name, func(t *testing.T) { + builder, _ := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) + builder.Append(testDate0) + builder.AppendNull() + builder.Append(testDate1) + builder.Append(testDate2) + builder.Append(epoch) + arr := builder.NewArray() + defer arr.Release() - assert.Truef(t, batch.Schema().Equal(written.Schema()), "expected: %s\n\ngot: %s", - batch.Schema(), written.Schema()) + typ, _ := extensions.NewTimestampWithOffsetTypeCustomOffset(testTimeUnit, offsetType) - assert.Truef(t, array.RecordEqual(batch, written), "expected: %s\n\ngot: %s", - batch, written) + batch := array.NewRecordBatch(arrow.NewSchema([]arrow.Field{{Name: "timestamp_with_offset", Type: typ, Nullable: true}}, nil), []arrow.Array{arr}, -1) + defer batch.Release() + + var written arrow.RecordBatch + { + var buf bytes.Buffer + wr := ipc.NewWriter(&buf, ipc.WithSchema(batch.Schema())) + require.NoError(t, wr.Write(batch)) + require.NoError(t, wr.Close()) + + rdr, err := ipc.NewReader(&buf) + require.NoError(t, err) + written, err = rdr.Read() + require.NoError(t, err) + written.Retain() + defer written.Release() + rdr.Release() + } + + assert.Truef(t, batch.Schema().Equal(written.Schema()), "expected: %s\n\ngot: %s", + batch.Schema(), written.Schema()) + + assert.Truef(t, array.RecordEqual(batch, written), "expected: %s\n\ngot: %s", + batch, written) + }) } } From b6f73a4aae73612d8925f1f3b11165d6eebf9ec0 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 2 Jul 2026 13:58:11 -0400 Subject: [PATCH 29/39] Post-rebase fixes for CI: JSON int precision, GetOneForMarshal, TinyGo record.go/struct.go: call UseNumber() on the per-field sub-decoder in the nullable decode path so large int64 keeps full precision (regression against #816, surfaced by the rebase). json_reader_test.go: pass the nullable flag to the two-arg GetOneForMarshal. internal/json/json_stdlib.go: add IsNullMessage to the tinygo/stdlib json variant (it was only added to the goccy build), fixing the TinyGo 'undefined: json.IsNullMessage' example build. --- arrow/array/json_reader_test.go | 2 +- arrow/array/record.go | 1 + arrow/array/struct.go | 1 + internal/json/json_stdlib.go | 5 +++++ 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arrow/array/json_reader_test.go b/arrow/array/json_reader_test.go index 3d0def65..fc7cedea 100644 --- a/arrow/array/json_reader_test.go +++ b/arrow/array/json_reader_test.go @@ -253,7 +253,7 @@ func recordBatchToNDJSON(t *testing.T, rec arrow.RecordBatch) string { defer arr.Release() for pos := range arr.Len() { - s, err := json.Marshal(arr.GetOneForMarshal(pos)) + s, err := json.Marshal(arr.GetOneForMarshal(pos, true)) assert.NoError(t, err) sb.Write(s) sb.WriteByte('\n') diff --git a/arrow/array/record.go b/arrow/array/record.go index 2f8948d4..4b7b17bd 100644 --- a/arrow/array/record.go +++ b/arrow/array/record.go @@ -466,6 +466,7 @@ func (b *RecordBuilder) UnmarshalOne(dec *json.Decoder) error { b.fields[idx].AppendEmptyValue() } else { sub := json.NewDecoder(bytes.NewReader(next)) + sub.UseNumber() if err := b.fields[idx].UnmarshalOne(sub); err != nil { return err } diff --git a/arrow/array/struct.go b/arrow/array/struct.go index 63ccbf59..17f6152a 100644 --- a/arrow/array/struct.go +++ b/arrow/array/struct.go @@ -498,6 +498,7 @@ func (b *StructBuilder) UnmarshalOne(dec *json.Decoder) error { b.fields[idx].AppendEmptyValue() } else { sub := json.NewDecoder(bytes.NewReader(next)) + sub.UseNumber() if err := b.fields[idx].UnmarshalOne(sub); err != nil { return err } diff --git a/internal/json/json_stdlib.go b/internal/json/json_stdlib.go index 3031029d..06071fad 100644 --- a/internal/json/json_stdlib.go +++ b/internal/json/json_stdlib.go @@ -20,6 +20,7 @@ package json import ( + "bytes" "io" "encoding/json" @@ -49,3 +50,7 @@ func NewDecoder(r io.Reader) *Decoder { func NewEncoder(w io.Writer) *Encoder { return json.NewEncoder(w) } + +func IsNullMessage(m RawMessage) bool { + return bytes.Equal(m, []byte("null")) +} From 79cf1808699050bdee00dd7fc46aa311d5666d27 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 2 Jul 2026 15:34:50 -0400 Subject: [PATCH 30/39] Make nested nullability field-local in comparison and union marshal A non-nullable list/union/struct can legitimately hold nullable children, so equality and JSON serialization must honor each child/element field's own nullability rather than propagating the parent's. Previously the parent nullability was pushed down, so null child slots were compared and serialized by their arbitrary underlying bytes -- which round-trips inconsistently and breaks cross-language integration (union, nested_large_offsets) even when values are logically equal. compare.go: unions and run-end-encoded arrays have no top-level validity bitmap, so skip their top-level null-count/validity checks (and the all-null shortcut); recurse into list/struct/map/union children with the child field's nullability. union.go: SparseUnion/DenseUnion GetOneForMarshal use the selected child field's nullability and emit [typeID, null] for a null child so it round-trips as null. --- arrow/array/compare.go | 55 +++++++++++++++++++++++++++++++++--------- arrow/array/union.go | 24 ++++++++++-------- 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/arrow/array/compare.go b/arrow/array/compare.go index ec56cb6d..07f475bf 100644 --- a/arrow/array/compare.go +++ b/arrow/array/compare.go @@ -253,7 +253,7 @@ func equal(left, right arrow.Array, opt equalOption) bool { return false case left.Len() == 0: return true - case opt.nullable && left.NullN() == left.Len(): + case opt.nullable && hasTopLevelValidityBitmap(left.DataType().ID()) && left.NullN() == left.Len(): return true } @@ -527,7 +527,7 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool { return false case left.Len() == 0: return true - case opt.nullable && left.NullN() == left.Len(): + case opt.nullable && hasTopLevelValidityBitmap(left.DataType().ID()) && left.NullN() == left.Len(): return true } @@ -678,14 +678,34 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool { } } +func withNullable(opt equalOption, nullable bool) equalOption { + opt.nullable = nullable + return opt +} + +// hasTopLevelValidityBitmap reports whether arrays of the given type id carry +// their own top-level validity bitmap. Union and run-end-encoded arrays do not: +// their nullness is encoded entirely in their children, so top-level null-count +// and validity comparisons are not meaningful for them and can legitimately +// differ between logically-equal arrays (e.g. built from JSON vs. read from IPC). +func hasTopLevelValidityBitmap(id arrow.Type) bool { + switch id { + case arrow.SPARSE_UNION, arrow.DENSE_UNION, arrow.RUN_END_ENCODED: + return false + } + return true +} + func baseArrayEqual(left, right arrow.Array, opt equalOption) bool { switch { case left.Len() != right.Len(): return false - case opt.nullable && left.NullN() != right.NullN(): - return false case !arrow.TypeEqual(left.DataType(), right.DataType()): // We do not check for metadata as in the C++ implementation. return false + case !hasTopLevelValidityBitmap(left.DataType().ID()): + return true + case opt.nullable && left.NullN() != right.NullN(): + return false case opt.nullable && !validityBitmapEqual(left, right): return false } @@ -779,6 +799,7 @@ func arrayApproxEqualFloat64(left, right *Float64, opt equalOption) bool { } func arrayApproxEqualList(left, right *List, opt equalOption) bool { + childOpt := withNullable(opt, left.DataType().(arrow.ListLikeType).ElemField().Nullable) for i := 0; i < left.Len(); i++ { if opt.nullable && left.IsNull(i) { continue @@ -788,7 +809,7 @@ func arrayApproxEqualList(left, right *List, opt equalOption) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return arrayApproxEqual(l, r, opt) + return arrayApproxEqual(l, r, childOpt) }() if !o { return false @@ -798,6 +819,7 @@ func arrayApproxEqualList(left, right *List, opt equalOption) bool { } func arrayApproxEqualLargeList(left, right *LargeList, opt equalOption) bool { + childOpt := withNullable(opt, left.DataType().(arrow.ListLikeType).ElemField().Nullable) for i := 0; i < left.Len(); i++ { if opt.nullable && left.IsNull(i) { continue @@ -807,7 +829,7 @@ func arrayApproxEqualLargeList(left, right *LargeList, opt equalOption) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return arrayApproxEqual(l, r, opt) + return arrayApproxEqual(l, r, childOpt) }() if !o { return false @@ -817,6 +839,7 @@ func arrayApproxEqualLargeList(left, right *LargeList, opt equalOption) bool { } func arrayApproxEqualListView(left, right *ListView, opt equalOption) bool { + childOpt := withNullable(opt, left.DataType().(arrow.ListLikeType).ElemField().Nullable) for i := 0; i < left.Len(); i++ { if opt.nullable && left.IsNull(i) { continue @@ -826,7 +849,7 @@ func arrayApproxEqualListView(left, right *ListView, opt equalOption) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return arrayApproxEqual(l, r, opt) + return arrayApproxEqual(l, r, childOpt) }() if !o { return false @@ -836,6 +859,7 @@ func arrayApproxEqualListView(left, right *ListView, opt equalOption) bool { } func arrayApproxEqualLargeListView(left, right *LargeListView, opt equalOption) bool { + childOpt := withNullable(opt, left.DataType().(arrow.ListLikeType).ElemField().Nullable) for i := 0; i < left.Len(); i++ { if opt.nullable && left.IsNull(i) { continue @@ -845,7 +869,7 @@ func arrayApproxEqualLargeListView(left, right *LargeListView, opt equalOption) defer l.Release() r := right.newListValue(i) defer r.Release() - return arrayApproxEqual(l, r, opt) + return arrayApproxEqual(l, r, childOpt) }() if !o { return false @@ -855,6 +879,7 @@ func arrayApproxEqualLargeListView(left, right *LargeListView, opt equalOption) } func arrayApproxEqualFixedSizeList(left, right *FixedSizeList, opt equalOption) bool { + childOpt := withNullable(opt, left.DataType().(arrow.ListLikeType).ElemField().Nullable) for i := 0; i < left.Len(); i++ { if opt.nullable && left.IsNull(i) { continue @@ -864,7 +889,7 @@ func arrayApproxEqualFixedSizeList(left, right *FixedSizeList, opt equalOption) defer l.Release() r := right.newListValue(i) defer r.Release() - return arrayApproxEqual(l, r, opt) + return arrayApproxEqual(l, r, childOpt) }() if !o { return false @@ -886,9 +911,11 @@ func arrayApproxEqualStruct(left, right *Struct, opt equalOption) bool { } func approxEqualStructRun(left, right *Struct, opt equalOption) bitutils.VisitFn { + st := left.DataType().(*arrow.StructType) return func(pos int64, length int64) error { for i := range left.fields { - if !sliceApproxEqual(left.fields[i], pos, pos+length, right.fields[i], pos, pos+length, opt) { + childOpt := withNullable(opt, st.Field(i).Nullable) + if !sliceApproxEqual(left.fields[i], pos, pos+length, right.fields[i], pos, pos+length, childOpt) { return arrow.ErrInvalid } } @@ -928,6 +955,10 @@ func arrayApproxEqualSingleMapEntry(left, right *Struct, opt equalOption) bool { return true } + st := left.DataType().(*arrow.StructType) + keyOpt := withNullable(opt, st.Field(0).Nullable) + valOpt := withNullable(opt, st.Field(1).Nullable) + used := make(map[int]bool, right.Len()) for i := 0; i < left.Len(); i++ { if opt.nullable && left.IsNull(i) { @@ -948,12 +979,12 @@ func arrayApproxEqualSingleMapEntry(left, right *Struct, opt equalOption) bool { rBeg, rEnd := int64(j), int64(j+1) // check keys (field 0) - if !sliceApproxEqual(left.Field(0), lBeg, lEnd, right.Field(0), rBeg, rEnd, opt) { + if !sliceApproxEqual(left.Field(0), lBeg, lEnd, right.Field(0), rBeg, rEnd, keyOpt) { continue } // only now check the values - if sliceApproxEqual(left.Field(1), lBeg, lEnd, right.Field(1), rBeg, rEnd, opt) { + if sliceApproxEqual(left.Field(1), lBeg, lEnd, right.Field(1), rBeg, rEnd, valOpt) { found = true used[j] = true break diff --git a/arrow/array/union.go b/arrow/array/union.go index 1d40ad66..c10e9964 100644 --- a/arrow/array/union.go +++ b/arrow/array/union.go @@ -320,17 +320,18 @@ func (a *SparseUnion) setData(data *Data) { debug.Assert(a.data.buffers[0] == nil, "arrow/array: validity bitmap for sparse unions should be nil") } -func (a *SparseUnion) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *SparseUnion) GetOneForMarshal(i int, nullable bool) any { typeID := a.RawTypeCodes()[i] childID := a.ChildID(i) data := a.Field(childID) - if nullable && data.IsNull(i) { - return nil + childNullable := a.unionType.Fields()[childID].Nullable + if childNullable && data.IsNull(i) { + return []any{typeID, nil} } - return []interface{}{typeID, data.GetOneForMarshal(i, nullable)} + return []any{typeID, data.GetOneForMarshal(i, childNullable)} } func (a *SparseUnion) MarshalJSON() ([]byte, error) { @@ -471,8 +472,9 @@ func arraySparseUnionApproxEqual(l, r *SparseUnion, opt equalOption) bool { } childNum := childIDs[typeID] + childOpt := withNullable(opt, l.unionType.Fields()[childNum].Nullable) eq := sliceApproxEqual(l.children[childNum], int64(i+l.data.offset), int64(i+l.data.offset+1), - r.children[childNum], int64(i+r.data.offset), int64(i+r.data.offset+1), opt) + r.children[childNum], int64(i+r.data.offset), int64(i+r.data.offset+1), childOpt) if !eq { return false } @@ -613,18 +615,19 @@ func (a *DenseUnion) setData(data *Data) { } } -func (a *DenseUnion) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *DenseUnion) GetOneForMarshal(i int, nullable bool) any { typeID := a.RawTypeCodes()[i] childID := a.ChildID(i) data := a.Field(childID) offset := int(a.RawValueOffsets()[i]) - if nullable && data.IsNull(offset) { - return nil + childNullable := a.unionType.Fields()[childID].Nullable + if childNullable && data.IsNull(offset) { + return []any{typeID, nil} } - return []interface{}{typeID, data.GetOneForMarshal(offset, nullable)} + return []any{typeID, data.GetOneForMarshal(offset, childNullable)} } func (a *DenseUnion) MarshalJSON() ([]byte, error) { @@ -715,8 +718,9 @@ func arrayDenseUnionApproxEqual(l, r *DenseUnion, opt equalOption) bool { } childNum := childIDs[typeID] + childOpt := withNullable(opt, l.unionType.Fields()[childNum].Nullable) eq := sliceApproxEqual(l.children[childNum], int64(leftOffsets[i]), int64(leftOffsets[i]+1), - r.children[childNum], int64(rightOffsets[i]), int64(rightOffsets[i]+1), opt) + r.children[childNum], int64(rightOffsets[i]), int64(rightOffsets[i]+1), childOpt) if !eq { return false } From 6f92ce2c3d52b71c6b9dd5a3884ad370f30aaf5e Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 2 Jul 2026 15:53:33 -0400 Subject: [PATCH 31/39] Make exact Equal and chunked comparison field-local too Follow-up to the field-local nullability change (addresses review of 79cf1808): the exact Equal path and the chunked/table helpers still used parent/top-level nullability. arrayEqualList/LargeList/ListView/LargeListView and arrayEqualFixedSizeList now recurse into elements with the element field's nullability (arrayEqualStruct was already field-local; arrayEqualMap delegates to arrayEqualList). chunkedEqual and chunkedApproxEqual now skip the top-level NullN check for unions and run-end-encoded arrays via hasTopLevelValidityBitmap, so Table comparisons don't reject logically-equal union/REE columns. --- arrow/array/compare.go | 4 ++-- arrow/array/fixed_size_list.go | 3 ++- arrow/array/list.go | 12 ++++++++---- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/arrow/array/compare.go b/arrow/array/compare.go index 07f475bf..3c9cdc8e 100644 --- a/arrow/array/compare.go +++ b/arrow/array/compare.go @@ -138,7 +138,7 @@ func chunkedEqual(left, right *arrow.Chunked, opt equalOption) bool { return true case left.Len() != right.Len(): return false - case opt.nullable && left.NullN() != right.NullN(): + case opt.nullable && hasTopLevelValidityBitmap(left.DataType().ID()) && left.NullN() != right.NullN(): return false case !arrow.TypeEqual(left.DataType(), right.DataType()): return false @@ -165,7 +165,7 @@ func chunkedApproxEqual(left, right *arrow.Chunked, opt equalOption) bool { return true case left.Len() != right.Len(): return false - case opt.nullable && left.NullN() != right.NullN(): + case opt.nullable && hasTopLevelValidityBitmap(left.DataType().ID()) && left.NullN() != right.NullN(): return false case !arrow.TypeEqual(left.DataType(), right.DataType()): return false diff --git a/arrow/array/fixed_size_list.go b/arrow/array/fixed_size_list.go index 10dce4c7..d4ddfe8f 100644 --- a/arrow/array/fixed_size_list.go +++ b/arrow/array/fixed_size_list.go @@ -85,6 +85,7 @@ func (a *FixedSizeList) setData(data *Data) { } func arrayEqualFixedSizeList(left, right *FixedSizeList, opt equalOption) bool { + childOpt := withNullable(opt, left.DataType().(arrow.ListLikeType).ElemField().Nullable) for i := 0; i < left.Len(); i++ { if opt.nullable && left.IsNull(i) { continue @@ -94,7 +95,7 @@ func arrayEqualFixedSizeList(left, right *FixedSizeList, opt equalOption) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return equal(l, r, opt) + return equal(l, r, childOpt) }() if !o { return false diff --git a/arrow/array/list.go b/arrow/array/list.go index 16eac56f..a777eab6 100644 --- a/arrow/array/list.go +++ b/arrow/array/list.go @@ -130,6 +130,7 @@ func (a *List) MarshalJSON() ([]byte, error) { } func arrayEqualList(left, right *List, opt equalOption) bool { + childOpt := withNullable(opt, left.DataType().(arrow.ListLikeType).ElemField().Nullable) for i := 0; i < left.Len(); i++ { if opt.nullable && left.IsNull(i) { continue @@ -139,7 +140,7 @@ func arrayEqualList(left, right *List, opt equalOption) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return equal(l, r, opt) + return equal(l, r, childOpt) }() if !o { return false @@ -262,6 +263,7 @@ func (a *LargeList) MarshalJSON() ([]byte, error) { } func arrayEqualLargeList(left, right *LargeList, opt equalOption) bool { + childOpt := withNullable(opt, left.DataType().(arrow.ListLikeType).ElemField().Nullable) for i := 0; i < left.Len(); i++ { if opt.nullable && left.IsNull(i) { continue @@ -271,7 +273,7 @@ func arrayEqualLargeList(left, right *LargeList, opt equalOption) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return equal(l, r, opt) + return equal(l, r, childOpt) }() if !o { return false @@ -737,6 +739,7 @@ func (a *ListView) MarshalJSON() ([]byte, error) { } func arrayEqualListView(left, right *ListView, opt equalOption) bool { + childOpt := withNullable(opt, left.DataType().(arrow.ListLikeType).ElemField().Nullable) for i := 0; i < left.Len(); i++ { if opt.nullable && left.IsNull(i) { continue @@ -746,7 +749,7 @@ func arrayEqualListView(left, right *ListView, opt equalOption) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return equal(l, r, opt) + return equal(l, r, childOpt) }() if !o { return false @@ -884,6 +887,7 @@ func (a *LargeListView) MarshalJSON() ([]byte, error) { } func arrayEqualLargeListView(left, right *LargeListView, opt equalOption) bool { + childOpt := withNullable(opt, left.DataType().(arrow.ListLikeType).ElemField().Nullable) for i := 0; i < left.Len(); i++ { if opt.nullable && left.IsNull(i) { continue @@ -893,7 +897,7 @@ func arrayEqualLargeListView(left, right *LargeListView, opt equalOption) bool { defer l.Release() r := right.newListValue(i) defer r.Release() - return equal(l, r, opt) + return equal(l, r, childOpt) }() if !o { return false From 778d8d77a6d11cc2f4d9198e8d4e2c59575d010f Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 2 Jul 2026 16:02:00 -0400 Subject: [PATCH 32/39] Preserve nullability option in exact chunked comparison Addresses review of 6f92ce2c: chunkedEqual called exported SliceEqual, which rebuilds default options and drops the caller's nullable setting, so TableEqual/ChunkedEqual with WithNullable(false) re-compared each chunk slice as nullable. Call the private sliceEqual(..., opt) instead, matching chunkedApproxEqual. --- arrow/array/compare.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/array/compare.go b/arrow/array/compare.go index 3c9cdc8e..f30f1a62 100644 --- a/arrow/array/compare.go +++ b/arrow/array/compare.go @@ -146,7 +146,7 @@ func chunkedEqual(left, right *arrow.Chunked, opt equalOption) bool { var isequal = true chunkedBinaryApply(left, right, func(left arrow.Array, lbeg, lend int64, right arrow.Array, rbeg, rend int64) bool { - isequal = SliceEqual(left, lbeg, lend, right, rbeg, rend) + isequal = sliceEqual(left, lbeg, lend, right, rbeg, rend, opt) return isequal }) From 1befc5ee90807659da4e53c7d26aa701700ecd17 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 2 Jul 2026 18:57:05 -0400 Subject: [PATCH 33/39] Address review feedback on nullability-aware comparison and REE offsets - array/compare.go: read the right-hand field from right.Schema() (not left.Schema()) in recordEqual, recordApproxEqual, tableEqual, and tableApproxEqual, so field/schema differences (name, nullability, metadata) between the two sides are actually detected instead of a field being compared against itself. - extensions/timestamp_with_offset.go: fix the run-end-encoded offset builder in AppendValues. Start a run when none exists yet (e.g. leading null rows) instead of continuing a nonexistent run, and only update lastOffset when a new run starts. This stops a leading null from corrupting the run-ends/values children and stops a null between equal offsets from splitting one contiguous run into two. Add a noLastOffset sentinel and handle nil valids. - array/record_test.go: fix a malformed JSON map-entry literal (missing comma) in TestRecordBuilder. - ipc/metadata_test.go: TestUnrecognizedExtensionType now compares against the real preserved extension metadata (name arrow.uuid, empty serialized value). The previous placeholder only passed because record comparison ignored field metadata before the compare.go fix. - extensions: add regression tests for leading-null AppendValues across all offset encodings and for a null between equal offsets keeping a single run-end-encoded run. --- arrow/array/compare.go | 8 +-- arrow/array/record_test.go | 2 +- arrow/extensions/timestamp_with_offset.go | 29 ++++++++--- .../extensions/timestamp_with_offset_test.go | 51 +++++++++++++++++++ arrow/ipc/metadata_test.go | 2 +- 5 files changed, 79 insertions(+), 13 deletions(-) diff --git a/arrow/array/compare.go b/arrow/array/compare.go index f30f1a62..b34c025a 100644 --- a/arrow/array/compare.go +++ b/arrow/array/compare.go @@ -40,7 +40,7 @@ func recordEqual(left, right arrow.RecordBatch, opt equalOption) bool { for i := range left.Columns() { lf := left.Schema().Field(i) - rf := left.Schema().Field(i) + rf := right.Schema().Field(i) if !lf.Equal(rf) { return false } @@ -72,7 +72,7 @@ func recordApproxEqual(left, right arrow.RecordBatch, opt equalOption) bool { for i := range left.Columns() { lf := left.Schema().Field(i) - rf := left.Schema().Field(i) + rf := right.Schema().Field(i) if !lf.Equal(rf) { return false } @@ -195,7 +195,7 @@ func tableEqual(left, right arrow.Table, opt equalOption) bool { for i := 0; int64(i) < left.NumCols(); i++ { lf := left.Schema().Field(i) - rf := left.Schema().Field(i) + rf := right.Schema().Field(i) if !lf.Equal(rf) { return false } @@ -226,7 +226,7 @@ func tableApproxEqual(left, right arrow.Table, opt equalOption) bool { for i := 0; int64(i) < left.NumCols(); i++ { lf := left.Schema().Field(i) - rf := left.Schema().Field(i) + rf := right.Schema().Field(i) if !lf.Equal(rf) { return false } diff --git a/arrow/array/record_test.go b/arrow/array/record_test.go index eabb84f3..d07325c7 100644 --- a/arrow/array/record_test.go +++ b/arrow/array/record_test.go @@ -516,7 +516,7 @@ func TestRecordBuilder(t *testing.T) { } } - err := b.UnmarshalJSON([]byte(`{"f1-i32": 6, "f2-f64-notnull": 6.6, "map": [{"key": "4": "value": "d"}]}`)) + err := b.UnmarshalJSON([]byte(`{"f1-i32": 6, "f2-f64-notnull": 6.6, "map": [{"key": "4", "value": "d"}]}`)) assert.NoError(t, err) err = b.UnmarshalJSON([]byte(`{"f1-i32": null, "f2-f64-notnull": null, "map": null}`)) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 471a3117..b74da41d 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -382,6 +382,12 @@ func (a *TimestampWithOffsetArray) GetOneForMarshal(i int, nullable bool) interf return a.Value(i) } +// noLastOffset is the sentinel value for TimestampWithOffsetBuilder.lastOffset +// indicating that no run-end-encoded run has been started yet. It is deliberately +// outside the range of valid timezone offsets in minutes (roughly [-720, 840]) so +// it can never compare equal to a real offset. +const noLastOffset int16 = math.MaxInt16 + // TimestampWithOffsetBuilder is a convenience builder for the TimestampWithOffset extension type, // allowing arrays to be built with boolean values rather than the underlying storage type. type TimestampWithOffsetBuilder struct { @@ -391,7 +397,9 @@ type TimestampWithOffsetBuilder struct { Layout string unit arrow.TimeUnit offsetType arrow.DataType - // lastOffset is only used to determine when to start new runs with run-end encoded offsets + // lastOffset tracks the offset of the current run when the offsets are run-end + // encoded, so we know when to start a new run. It is initialized to noLastOffset + // to signal that no run has been started yet. lastOffset int16 } @@ -406,7 +414,7 @@ func NewTimestampWithOffsetBuilder(mem memory.Allocator, unit arrow.TimeUnit, of return &TimestampWithOffsetBuilder{ unit: unit, offsetType: offsetType, - lastOffset: math.MaxInt16, + lastOffset: noLastOffset, Layout: time.RFC3339, ExtensionBuilder: array.NewExtensionBuilder(mem, dataType), }, nil @@ -483,15 +491,22 @@ func (b *TimestampWithOffsetBuilder) AppendValues(values []time.Time, valids []b for i, v := range values { timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) timestamps.UnsafeAppend(timestamp) - offsetMinutes16 := int16(offsetMinutes) - // If value at i is null, simply continue the run to maximize compression - if valids[i] && offsetMinutes != b.lastOffset { + + // A null row's offset is masked by the struct validity bitmap, so we + // continue the current run to maximize compression. A run must still be + // started when none exists yet (e.g. leading null rows), otherwise + // ContinueRun would advance the run-ends without a matching entry in the + // values child and produce an invalid run-end encoded array. lastOffset + // is only updated when a new run starts, so a null row never splits a + // contiguous run into two adjacent runs sharing the same value. + valid := valids == nil || valids[i] + if b.lastOffset == noLastOffset || (valid && offsetMinutes != b.lastOffset) { offsets.Append(1) - offsetValuesBuilder.Append(offsetMinutes16) + offsetValuesBuilder.Append(offsetMinutes) + b.lastOffset = offsetMinutes } else { offsets.ContinueRun(1) } - b.lastOffset = offsetMinutes16 } } } diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index 897a641e..ed2e5e3c 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -267,6 +267,57 @@ func TestTimestampWithOffsetExtensionBuilder(t *testing.T) { } } +func TestTimestampWithOffsetBuilderAppendValuesLeadingNull(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + for name, offsetType := range allAllowedOffsetTypes { + t.Run(name, func(t *testing.T) { + builder, err := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) + require.NoError(t, err) + + // A leading null as the very first operation on a fresh builder must not + // corrupt the offsets. Reading every value below would panic on a + // run-end encoded array whose run-ends and values children fell out of + // sync (which happened when a run was continued before one existed). + builder.AppendValues([]time.Time{epoch, testDate1, testDate2}, []bool{false, true, true}) + + arr := builder.NewArray() + defer arr.Release() + typedArr := arr.(*extensions.TimestampWithOffsetArray) + + require.Equal(t, 3, arr.Data().Len()) + assert.True(t, typedArr.IsNull(0)) + assert.Equal(t, testDate1, typedArr.Value(1)) + assert.Equal(t, testDate2, typedArr.Value(2)) + }) + } +} + +func TestTimestampWithOffsetBuilderRunEndEncodedNullContinuesRun(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + builder, err := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, ree(arrow.PrimitiveTypes.Int16)) + require.NoError(t, err) + + // The same offset surrounds a null. The null must continue the current run + // rather than reset the tracked offset, otherwise one logical run is split + // into two adjacent runs that share the same value. + builder.AppendValues([]time.Time{testDate1, epoch, testDate1}, []bool{true, false, true}) + + arr := builder.NewArray() + defer arr.Release() + typedArr := arr.(*extensions.TimestampWithOffsetArray) + + offsets := typedArr.Storage().(*array.Struct).Field(1).(*array.RunEndEncoded) + assert.Equal(t, 1, offsets.Values().Len()) + + assert.Equal(t, testDate1, typedArr.Value(0)) + assert.True(t, typedArr.IsNull(1)) + assert.Equal(t, testDate1, typedArr.Value(2)) +} + func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { for name, offsetType := range allAllowedOffsetTypes { t.Run(name, func(t *testing.T) { diff --git a/arrow/ipc/metadata_test.go b/arrow/ipc/metadata_test.go index ac8820dc..eb72f972 100644 --- a/arrow/ipc/metadata_test.go +++ b/arrow/ipc/metadata_test.go @@ -257,7 +257,7 @@ func TestUnrecognizedExtensionType(t *testing.T) { // create a record batch with the same data, but the field should contain the // extension metadata and be of the storage type instead of being the extension type. - extMetadata := arrow.NewMetadata([]string{ExtensionTypeKeyName, ExtensionMetadataKeyName}, []string{"uuid", "uuid-serialized"}) + extMetadata := arrow.NewMetadata([]string{ExtensionTypeKeyName, ExtensionMetadataKeyName}, []string{"arrow.uuid", ""}) batchNoExt := array.NewRecordBatch( arrow.NewSchema([]arrow.Field{ {Name: "f0", Type: storageArr.DataType(), Nullable: true, Metadata: extMetadata}, From 5b96785142e1449ed0f7240d47fb27aadb0e6d1f Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 2 Jul 2026 19:06:25 -0400 Subject: [PATCH 34/39] Fix TimestampWithOffset builder nil validity and reuse handling Address roborev review findings on the run-end-encoded offset builder: - AppendValues now treats a nil valids slice as all-valid so the parent struct and its child builders stay the same length. Previously the struct received zero slots while the children received len(values), producing an inconsistent extension array. - Override NewArray/NewExtensionArray to reset lastOffset to noLastOffset on finalization, so a reused builder starts a fresh run instead of continuing a run that belonged to the array just finalized (which could emit run-ends without a matching value on a run-end-encoded offset). - Add regression tests for nil validity and builder reuse across all offset encodings. --- arrow/extensions/timestamp_with_offset.go | 24 ++++++++- .../extensions/timestamp_with_offset_test.go | 54 +++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index b74da41d..7276fa95 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -420,6 +420,21 @@ func NewTimestampWithOffsetBuilder(mem memory.Allocator, unit arrow.TimeUnit, of }, nil } +// NewArray must route through this type's NewExtensionArray (not the embedded +// ExtensionBuilder's) so that the run-end-encoding tracker is reset on reuse. +func (b *TimestampWithOffsetBuilder) NewArray() arrow.Array { + return b.NewExtensionArray() +} + +// NewExtensionArray finalizes the current array and resets lastOffset so a +// reused builder starts a fresh run instead of continuing a run that belonged +// to the array just finalized (the underlying REE builder is reset too). +func (b *TimestampWithOffsetBuilder) NewExtensionArray() array.ExtensionArray { + arr := b.ExtensionBuilder.NewExtensionArray() + b.lastOffset = noLastOffset + return arr +} + func (b *TimestampWithOffsetBuilder) Append(v time.Time) { timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) offsetMinutes16 := int16(offsetMinutes) @@ -465,6 +480,13 @@ func (b *TimestampWithOffsetBuilder) AppendValueFromString(s string) error { } func (b *TimestampWithOffsetBuilder) AppendValues(values []time.Time, valids []bool) { + if valids == nil { + valids = make([]bool, len(values)) + for i := range valids { + valids[i] = true + } + } + structBuilder := b.Builder.(*array.StructBuilder) timestamps := structBuilder.FieldBuilder(0).(*array.TimestampBuilder) @@ -499,7 +521,7 @@ func (b *TimestampWithOffsetBuilder) AppendValues(values []time.Time, valids []b // values child and produce an invalid run-end encoded array. lastOffset // is only updated when a new run starts, so a null row never splits a // contiguous run into two adjacent runs sharing the same value. - valid := valids == nil || valids[i] + valid := valids[i] if b.lastOffset == noLastOffset || (valid && offsetMinutes != b.lastOffset) { offsets.Append(1) offsetValuesBuilder.Append(offsetMinutes) diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index ed2e5e3c..df5ea1a9 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -318,6 +318,60 @@ func TestTimestampWithOffsetBuilderRunEndEncodedNullContinuesRun(t *testing.T) { assert.Equal(t, testDate1, typedArr.Value(2)) } +func TestTimestampWithOffsetBuilderAppendValuesNilValids(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + for name, offsetType := range allAllowedOffsetTypes { + t.Run(name, func(t *testing.T) { + builder, err := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) + require.NoError(t, err) + + // nil validity must be treated as all-valid so the parent struct and + // its child builders stay the same length. + builder.AppendValues([]time.Time{testDate0, testDate1, testDate2}, nil) + + arr := builder.NewArray() + defer arr.Release() + typedArr := arr.(*extensions.TimestampWithOffsetArray) + + require.Equal(t, 3, arr.Data().Len()) + assert.Equal(t, testDate0, typedArr.Value(0)) + assert.Equal(t, testDate1, typedArr.Value(1)) + assert.Equal(t, testDate2, typedArr.Value(2)) + }) + } +} + +func TestTimestampWithOffsetBuilderReuseAfterNewArray(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + for name, offsetType := range allAllowedOffsetTypes { + t.Run(name, func(t *testing.T) { + builder, err := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) + require.NoError(t, err) + + builder.Append(testDate1) + arr1 := builder.NewArray() + defer arr1.Release() + + // Reusing the builder must start a fresh run even when the first value + // repeats the previous array's offset; otherwise a run-end-encoded + // offset is continued on a freshly reset (empty) builder. + builder.Append(testDate1) + builder.Append(testDate2) + arr2 := builder.NewArray() + defer arr2.Release() + + typedArr := arr2.(*extensions.TimestampWithOffsetArray) + require.Equal(t, 2, arr2.Data().Len()) + assert.Equal(t, testDate1, typedArr.Value(0)) + assert.Equal(t, testDate2, typedArr.Value(1)) + }) + } +} + func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { for name, offsetType := range allAllowedOffsetTypes { t.Run(name, func(t *testing.T) { From 51dcf3fe3afe5781edb1d27bd80f9b5bf5b0fa5a Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 2 Jul 2026 19:10:59 -0400 Subject: [PATCH 35/39] Treat empty validity slice as all-valid in TimestampWithOffset builder Follow-up to the nil-validity fix: an empty (len 0) valids slice is also all-valid, matching the convention of the other Arrow builders. AppendValues now normalizes both nil and empty validity, and panics on a genuine length mismatch (a non-empty valids whose length differs from values). Extend the validity regression test to cover the empty slice and add a test asserting the length-mismatch panic. --- arrow/extensions/timestamp_with_offset.go | 5 ++- .../extensions/timestamp_with_offset_test.go | 42 ++++++++++++------- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 7276fa95..f01adbf7 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -480,7 +480,10 @@ func (b *TimestampWithOffsetBuilder) AppendValueFromString(s string) error { } func (b *TimestampWithOffsetBuilder) AppendValues(values []time.Time, valids []bool) { - if valids == nil { + if len(valids) != len(values) && len(valids) != 0 { + panic("len(values) != len(valids) && len(valids) != 0") + } + if len(valids) == 0 { valids = make([]bool, len(values)) for i := range valids { valids[i] = true diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index df5ea1a9..3403aa27 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -322,27 +322,41 @@ func TestTimestampWithOffsetBuilderAppendValuesNilValids(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) + // nil and empty validity slices must both be treated as all-valid so the + // parent struct and its child builders stay the same length. + validsVariants := map[string][]bool{"nil": nil, "empty": {}} + for name, offsetType := range allAllowedOffsetTypes { - t.Run(name, func(t *testing.T) { - builder, err := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) - require.NoError(t, err) + for validsName, valids := range validsVariants { + t.Run(name+"/"+validsName, func(t *testing.T) { + builder, err := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, offsetType) + require.NoError(t, err) - // nil validity must be treated as all-valid so the parent struct and - // its child builders stay the same length. - builder.AppendValues([]time.Time{testDate0, testDate1, testDate2}, nil) + builder.AppendValues([]time.Time{testDate0, testDate1, testDate2}, valids) - arr := builder.NewArray() - defer arr.Release() - typedArr := arr.(*extensions.TimestampWithOffsetArray) + arr := builder.NewArray() + defer arr.Release() + typedArr := arr.(*extensions.TimestampWithOffsetArray) - require.Equal(t, 3, arr.Data().Len()) - assert.Equal(t, testDate0, typedArr.Value(0)) - assert.Equal(t, testDate1, typedArr.Value(1)) - assert.Equal(t, testDate2, typedArr.Value(2)) - }) + require.Equal(t, 3, arr.Data().Len()) + assert.Equal(t, testDate0, typedArr.Value(0)) + assert.Equal(t, testDate1, typedArr.Value(1)) + assert.Equal(t, testDate2, typedArr.Value(2)) + }) + } } } +func TestTimestampWithOffsetBuilderAppendValuesMismatchedValidsPanics(t *testing.T) { + builder, err := extensions.NewTimestampWithOffsetBuilder(memory.DefaultAllocator, testTimeUnit, arrow.PrimitiveTypes.Int16) + require.NoError(t, err) + defer builder.Release() + + assert.Panics(t, func() { + builder.AppendValues([]time.Time{testDate0, testDate1}, []bool{true}) + }) +} + func TestTimestampWithOffsetBuilderReuseAfterNewArray(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) From 212e4288202852c6a1a1b838a82a0ea4ff171c65 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 2 Jul 2026 19:19:27 -0400 Subject: [PATCH 36/39] Reset REE run tracker on TimestampWithOffset builder null paths Follow-up to the run-end-encoding fixes: AppendNull (and therefore the UnmarshalOne(null) and AppendValueFromString(null) paths that call it) appends a null offset run via the embedded struct builder but left lastOffset stale. A subsequent value repeating the pre-null offset would then continue the null run instead of starting its own, encoding that value with the null offset run. Override AppendNull/AppendNulls to reset lastOffset to noLastOffset, and add a regression test for Append(value), AppendNull(), Append(same-offset value) with run-end-encoded offsets. --- arrow/extensions/timestamp_with_offset.go | 13 ++++++++++ .../extensions/timestamp_with_offset_test.go | 24 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index f01adbf7..0f85a20a 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -435,6 +435,19 @@ func (b *TimestampWithOffsetBuilder) NewExtensionArray() array.ExtensionArray { return arr } +// AppendNull resets the run-end-encoding tracker: the embedded struct builder +// appends a null offset run, so a following value must start a new run instead +// of continuing the null run. +func (b *TimestampWithOffsetBuilder) AppendNull() { + b.ExtensionBuilder.AppendNull() + b.lastOffset = noLastOffset +} + +func (b *TimestampWithOffsetBuilder) AppendNulls(n int) { + b.ExtensionBuilder.AppendNulls(n) + b.lastOffset = noLastOffset +} + func (b *TimestampWithOffsetBuilder) Append(v time.Time) { timestamp, offsetMinutes := fieldValuesFromTime(v, b.unit) offsetMinutes16 := int16(offsetMinutes) diff --git a/arrow/extensions/timestamp_with_offset_test.go b/arrow/extensions/timestamp_with_offset_test.go index 3403aa27..a2484710 100644 --- a/arrow/extensions/timestamp_with_offset_test.go +++ b/arrow/extensions/timestamp_with_offset_test.go @@ -386,6 +386,30 @@ func TestTimestampWithOffsetBuilderReuseAfterNewArray(t *testing.T) { } } +func TestTimestampWithOffsetBuilderAppendNullBetweenEqualOffsets(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + builder, err := extensions.NewTimestampWithOffsetBuilder(mem, testTimeUnit, ree(arrow.PrimitiveTypes.Int16)) + require.NoError(t, err) + + // AppendNull appends a null offset run; the following value repeats the + // pre-null offset and must start its own run instead of being folded into + // the null run (which would decode the trailing value's offset as null). + builder.Append(testDate1) + builder.AppendNull() + builder.Append(testDate1) + + arr := builder.NewArray() + defer arr.Release() + typedArr := arr.(*extensions.TimestampWithOffsetArray) + + require.Equal(t, 3, arr.Data().Len()) + assert.Equal(t, testDate1, typedArr.Value(0)) + assert.True(t, typedArr.IsNull(1)) + assert.Equal(t, testDate1, typedArr.Value(2)) +} + func TestTimestampWithOffsetExtensionRecordBuilder(t *testing.T) { for name, offsetType := range allAllowedOffsetTypes { t.Run(name, func(t *testing.T) { From a8b9af498fe01ff2ee86470d01c48a1fe982f927 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 2 Jul 2026 21:51:44 -0400 Subject: [PATCH 37/39] Compare record/table fields ignoring metadata The right-schema comparison fix routed RecordEqual/RecordApproxEqual/ TableEqual/TableApproxEqual through Field.Equal, which also compares field and type metadata. That rejected logically-equal records differing only in metadata -- for example a PARQUET:field_id attached during a parquet round-trip -- breaking parquet/pqarrow tests such as TestForceLargeTypes. Introduce fieldEqualIgnoringMetadata (name + nullability + metadata- insensitive TypeEqual) and use it in the four comparison functions. It still detects the nullability differences the schema check was added for, while restoring the metadata-insensitive behavior these APIs had before. This makes the earlier metadata_test.go expected-metadata tweak unnecessary, so revert it. --- arrow/array/compare.go | 17 +++++++++++++---- arrow/ipc/metadata_test.go | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/arrow/array/compare.go b/arrow/array/compare.go index b34c025a..77dd7655 100644 --- a/arrow/array/compare.go +++ b/arrow/array/compare.go @@ -25,6 +25,15 @@ import ( "github.com/apache/arrow-go/v18/internal/bitutils" ) +// fieldEqualIgnoringMetadata reports whether two fields describe the same column +// for record/table equality: same name, nullability, and type. Field and type +// metadata are intentionally ignored so records that differ only in metadata +// (for example a PARQUET:field_id attached during a round-trip) still compare as +// equal, preserving the pre-existing behavior of these comparison APIs. +func fieldEqualIgnoringMetadata(l, r arrow.Field) bool { + return l.Name == r.Name && l.Nullable == r.Nullable && arrow.TypeEqual(l.Type, r.Type) +} + // RecordEqual reports whether the two provided records are equal. func RecordEqual(left, right arrow.RecordBatch, opts ...EqualOption) bool { return recordEqual(left, right, newEqualOption(opts...)) @@ -41,7 +50,7 @@ func recordEqual(left, right arrow.RecordBatch, opt equalOption) bool { for i := range left.Columns() { lf := left.Schema().Field(i) rf := right.Schema().Field(i) - if !lf.Equal(rf) { + if !fieldEqualIgnoringMetadata(lf, rf) { return false } @@ -73,7 +82,7 @@ func recordApproxEqual(left, right arrow.RecordBatch, opt equalOption) bool { for i := range left.Columns() { lf := left.Schema().Field(i) rf := right.Schema().Field(i) - if !lf.Equal(rf) { + if !fieldEqualIgnoringMetadata(lf, rf) { return false } @@ -196,7 +205,7 @@ func tableEqual(left, right arrow.Table, opt equalOption) bool { for i := 0; int64(i) < left.NumCols(); i++ { lf := left.Schema().Field(i) rf := right.Schema().Field(i) - if !lf.Equal(rf) { + if !fieldEqualIgnoringMetadata(lf, rf) { return false } @@ -227,7 +236,7 @@ func tableApproxEqual(left, right arrow.Table, opt equalOption) bool { for i := 0; int64(i) < left.NumCols(); i++ { lf := left.Schema().Field(i) rf := right.Schema().Field(i) - if !lf.Equal(rf) { + if !fieldEqualIgnoringMetadata(lf, rf) { return false } diff --git a/arrow/ipc/metadata_test.go b/arrow/ipc/metadata_test.go index eb72f972..ac8820dc 100644 --- a/arrow/ipc/metadata_test.go +++ b/arrow/ipc/metadata_test.go @@ -257,7 +257,7 @@ func TestUnrecognizedExtensionType(t *testing.T) { // create a record batch with the same data, but the field should contain the // extension metadata and be of the storage type instead of being the extension type. - extMetadata := arrow.NewMetadata([]string{ExtensionTypeKeyName, ExtensionMetadataKeyName}, []string{"arrow.uuid", ""}) + extMetadata := arrow.NewMetadata([]string{ExtensionTypeKeyName, ExtensionMetadataKeyName}, []string{"uuid", "uuid-serialized"}) batchNoExt := array.NewRecordBatch( arrow.NewSchema([]arrow.Field{ {Name: "f0", Type: storageArr.DataType(), Nullable: true, Metadata: extMetadata}, From 9b3a302a7c37ddb3548a40a61b5a577bb3177fcd Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 2 Jul 2026 22:01:10 -0400 Subject: [PATCH 38/39] Address roborev findings on record/table field comparison - compare.go: drop the arrow.TypeEqual call from the schema-field check. TypeEqual is not metadata-insensitive for large-list / list-view element types (they fall through to reflect.DeepEqual or Field.Equal), so relying on it could still reject records that differ only in child-field metadata. Field type and values are already validated per column by baseArrayEqual, so the schema-field check now compares only name and nullability (renamed schemaFieldEqual). - ipc/metadata_test.go: use the real preserved extension metadata (arrow.uuid, empty serialized value) and assert on the read-back field metadata explicitly, since record comparison no longer inspects field metadata. --- arrow/array/compare.go | 22 ++++++++++++---------- arrow/ipc/metadata_test.go | 5 ++++- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/arrow/array/compare.go b/arrow/array/compare.go index 77dd7655..348df500 100644 --- a/arrow/array/compare.go +++ b/arrow/array/compare.go @@ -25,13 +25,15 @@ import ( "github.com/apache/arrow-go/v18/internal/bitutils" ) -// fieldEqualIgnoringMetadata reports whether two fields describe the same column -// for record/table equality: same name, nullability, and type. Field and type -// metadata are intentionally ignored so records that differ only in metadata +// schemaFieldEqual reports whether two fields agree on the schema-level +// attributes that record/table equality must match between the two sides: name +// and nullability. The field type and values are validated per column by the +// array comparison (baseArrayEqual), so they are not re-checked here, and field +// metadata is intentionally not compared so records that differ only in metadata // (for example a PARQUET:field_id attached during a round-trip) still compare as -// equal, preserving the pre-existing behavior of these comparison APIs. -func fieldEqualIgnoringMetadata(l, r arrow.Field) bool { - return l.Name == r.Name && l.Nullable == r.Nullable && arrow.TypeEqual(l.Type, r.Type) +// equal, matching the pre-existing behavior of these comparison APIs. +func schemaFieldEqual(l, r arrow.Field) bool { + return l.Name == r.Name && l.Nullable == r.Nullable } // RecordEqual reports whether the two provided records are equal. @@ -50,7 +52,7 @@ func recordEqual(left, right arrow.RecordBatch, opt equalOption) bool { for i := range left.Columns() { lf := left.Schema().Field(i) rf := right.Schema().Field(i) - if !fieldEqualIgnoringMetadata(lf, rf) { + if !schemaFieldEqual(lf, rf) { return false } @@ -82,7 +84,7 @@ func recordApproxEqual(left, right arrow.RecordBatch, opt equalOption) bool { for i := range left.Columns() { lf := left.Schema().Field(i) rf := right.Schema().Field(i) - if !fieldEqualIgnoringMetadata(lf, rf) { + if !schemaFieldEqual(lf, rf) { return false } @@ -205,7 +207,7 @@ func tableEqual(left, right arrow.Table, opt equalOption) bool { for i := 0; int64(i) < left.NumCols(); i++ { lf := left.Schema().Field(i) rf := right.Schema().Field(i) - if !fieldEqualIgnoringMetadata(lf, rf) { + if !schemaFieldEqual(lf, rf) { return false } @@ -236,7 +238,7 @@ func tableApproxEqual(left, right arrow.Table, opt equalOption) bool { for i := 0; int64(i) < left.NumCols(); i++ { lf := left.Schema().Field(i) rf := right.Schema().Field(i) - if !fieldEqualIgnoringMetadata(lf, rf) { + if !schemaFieldEqual(lf, rf) { return false } diff --git a/arrow/ipc/metadata_test.go b/arrow/ipc/metadata_test.go index ac8820dc..63e700b8 100644 --- a/arrow/ipc/metadata_test.go +++ b/arrow/ipc/metadata_test.go @@ -257,12 +257,15 @@ func TestUnrecognizedExtensionType(t *testing.T) { // create a record batch with the same data, but the field should contain the // extension metadata and be of the storage type instead of being the extension type. - extMetadata := arrow.NewMetadata([]string{ExtensionTypeKeyName, ExtensionMetadataKeyName}, []string{"uuid", "uuid-serialized"}) + extMetadata := arrow.NewMetadata([]string{ExtensionTypeKeyName, ExtensionMetadataKeyName}, []string{"arrow.uuid", ""}) batchNoExt := array.NewRecordBatch( arrow.NewSchema([]arrow.Field{ {Name: "f0", Type: storageArr.DataType(), Nullable: true, Metadata: extMetadata}, }, nil), []arrow.Array{storageArr}, 4) defer batchNoExt.Release() + // RecordEqual ignores field metadata, so explicitly verify the unrecognized + // extension metadata is preserved on the read-back field. + assert.Truef(t, rec.Schema().Field(0).Metadata.Equal(extMetadata), "expected metadata %v, got %v", extMetadata, rec.Schema().Field(0).Metadata) assert.Truef(t, array.RecordEqual(rec, batchNoExt), "expected: %s\ngot: %s\n", batchNoExt, rec) } From 03b049f34b62d61163bcba63520e0e27ced3aafb Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Sat, 4 Jul 2026 13:44:04 -0400 Subject: [PATCH 39/39] Avoid breaking arrow.Array with optional NullableMarshaler GetOneForMarshal previously grew a nullable bool parameter on the public arrow.Array interface, breaking external callers and custom Array implementations. Restore the single-argument GetOneForMarshal(i int) and move the field-local nullability behavior to a new optional exported interface, arrow.NullableMarshaler, with GetOneForMarshalNullable(i int, nullable bool). Containers (struct, sparse/dense union, dictionary, run-end-encoded, extension) and RecordToJSON propagate each field's Nullable flag through an unexported getOneForMarshalNullable helper that type-asserts to NullableMarshaler and falls back to GetOneForMarshal for arrays that do not implement it, preserving the previous validity-bitmap behavior. ExtensionArrayBase deliberately does not implement NullableMarshaler so that the optional interface is never promoted onto embedding extension arrays and cannot bypass a concrete type's GetOneForMarshal override. The helper instead handles the one field-local case that matters for plain extension arrays: a null slot in a non-nullable field serializes the storage value rather than JSON null, while still honoring any custom GetOneForMarshal that returns a non-null logical value. --- arrow/array.go | 17 ++++++++- arrow/array/binary.go | 24 +++++++++---- arrow/array/boolean.go | 6 +++- arrow/array/decimal.go | 10 ++++-- arrow/array/decimal128_test.go | 2 +- arrow/array/decimal256_test.go | 2 +- arrow/array/dictionary.go | 11 +++--- arrow/array/encoded.go | 17 ++++++--- arrow/array/extension.go | 9 +++-- arrow/array/fixed_size_list.go | 8 +++-- arrow/array/fixedsize_binary.go | 6 +++- arrow/array/float16.go | 6 +++- arrow/array/interval.go | 22 +++++++++--- arrow/array/json_reader_test.go | 2 +- arrow/array/list.go | 40 ++++++++++++++------- arrow/array/null.go | 6 +++- arrow/array/numeric_generic.go | 44 +++++++++++++++++------ arrow/array/string.go | 22 +++++++++--- arrow/array/struct.go | 12 ++++--- arrow/array/timestamp.go | 8 +++-- arrow/array/union.go | 28 +++++++++------ arrow/array/util.go | 30 +++++++++++++++- arrow/extensions/bool8.go | 6 +++- arrow/extensions/json.go | 8 +++-- arrow/extensions/timestamp_with_offset.go | 6 +++- arrow/extensions/uuid.go | 8 +++-- arrow/extensions/variant.go | 6 +++- 27 files changed, 280 insertions(+), 86 deletions(-) diff --git a/arrow/array.go b/arrow/array.go index 891697e9..09fafe42 100644 --- a/arrow/array.go +++ b/arrow/array.go @@ -111,7 +111,7 @@ type Array interface { ValueStr(i int) string // Get single value to be marshalled with `json.Marshal` - GetOneForMarshal(i int, nullable bool) interface{} + GetOneForMarshal(i int) interface{} Data() ArrayData @@ -128,6 +128,21 @@ type Array interface { Release() } +// NullableMarshaler is an optional interface an Array may implement to control +// whether a value is rendered as JSON null based on the field-local nullability +// from the schema, rather than solely on the array's own validity bitmap. +// +// It exists so containers (struct/union/dictionary/run-end-encoded/extension) +// and RecordToJSON can propagate each field's Nullable flag down to its values +// without a breaking change to the Array interface. Arrays that do not implement +// it fall back to GetOneForMarshal, preserving the previous validity-based behavior. +type NullableMarshaler interface { + // GetOneForMarshalNullable returns the value at i for json.Marshal. When + // nullable is false, a value is returned even if the array's validity bitmap + // marks it null (a non-nullable field must not serialize as null). + GetOneForMarshalNullable(i int, nullable bool) interface{} +} + // ValueType is a generic constraint for valid Arrow primitive types type ValueType interface { bool | FixedWidthType | string | []byte diff --git a/arrow/array/binary.go b/arrow/array/binary.go index ba9ce287..1801d9d6 100644 --- a/arrow/array/binary.go +++ b/arrow/array/binary.go @@ -152,17 +152,21 @@ func (a *Binary) setData(data *Data) { } } -func (a *Binary) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *Binary) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } return a.Value(i) } +func (a *Binary) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *Binary) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i, true) + vals[i] = a.GetOneForMarshal(i) } // golang marshal standard says that []byte will be marshalled // as a base64-encoded string @@ -346,17 +350,21 @@ func (a *LargeBinary) setData(data *Data) { } } -func (a *LargeBinary) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *LargeBinary) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } return a.Value(i) } +func (a *LargeBinary) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *LargeBinary) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i, true) + vals[i] = a.GetOneForMarshal(i) } // golang marshal standard says that []byte will be marshalled // as a base64-encoded string @@ -522,17 +530,21 @@ func (a *BinaryView) ValueStr(i int) string { return base64.StdEncoding.EncodeToString(a.Value(i)) } -func (a *BinaryView) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *BinaryView) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } return a.Value(i) } +func (a *BinaryView) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *BinaryView) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i, true) + vals[i] = a.GetOneForMarshal(i) } // golang marshal standard says that []byte will be marshalled // as a base64-encoded string diff --git a/arrow/array/boolean.go b/arrow/array/boolean.go index c555536f..183d4955 100644 --- a/arrow/array/boolean.go +++ b/arrow/array/boolean.go @@ -90,13 +90,17 @@ func (a *Boolean) setData(data *Data) { } } -func (a *Boolean) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *Boolean) GetOneForMarshalNullable(i int, nullable bool) interface{} { if !nullable || a.IsValid(i) { return a.Value(i) } return nil } +func (a *Boolean) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *Boolean) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { diff --git a/arrow/array/decimal.go b/arrow/array/decimal.go index 3227f66e..269ca290 100644 --- a/arrow/array/decimal.go +++ b/arrow/array/decimal.go @@ -55,7 +55,7 @@ func (a *baseDecimal[T]) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } - return a.GetOneForMarshal(i, true).(string) + return a.GetOneForMarshal(i).(string) } func (a *baseDecimal[T]) Values() []T { return a.values } @@ -89,7 +89,7 @@ func (a *baseDecimal[T]) setData(data *Data) { } } -func (a *baseDecimal[T]) GetOneForMarshal(i int, nullable bool) any { +func (a *baseDecimal[T]) GetOneForMarshalNullable(i int, nullable bool) any { if nullable && a.IsNull(i) { return nil } @@ -99,10 +99,14 @@ func (a *baseDecimal[T]) GetOneForMarshal(i int, nullable bool) any { return n.ToBigFloat(scale).Text('g', int(typ.GetPrecision())) } +func (a *baseDecimal[T]) GetOneForMarshal(i int) any { + return a.GetOneForMarshalNullable(i, true) +} + func (a *baseDecimal[T]) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i, true) + vals[i] = a.GetOneForMarshal(i) } return json.Marshal(vals) } diff --git a/arrow/array/decimal128_test.go b/arrow/array/decimal128_test.go index dcb234a6..e642d037 100644 --- a/arrow/array/decimal128_test.go +++ b/arrow/array/decimal128_test.go @@ -279,7 +279,7 @@ func TestDecimal128GetOneForMarshal(t *testing.T) { } for i := range cases { - assert.Equalf(t, cases[i].want, arr.GetOneForMarshal(i, true), "unexpected value at index %d", i) + assert.Equalf(t, cases[i].want, arr.GetOneForMarshal(i), "unexpected value at index %d", i) } } diff --git a/arrow/array/decimal256_test.go b/arrow/array/decimal256_test.go index 0d6e9f14..b5674253 100644 --- a/arrow/array/decimal256_test.go +++ b/arrow/array/decimal256_test.go @@ -288,6 +288,6 @@ func TestDecimal256GetOneForMarshal(t *testing.T) { } for i := range cases { - assert.Equalf(t, cases[i].want, arr.GetOneForMarshal(i, true), "unexpected value at index %d", i) + assert.Equalf(t, cases[i].want, arr.GetOneForMarshal(i), "unexpected value at index %d", i) } } diff --git a/arrow/array/dictionary.go b/arrow/array/dictionary.go index 96613104..982aa2e3 100644 --- a/arrow/array/dictionary.go +++ b/arrow/array/dictionary.go @@ -286,19 +286,22 @@ func (d *Dictionary) GetValueIndex(i int) int { debug.Assert(false, "unreachable dictionary index") return -1 } - -func (d *Dictionary) GetOneForMarshal(i int, nullable bool) interface{} { +func (d *Dictionary) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && d.IsNull(i) { return nil } vidx := d.GetValueIndex(i) - return d.Dictionary().GetOneForMarshal(vidx, nullable) + return getOneForMarshalNullable(d.Dictionary(), vidx, nullable) +} + +func (d *Dictionary) GetOneForMarshal(i int) interface{} { + return d.GetOneForMarshalNullable(i, true) } func (d *Dictionary) MarshalJSON() ([]byte, error) { vals := make([]any, d.Len()) for i := range d.Len() { - vals[i] = d.GetOneForMarshal(i, true) + vals[i] = d.GetOneForMarshal(i) } return json.Marshal(vals) } diff --git a/arrow/array/encoded.go b/arrow/array/encoded.go index bcfeded7..441bf9de 100644 --- a/arrow/array/encoded.go +++ b/arrow/array/encoded.go @@ -219,13 +219,13 @@ func (r *RunEndEncoded) String() string { buf.WriteByte(',') } - value := r.values.GetOneForMarshal(i, true) + value := r.values.GetOneForMarshal(i) if byts, ok := value.(json.RawMessage); ok { value = string(byts) } var runEnd int - switch e := r.ends.GetOneForMarshal(i, true).(type) { + switch e := r.ends.GetOneForMarshal(i).(type) { case int16: runEnd = int(e) - r.data.offset case int32: @@ -239,9 +239,16 @@ func (r *RunEndEncoded) String() string { buf.WriteByte(']') return buf.String() } +func (r *RunEndEncoded) GetOneForMarshalNullable(i int, nullable bool) interface{} { + // The values child may serialize as JSON null only when both the outer REE + // field and the REE values child (ValueNullable) are nullable; a non-nullable + // outer field must never emit null even if the values child is nullable. + nullable = nullable && r.data.dtype.(*arrow.RunEndEncodedType).ValueNullable + return getOneForMarshalNullable(r.values, r.GetPhysicalIndex(i), nullable) +} -func (r *RunEndEncoded) GetOneForMarshal(i int, nullable bool) interface{} { - return r.values.GetOneForMarshal(r.GetPhysicalIndex(i), nullable) +func (r *RunEndEncoded) GetOneForMarshal(i int) interface{} { + return r.GetOneForMarshalNullable(i, true) } func (r *RunEndEncoded) MarshalJSON() ([]byte, error) { @@ -252,7 +259,7 @@ func (r *RunEndEncoded) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(r.GetOneForMarshal(i, true)); err != nil { + if err := enc.Encode(r.GetOneForMarshal(i)); err != nil { return nil, err } } diff --git a/arrow/array/extension.go b/arrow/array/extension.go index 48c4f03a..a74f6dee 100644 --- a/arrow/array/extension.go +++ b/arrow/array/extension.go @@ -116,8 +116,13 @@ func (e *ExtensionArrayBase) String() string { return fmt.Sprintf("(%s)%s", e.data.dtype, e.storage) } -func (e *ExtensionArrayBase) GetOneForMarshal(i int, nullable bool) interface{} { - return e.storage.GetOneForMarshal(i, nullable) +// GetOneForMarshal returns the value at i from the underlying storage array. +// ExtensionArrayBase deliberately does not implement arrow.NullableMarshaler: +// doing so would promote a nullable-aware method onto every embedding extension +// array and bypass a concrete type's own GetOneForMarshal override. Field-local +// nullability for plain extension arrays is handled by the marshaling helper. +func (e *ExtensionArrayBase) GetOneForMarshal(i int) interface{} { + return e.storage.GetOneForMarshal(i) } func (e *ExtensionArrayBase) MarshalJSON() ([]byte, error) { diff --git a/arrow/array/fixed_size_list.go b/arrow/array/fixed_size_list.go index d4ddfe8f..07fbe441 100644 --- a/arrow/array/fixed_size_list.go +++ b/arrow/array/fixed_size_list.go @@ -51,7 +51,7 @@ func (a *FixedSizeList) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } - return string(a.GetOneForMarshal(i, true).(json.RawMessage)) + return string(a.GetOneForMarshal(i).(json.RawMessage)) } func (a *FixedSizeList) String() string { @@ -124,7 +124,7 @@ func (a *FixedSizeList) Release() { a.values.Release() } -func (a *FixedSizeList) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *FixedSizeList) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } @@ -138,6 +138,10 @@ func (a *FixedSizeList) GetOneForMarshal(i int, nullable bool) interface{} { return json.RawMessage(v) } +func (a *FixedSizeList) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *FixedSizeList) MarshalJSON() ([]byte, error) { var buf bytes.Buffer enc := json.NewEncoder(&buf) diff --git a/arrow/array/fixedsize_binary.go b/arrow/array/fixedsize_binary.go index b9cf2bfd..11dc7615 100644 --- a/arrow/array/fixedsize_binary.go +++ b/arrow/array/fixedsize_binary.go @@ -86,7 +86,7 @@ func (a *FixedSizeBinary) setData(data *Data) { } } -func (a *FixedSizeBinary) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *FixedSizeBinary) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } @@ -94,6 +94,10 @@ func (a *FixedSizeBinary) GetOneForMarshal(i int, nullable bool) interface{} { return a.Value(i) } +func (a *FixedSizeBinary) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *FixedSizeBinary) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { diff --git a/arrow/array/float16.go b/arrow/array/float16.go index 8536df40..8923eb43 100644 --- a/arrow/array/float16.go +++ b/arrow/array/float16.go @@ -77,13 +77,17 @@ func (a *Float16) setData(data *Data) { } } -func (a *Float16) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *Float16) GetOneForMarshalNullable(i int, nullable bool) interface{} { if !nullable || a.IsValid(i) { return a.values[i].Float32() } return nil } +func (a *Float16) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *Float16) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i, v := range a.values { diff --git a/arrow/array/interval.go b/arrow/array/interval.go index b2aad56f..076a7a9d 100644 --- a/arrow/array/interval.go +++ b/arrow/array/interval.go @@ -94,13 +94,17 @@ func (a *MonthInterval) setData(data *Data) { } } -func (a *MonthInterval) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *MonthInterval) GetOneForMarshalNullable(i int, nullable bool) interface{} { if !nullable || a.IsValid(i) { return a.values[i] } return nil } +func (a *MonthInterval) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + // MarshalJSON will create a json array out of a MonthInterval array, // each value will be an object of the form {"months": #} where // # is the numeric value of that index @@ -361,7 +365,7 @@ func (a *DayTimeInterval) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } - data, err := json.Marshal(a.GetOneForMarshal(i, true)) + data, err := json.Marshal(a.GetOneForMarshal(i)) if err != nil { panic(err) } @@ -399,13 +403,17 @@ func (a *DayTimeInterval) setData(data *Data) { } } -func (a *DayTimeInterval) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *DayTimeInterval) GetOneForMarshalNullable(i int, nullable bool) interface{} { if !nullable || a.IsValid(i) { return a.values[i] } return nil } +func (a *DayTimeInterval) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + // MarshalJSON will marshal this array to JSON as an array of objects, // consisting of the form {"days": #, "milliseconds": #} for each element. func (a *DayTimeInterval) MarshalJSON() ([]byte, error) { @@ -663,7 +671,7 @@ func (a *MonthDayNanoInterval) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr } - data, err := json.Marshal(a.GetOneForMarshal(i, true)) + data, err := json.Marshal(a.GetOneForMarshal(i)) if err != nil { panic(err) } @@ -703,13 +711,17 @@ func (a *MonthDayNanoInterval) setData(data *Data) { } } -func (a *MonthDayNanoInterval) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *MonthDayNanoInterval) GetOneForMarshalNullable(i int, nullable bool) interface{} { if !nullable || a.IsValid(i) { return a.values[i] } return nil } +func (a *MonthDayNanoInterval) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + // MarshalJSON will marshal this array to a JSON array with elements // marshalled to the form {"months": #, "days": #, "nanoseconds": #} func (a *MonthDayNanoInterval) MarshalJSON() ([]byte, error) { diff --git a/arrow/array/json_reader_test.go b/arrow/array/json_reader_test.go index fc7cedea..3d0def65 100644 --- a/arrow/array/json_reader_test.go +++ b/arrow/array/json_reader_test.go @@ -253,7 +253,7 @@ func recordBatchToNDJSON(t *testing.T, rec arrow.RecordBatch) string { defer arr.Release() for pos := range arr.Len() { - s, err := json.Marshal(arr.GetOneForMarshal(pos, true)) + s, err := json.Marshal(arr.GetOneForMarshal(pos)) assert.NoError(t, err) sb.Write(s) sb.WriteByte('\n') diff --git a/arrow/array/list.go b/arrow/array/list.go index a777eab6..7e634824 100644 --- a/arrow/array/list.go +++ b/arrow/array/list.go @@ -61,7 +61,7 @@ func (a *List) ValueStr(i int) string { if !a.IsValid(i) { return NullValueStr } - return string(a.GetOneForMarshal(i, true).(json.RawMessage)) + return string(a.GetOneForMarshal(i).(json.RawMessage)) } func (a *List) String() string { @@ -98,7 +98,7 @@ func (a *List) setData(data *Data) { a.values = MakeFromData(data.childData[0]) } -func (a *List) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *List) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } @@ -112,6 +112,10 @@ func (a *List) GetOneForMarshal(i int, nullable bool) interface{} { return json.RawMessage(v) } +func (a *List) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *List) MarshalJSON() ([]byte, error) { var buf bytes.Buffer enc := json.NewEncoder(&buf) @@ -121,7 +125,7 @@ func (a *List) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { return nil, err } } @@ -194,7 +198,7 @@ func (a *LargeList) ValueStr(i int) string { if !a.IsValid(i) { return NullValueStr } - return string(a.GetOneForMarshal(i, true).(json.RawMessage)) + return string(a.GetOneForMarshal(i).(json.RawMessage)) } func (a *LargeList) String() string { @@ -231,7 +235,7 @@ func (a *LargeList) setData(data *Data) { a.values = MakeFromData(data.childData[0]) } -func (a *LargeList) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *LargeList) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } @@ -245,6 +249,10 @@ func (a *LargeList) GetOneForMarshal(i int, nullable bool) interface{} { return json.RawMessage(v) } +func (a *LargeList) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *LargeList) MarshalJSON() ([]byte, error) { var buf bytes.Buffer enc := json.NewEncoder(&buf) @@ -254,7 +262,7 @@ func (a *LargeList) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { return nil, err } } @@ -666,7 +674,7 @@ func (a *ListView) ValueStr(i int) string { if !a.IsValid(i) { return NullValueStr } - return string(a.GetOneForMarshal(i, true).(json.RawMessage)) + return string(a.GetOneForMarshal(i).(json.RawMessage)) } func (a *ListView) String() string { @@ -707,7 +715,7 @@ func (a *ListView) setData(data *Data) { a.values = MakeFromData(data.childData[0]) } -func (a *ListView) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *ListView) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } @@ -721,6 +729,10 @@ func (a *ListView) GetOneForMarshal(i int, nullable bool) interface{} { return json.RawMessage(v) } +func (a *ListView) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *ListView) MarshalJSON() ([]byte, error) { var buf bytes.Buffer enc := json.NewEncoder(&buf) @@ -730,7 +742,7 @@ func (a *ListView) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { return nil, err } } @@ -814,7 +826,7 @@ func (a *LargeListView) ValueStr(i int) string { if !a.IsValid(i) { return NullValueStr } - return string(a.GetOneForMarshal(i, true).(json.RawMessage)) + return string(a.GetOneForMarshal(i).(json.RawMessage)) } func (a *LargeListView) String() string { @@ -855,7 +867,7 @@ func (a *LargeListView) setData(data *Data) { a.values = MakeFromData(data.childData[0]) } -func (a *LargeListView) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *LargeListView) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } @@ -869,6 +881,10 @@ func (a *LargeListView) GetOneForMarshal(i int, nullable bool) interface{} { return json.RawMessage(v) } +func (a *LargeListView) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *LargeListView) MarshalJSON() ([]byte, error) { var buf bytes.Buffer enc := json.NewEncoder(&buf) @@ -878,7 +894,7 @@ func (a *LargeListView) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { return nil, err } } diff --git a/arrow/array/null.go b/arrow/array/null.go index 01001776..bdb4ee22 100644 --- a/arrow/array/null.go +++ b/arrow/array/null.go @@ -80,10 +80,14 @@ func (a *Null) setData(data *Data) { a.data.nulls = a.data.length } -func (a *Null) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *Null) GetOneForMarshalNullable(i int, nullable bool) interface{} { return nil } +func (a *Null) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *Null) MarshalJSON() ([]byte, error) { return json.Marshal(make([]interface{}, a.Len())) } diff --git a/arrow/array/numeric_generic.go b/arrow/array/numeric_generic.go index c7616cf2..5abade6d 100644 --- a/arrow/array/numeric_generic.go +++ b/arrow/array/numeric_generic.go @@ -83,7 +83,7 @@ func (a *numericArray[T]) ValueStr(i int) string { return fmt.Sprintf("%v", a.values[i]) } -func (a *numericArray[T]) GetOneForMarshal(i int, nullable bool) any { +func (a *numericArray[T]) GetOneForMarshalNullable(i int, nullable bool) any { if nullable && a.IsNull(i) { return nil } @@ -91,6 +91,10 @@ func (a *numericArray[T]) GetOneForMarshal(i int, nullable bool) any { return a.values[i] } +func (a *numericArray[T]) GetOneForMarshal(i int) any { + return a.GetOneForMarshalNullable(i, true) +} + func (a *numericArray[T]) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := range a.Len() { @@ -107,7 +111,7 @@ type oneByteArrs[T int8 | uint8] struct { numericArray[T] } -func (a *oneByteArrs[T]) GetOneForMarshal(i int, nullable bool) any { +func (a *oneByteArrs[T]) GetOneForMarshalNullable(i int, nullable bool) any { if nullable && a.IsNull(i) { return nil } @@ -115,6 +119,10 @@ func (a *oneByteArrs[T]) GetOneForMarshal(i int, nullable bool) any { return float64(a.values[i]) // prevent uint8/int8 from being seen as binary data } +func (a *oneByteArrs[T]) GetOneForMarshal(i int) any { + return a.GetOneForMarshalNullable(i, true) +} + func (a *oneByteArrs[T]) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := range a.Len() { @@ -141,7 +149,7 @@ func (a *floatArray[T]) ValueStr(i int) string { return strconv.FormatFloat(float64(a.Value(i)), 'g', -1, bitWidth) } -func (a *floatArray[T]) GetOneForMarshal(i int, nullable bool) any { +func (a *floatArray[T]) GetOneForMarshalNullable(i int, nullable bool) any { if nullable && a.IsNull(i) { return nil } @@ -157,10 +165,14 @@ func (a *floatArray[T]) GetOneForMarshal(i int, nullable bool) any { } } +func (a *floatArray[T]) GetOneForMarshal(i int) any { + return a.GetOneForMarshalNullable(i, true) +} + func (a *floatArray[T]) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := range a.values { - vals[i] = a.GetOneForMarshal(i, true) + vals[i] = a.GetOneForMarshal(i) } return json.Marshal(vals) } @@ -176,7 +188,7 @@ type dateArray[T interface { func (d *dateArray[T]) MarshalJSON() ([]byte, error) { vals := make([]any, d.Len()) for i := range d.values { - vals[i] = d.GetOneForMarshal(i, true) + vals[i] = d.GetOneForMarshal(i) } return json.Marshal(vals) } @@ -189,7 +201,7 @@ func (d *dateArray[T]) ValueStr(i int) string { return d.values[i].FormattedString() } -func (d *dateArray[T]) GetOneForMarshal(i int, nullable bool) interface{} { +func (d *dateArray[T]) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && d.IsNull(i) { return nil } @@ -197,6 +209,10 @@ func (d *dateArray[T]) GetOneForMarshal(i int, nullable bool) interface{} { return d.values[i].FormattedString() } +func (d *dateArray[T]) GetOneForMarshal(i int) interface{} { + return d.GetOneForMarshalNullable(i, true) +} + type timeType interface { TimeUnit() arrow.TimeUnit } @@ -212,7 +228,7 @@ type timeArray[T interface { func (a *timeArray[T]) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := range a.values { - vals[i] = a.GetOneForMarshal(i, true) + vals[i] = a.GetOneForMarshal(i) } return json.Marshal(vals) } @@ -225,7 +241,7 @@ func (a *timeArray[T]) ValueStr(i int) string { return a.values[i].FormattedString(a.DataType().(timeType).TimeUnit()) } -func (a *timeArray[T]) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *timeArray[T]) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } @@ -233,6 +249,10 @@ func (a *timeArray[T]) GetOneForMarshal(i int, nullable bool) interface{} { return a.values[i].ToTime(a.DataType().(timeType).TimeUnit()).Format("15:04:05.999999999") } +func (a *timeArray[T]) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + type Duration struct { numericArray[arrow.Duration] } @@ -248,7 +268,7 @@ func (a *Duration) DurationValues() []arrow.Duration { return a.Values() } func (a *Duration) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := range a.values { - vals[i] = a.GetOneForMarshal(i, true) + vals[i] = a.GetOneForMarshal(i) } return json.Marshal(vals) } @@ -261,13 +281,17 @@ func (a *Duration) ValueStr(i int) string { return fmt.Sprintf("%d%s", a.values[i], a.DataType().(timeType).TimeUnit()) } -func (a *Duration) GetOneForMarshal(i int, nullable bool) any { +func (a *Duration) GetOneForMarshalNullable(i int, nullable bool) any { if nullable && a.IsNull(i) { return nil } return fmt.Sprintf("%d%s", a.values[i], a.DataType().(timeType).TimeUnit()) } +func (a *Duration) GetOneForMarshal(i int) any { + return a.GetOneForMarshalNullable(i, true) +} + type Int64 struct { numericArray[int64] } diff --git a/arrow/array/string.go b/arrow/array/string.go index 0133f7a9..5cc9fb78 100644 --- a/arrow/array/string.go +++ b/arrow/array/string.go @@ -154,13 +154,17 @@ func (a *String) setData(data *Data) { } } -func (a *String) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *String) GetOneForMarshalNullable(i int, nullable bool) interface{} { if !nullable || a.IsValid(i) { return a.Value(i) } return nil } +func (a *String) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *String) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { @@ -362,17 +366,21 @@ func (a *LargeString) setData(data *Data) { } } -func (a *LargeString) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *LargeString) GetOneForMarshalNullable(i int, nullable bool) interface{} { if !nullable || a.IsValid(i) { return a.Value(i) } return nil } +func (a *LargeString) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *LargeString) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i, true) + vals[i] = a.GetOneForMarshal(i) } return json.Marshal(vals) } @@ -526,17 +534,21 @@ func (a *StringView) ValueStr(i int) string { return a.Value(i) } -func (a *StringView) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *StringView) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } return a.Value(i) } +func (a *StringView) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *StringView) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := 0; i < a.Len(); i++ { - vals[i] = a.GetOneForMarshal(i, true) + vals[i] = a.GetOneForMarshal(i) } return json.Marshal(vals) } diff --git a/arrow/array/struct.go b/arrow/array/struct.go index 17f6152a..5579158d 100644 --- a/arrow/array/struct.go +++ b/arrow/array/struct.go @@ -130,7 +130,7 @@ func (a *Struct) ValueStr(i int) string { return NullValueStr } - data, err := json.Marshal(a.GetOneForMarshal(i, true)) + data, err := json.Marshal(a.GetOneForMarshal(i)) if err != nil { panic(err) } @@ -209,7 +209,7 @@ func (a *Struct) setData(data *Data) { } } -func (a *Struct) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *Struct) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } @@ -218,11 +218,15 @@ func (a *Struct) GetOneForMarshal(i int, nullable bool) interface{} { dtype := a.data.dtype.(*arrow.StructType) fieldList := dtype.Fields() for j, d := range a.fields { - tmp[fieldList[j].Name] = d.GetOneForMarshal(i, dtype.Field(j).Nullable) + tmp[fieldList[j].Name] = getOneForMarshalNullable(d, i, dtype.Field(j).Nullable) } return tmp } +func (a *Struct) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *Struct) MarshalJSON() ([]byte, error) { var buf bytes.Buffer enc := json.NewEncoder(&buf) @@ -232,7 +236,7 @@ func (a *Struct) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { return nil, err } } diff --git a/arrow/array/timestamp.go b/arrow/array/timestamp.go index 613941f7..e2897fac 100644 --- a/arrow/array/timestamp.go +++ b/arrow/array/timestamp.go @@ -109,17 +109,21 @@ func (a *Timestamp) ValueStr(i int) string { return toTime(a.values[i]).Format(layout) } -func (a *Timestamp) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *Timestamp) GetOneForMarshalNullable(i int, nullable bool) interface{} { if val := a.ValueStr(i); !nullable || val != NullValueStr { return val } return nil } +func (a *Timestamp) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + func (a *Timestamp) MarshalJSON() ([]byte, error) { vals := make([]interface{}, a.Len()) for i := range a.values { - vals[i] = a.GetOneForMarshal(i, true) + vals[i] = a.GetOneForMarshal(i) } return json.Marshal(vals) diff --git a/arrow/array/union.go b/arrow/array/union.go index c10e9964..8e94900c 100644 --- a/arrow/array/union.go +++ b/arrow/array/union.go @@ -320,7 +320,7 @@ func (a *SparseUnion) setData(data *Data) { debug.Assert(a.data.buffers[0] == nil, "arrow/array: validity bitmap for sparse unions should be nil") } -func (a *SparseUnion) GetOneForMarshal(i int, nullable bool) any { +func (a *SparseUnion) GetOneForMarshalNullable(i int, nullable bool) any { typeID := a.RawTypeCodes()[i] childID := a.ChildID(i) @@ -331,7 +331,11 @@ func (a *SparseUnion) GetOneForMarshal(i int, nullable bool) any { return []any{typeID, nil} } - return []any{typeID, data.GetOneForMarshal(i, childNullable)} + return []any{typeID, getOneForMarshalNullable(data, i, childNullable)} +} + +func (a *SparseUnion) GetOneForMarshal(i int) any { + return a.GetOneForMarshalNullable(i, true) } func (a *SparseUnion) MarshalJSON() ([]byte, error) { @@ -343,7 +347,7 @@ func (a *SparseUnion) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { return nil, err } } @@ -356,7 +360,7 @@ func (a *SparseUnion) ValueStr(i int) string { return NullValueStr } - val := a.GetOneForMarshal(i, true) + val := a.GetOneForMarshal(i) if val == nil { // child is nil return NullValueStr @@ -381,7 +385,7 @@ func (a *SparseUnion) String() string { field := fieldList[a.ChildID(i)] f := a.Field(a.ChildID(i)) - fmt.Fprintf(&b, "{%s=%v}", field.Name, f.GetOneForMarshal(i, true)) + fmt.Fprintf(&b, "{%s=%v}", field.Name, f.GetOneForMarshal(i)) } b.WriteByte(']') return b.String() @@ -615,7 +619,7 @@ func (a *DenseUnion) setData(data *Data) { } } -func (a *DenseUnion) GetOneForMarshal(i int, nullable bool) any { +func (a *DenseUnion) GetOneForMarshalNullable(i int, nullable bool) any { typeID := a.RawTypeCodes()[i] childID := a.ChildID(i) @@ -627,7 +631,11 @@ func (a *DenseUnion) GetOneForMarshal(i int, nullable bool) any { return []any{typeID, nil} } - return []any{typeID, data.GetOneForMarshal(offset, childNullable)} + return []any{typeID, getOneForMarshalNullable(data, offset, childNullable)} +} + +func (a *DenseUnion) GetOneForMarshal(i int) any { + return a.GetOneForMarshalNullable(i, true) } func (a *DenseUnion) MarshalJSON() ([]byte, error) { @@ -639,7 +647,7 @@ func (a *DenseUnion) MarshalJSON() ([]byte, error) { if i != 0 { buf.WriteByte(',') } - if err := enc.Encode(a.GetOneForMarshal(i, true)); err != nil { + if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { return nil, err } } @@ -652,7 +660,7 @@ func (a *DenseUnion) ValueStr(i int) string { return NullValueStr } - val := a.GetOneForMarshal(i, true) + val := a.GetOneForMarshal(i) if val == nil { // child in nil return NullValueStr @@ -679,7 +687,7 @@ func (a *DenseUnion) String() string { field := fieldList[a.ChildID(i)] f := a.Field(a.ChildID(i)) - fmt.Fprintf(&b, "{%s=%v}", field.Name, f.GetOneForMarshal(int(offsets[i]), true)) + fmt.Fprintf(&b, "{%s=%v}", field.Name, f.GetOneForMarshal(int(offsets[i]))) } b.WriteByte(']') return b.String() diff --git a/arrow/array/util.go b/arrow/array/util.go index b3375152..0461e171 100644 --- a/arrow/array/util.go +++ b/arrow/array/util.go @@ -284,7 +284,7 @@ func RecordToJSON(rec arrow.RecordBatch, w io.Writer) error { cols := make(map[string]interface{}) for i := 0; int64(i) < rec.NumRows(); i++ { for j, c := range rec.Columns() { - cols[fields[j].Name] = c.GetOneForMarshal(i, rec.Schema().Field(j).Nullable) + cols[fields[j].Name] = getOneForMarshalNullable(c, i, rec.Schema().Field(j).Nullable) } if err := enc.Encode(cols); err != nil { return err @@ -293,6 +293,34 @@ func RecordToJSON(rec arrow.RecordBatch, w io.Writer) error { return nil } +// getOneForMarshalNullable dispatches to an Array's field-local-nullability-aware +// marshaler when it implements arrow.NullableMarshaler, otherwise it falls back to +// the plain GetOneForMarshal (which decides null purely from the validity bitmap). +// This lets containers propagate a field's Nullable flag without every Array having +// to implement the optional interface. +func getOneForMarshalNullable(a arrow.Array, i int, nullable bool) interface{} { + if nm, ok := a.(arrow.NullableMarshaler); ok { + return nm.GetOneForMarshalNullable(i, nullable) + } + + // ExtensionArrayBase intentionally does not implement arrow.NullableMarshaler + // (see extension.go), so extension arrays reach here. Preserve the extension's + // own logical JSON by default; the only field-local nullability we can safely + // add is that a null slot in a non-nullable field must not serialize as null. + // Ask the extension first (honoring any custom GetOneForMarshal override); only + // if it would emit null do we fall back to the storage value. Plain wrapper + // extensions (e.g. Parametric*Array) return nil at a null slot and thus round + // trip through storage, without bypassing a custom representation. + if ext, ok := a.(ExtensionArray); ok && !nullable && a.IsNull(i) { + if v := a.GetOneForMarshal(i); v != nil { + return v + } + return getOneForMarshalNullable(ext.Storage(), i, false) + } + + return a.GetOneForMarshal(i) +} + func TableFromJSON(mem memory.Allocator, sc *arrow.Schema, recJSON []string, opt ...FromJSONOption) (arrow.Table, error) { batches := make([]arrow.RecordBatch, len(recJSON)) for i, batchJSON := range recJSON { diff --git a/arrow/extensions/bool8.go b/arrow/extensions/bool8.go index aaf51f0c..8f784a04 100644 --- a/arrow/extensions/bool8.go +++ b/arrow/extensions/bool8.go @@ -114,13 +114,17 @@ func (a *Bool8Array) MarshalJSON() ([]byte, error) { return json.Marshal(values) } -func (a *Bool8Array) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *Bool8Array) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } return a.Value(i) } +func (a *Bool8Array) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + // boolToInt8 performs the simple scalar conversion of bool to the canonical int8 // value for the Bool8Type. func boolToInt8(v bool) int8 { diff --git a/arrow/extensions/json.go b/arrow/extensions/json.go index b9cc50a7..4b7194f9 100644 --- a/arrow/extensions/json.go +++ b/arrow/extensions/json.go @@ -118,7 +118,7 @@ func (a *JSONArray) ValueBytes(i int) []byte { func (a *JSONArray) valueJSON(i int, nullable bool) json.RawMessage { var val json.RawMessage - if a.IsValid(i) { + if !nullable || a.IsValid(i) { val = json.RawMessage(a.Storage().(array.StringLike).Value(i)) } return val @@ -142,10 +142,14 @@ func (a *JSONArray) MarshalJSON() ([]byte, error) { } // GetOneForMarshal implements arrow.Array. -func (a *JSONArray) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *JSONArray) GetOneForMarshalNullable(i int, nullable bool) interface{} { return a.valueJSON(i, nullable) } +func (a *JSONArray) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + var ( _ arrow.ExtensionType = (*JSONType)(nil) _ array.ExtensionArray = (*JSONArray)(nil) diff --git a/arrow/extensions/timestamp_with_offset.go b/arrow/extensions/timestamp_with_offset.go index 0f85a20a..e87fa12d 100644 --- a/arrow/extensions/timestamp_with_offset.go +++ b/arrow/extensions/timestamp_with_offset.go @@ -375,13 +375,17 @@ func (a *TimestampWithOffsetArray) MarshalJSON() ([]byte, error) { return json.Marshal(values) } -func (a *TimestampWithOffsetArray) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *TimestampWithOffsetArray) GetOneForMarshalNullable(i int, nullable bool) interface{} { if nullable && a.IsNull(i) { return nil } return a.Value(i) } +func (a *TimestampWithOffsetArray) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + // noLastOffset is the sentinel value for TimestampWithOffsetBuilder.lastOffset // indicating that no run-end-encoded run has been started yet. It is deliberately // outside the range of valid timezone offsets in minutes (roughly [-720, 840]) so diff --git a/arrow/extensions/uuid.go b/arrow/extensions/uuid.go index cbe56ecd..cbe4d7c2 100644 --- a/arrow/extensions/uuid.go +++ b/arrow/extensions/uuid.go @@ -190,18 +190,22 @@ func (a *UUIDArray) ValueStr(i int) string { func (a *UUIDArray) MarshalJSON() ([]byte, error) { vals := make([]any, a.Len()) for i := range vals { - vals[i] = a.GetOneForMarshal(i, true) + vals[i] = a.GetOneForMarshal(i) } return json.Marshal(vals) } -func (a *UUIDArray) GetOneForMarshal(i int, nullable bool) interface{} { +func (a *UUIDArray) GetOneForMarshalNullable(i int, nullable bool) interface{} { if !nullable || a.IsValid(i) { return a.Value(i) } return nil } +func (a *UUIDArray) GetOneForMarshal(i int) interface{} { + return a.GetOneForMarshalNullable(i, true) +} + // UUIDType is a simple extension type that represents a FixedSizeBinary(16) // to be used for representing UUIDs type UUIDType struct { diff --git a/arrow/extensions/variant.go b/arrow/extensions/variant.go index 1f41ef07..e2123485 100644 --- a/arrow/extensions/variant.go +++ b/arrow/extensions/variant.go @@ -599,7 +599,7 @@ func (v *VariantArray) MarshalJSON() ([]byte, error) { return json.Marshal(values) } -func (v *VariantArray) GetOneForMarshal(i int, nullable bool) any { +func (v *VariantArray) GetOneForMarshalNullable(i int, nullable bool) any { if nullable && v.IsNull(i) { return nil } @@ -612,6 +612,10 @@ func (v *VariantArray) GetOneForMarshal(i int, nullable bool) any { return val.Value() } +func (v *VariantArray) GetOneForMarshal(i int) any { + return v.GetOneForMarshalNullable(i, true) +} + type variantReader interface { IsNull(i int) bool Value(i int) (variant.Value, error)