From 0f737cb32cff40ce82f710dc6665b8d293be57c0 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 2 Mar 2024 09:54:03 -0700 Subject: [PATCH 1/7] code compiles --- Cargo.lock | 130 ++++++++++++++++++++++++++++------------ Cargo.toml | 14 ++--- src/common/data_type.rs | 12 +++- src/expr.rs | 126 ++++++++++++++++++++++++-------------- src/functions.rs | 19 ------ src/udaf.rs | 4 +- 6 files changed, 191 insertions(+), 114 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e6aa6050a..037035fe8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -107,8 +107,8 @@ dependencies = [ "serde", "serde_json", "snap", - "strum", - "strum_macros", + "strum 0.25.0", + "strum_macros 0.25.3", "thiserror", "typed-builder", "uuid", @@ -544,15 +544,15 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +checksum = "5bc015644b92d5890fab7489e49d21f879d5c990186827d42ec511919404f38b" dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", "serde", - "windows-targets 0.48.5", + "windows-targets 0.52.0", ] [[package]] @@ -583,8 +583,8 @@ version = "7.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c64043d6c7b7a4c58e39e7efccfdea7b93d885a795d0c054a69dbbf4dd52686" dependencies = [ - "strum", - "strum_macros", + "strum 0.25.0", + "strum_macros 0.25.3", "unicode-width", ] @@ -715,9 +715,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "35.0.0" +version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4328f5467f76d890fe3f924362dbc3a838c6a733f762b32d87f9e0b7bef5fb49" +checksum = "b2b360b692bf6c6d6e6b6dbaf41a3be0020daeceac0f406aed54c75331e50dbb" dependencies = [ "ahash", "apache-avro", @@ -734,6 +734,8 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", + "datafusion-functions", + "datafusion-functions-array", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-plan", @@ -765,9 +767,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "35.0.0" +version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29a7752143b446db4a2cccd9a6517293c6b97e8c39e520ca43ccd07135a4f7e" +checksum = "37f343ccc298f440e25aa38ff82678291a7acc24061c7370ba6c0ff5cc811412" dependencies = [ "ahash", "apache-avro", @@ -787,9 +789,9 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "35.0.0" +version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d447650af16e138c31237f53ddaef6dd4f92f0e2d3f2f35d190e16c214ca496" +checksum = "3f9c93043081487e335399a21ebf8295626367a647ac5cb87d41d18afad7d0f7" dependencies = [ "arrow", "chrono", @@ -808,9 +810,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "35.0.0" +version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8d19598e48a498850fb79f97a9719b1f95e7deb64a7a06f93f313e8fa1d524b" +checksum = "e204d89909e678846b6a95f156aafc1ee5b36cb6c9e37ec2e1449b078a38c818" dependencies = [ "ahash", "arrow", @@ -818,15 +820,44 @@ dependencies = [ "datafusion-common", "paste", "sqlparser", - "strum", - "strum_macros", + "strum 0.26.1", + "strum_macros 0.26.1", +] + +[[package]] +name = "datafusion-functions" +version = "36.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98f1c73f7801b2b8ba2297b3ad78ffcf6c1fc6b8171f502987eb9ad5cb244ee7" +dependencies = [ + "arrow", + "base64", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "hex", + "log", +] + +[[package]] +name = "datafusion-functions-array" +version = "36.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d16a0ddf2c991526f6ffe2f47a72c6da0b7354d6c32411dd20631fe2e38937" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "log", + "paste", ] [[package]] name = "datafusion-optimizer" -version = "35.0.0" +version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b7feb0391f1fc75575acb95b74bfd276903dc37a5409fcebe160bc7ddff2010" +checksum = "5ae27e07bf1f04d327be5c2a293470879801ab5535204dc3b16b062fda195496" dependencies = [ "arrow", "async-trait", @@ -842,9 +873,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "35.0.0" +version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e911bca609c89a54e8f014777449d8290327414d3e10c57a3e3c2122e38878d0" +checksum = "dde620cd9ef76a3bca9c754fb68854bd2349c49f55baf97e08001f9e967f6d6b" dependencies = [ "ahash", "arrow", @@ -852,11 +883,13 @@ dependencies = [ "arrow-buffer", "arrow-ord", "arrow-schema", + "arrow-string", "base64", "blake2", "blake3", "chrono", "datafusion-common", + "datafusion-execution", "datafusion-expr", "half", "hashbrown 0.14.3", @@ -876,9 +909,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "35.0.0" +version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e96b546b8a02e9c2ab35ac6420d511f12a4701950c1eb2e568c122b4fefb0be3" +checksum = "9a4c75fba9ea99d64b2246cbd2fcae2e6fc973e6616b1015237a616036506dd4" dependencies = [ "ahash", "arrow", @@ -907,7 +940,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "35.0.0" +version = "36.0.0" dependencies = [ "async-trait", "datafusion", @@ -934,9 +967,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "35.0.0" +version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d18d36f260bbbd63aafdb55339213a23d540d3419810575850ef0a798a6b768" +checksum = "21474a95c3a62d113599d21b439fa15091b538bac06bd20be0bb2e7d22903c09" dependencies = [ "arrow", "arrow-schema", @@ -948,9 +981,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "35.0.0" +version = "36.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dad6bef77af3d8a697ae63ffbcb5aa66b74cd08ea93a31e2e757da75b2f1452f" +checksum = "aab89c01ef66a59ec92d2360db63893224b4f7e085e2ee6351e0bb77f88931f0" dependencies = [ "async-recursion", "chrono", @@ -1181,9 +1214,9 @@ checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "git2" -version = "0.18.1" +version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbf97ba92db08df386e10c8ede66a2a0369bd277090afd8710e19e38de9ec0cd" +checksum = "1b3ba52851e73b46a4c3df1d89343741112003f0f6f13beb0dfac9e457c3fdcd" dependencies = [ "bitflags 2.4.2", "libc", @@ -1557,9 +1590,9 @@ dependencies = [ [[package]] name = "libgit2-sys" -version = "0.16.1+1.7.1" +version = "0.16.2+1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2a2bb3680b094add03bb3732ec520ece34da31a8cd2d633d1389d0f0fb60d0c" +checksum = "ee4126d8b4ee5c9d9ea891dd875cfdc1e9d0950437179104b183d7d8a74d24e8" dependencies = [ "cc", "libc", @@ -2662,9 +2695,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.41.0" +version = "0.43.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cc2c25a6c66789625ef164b4c7d2e548d627902280c13710d33da8222169964" +checksum = "f95c4bae5aba7cd30bd506f7140026ade63cff5afd778af8854026f9606bf5d4" dependencies = [ "log", "sqlparser_derive", @@ -2692,8 +2725,14 @@ name = "strum" version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" + +[[package]] +name = "strum" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "723b93e8addf9aa965ebe2d11da6d7540fa2283fcea14b3371ff055f7ba13f5f" dependencies = [ - "strum_macros", + "strum_macros 0.26.1", ] [[package]] @@ -2709,11 +2748,24 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "strum_macros" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a3417fc93d76740d974a01654a09777cb500428cc874ca9f45edfe0c4d4cd18" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.48", +] + [[package]] name = "substrait" -version = "0.22.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5478fbd0313a9b0915a1c0e7ebf15b5fed7d7c6dd7229b4f5e32ce75b10f256a" +checksum = "2c8ffb7a3e7505bb835513e77ebfe67d359e57d684a5972323e3bdefbecc1f25" dependencies = [ "git2", "heck", @@ -2857,9 +2909,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.35.1" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 6c47484b3..ce8cb06d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "35.0.0" +version = "36.0.0" homepage = "https://github.com/apache/arrow-datafusion-python" repository = "https://github.com/apache/arrow-datafusion-python" authors = ["Apache Arrow "] @@ -37,12 +37,12 @@ substrait = ["dep:datafusion-substrait"] tokio = { version = "1.35", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.8" pyo3 = { version = "0.20", features = ["extension-module", "abi3", "abi3-py38"] } -datafusion = { version = "35.0.0", features = ["pyarrow", "avro"] } -datafusion-common = { version = "35.0.0", features = ["pyarrow"] } -datafusion-expr = "35.0.0" -datafusion-optimizer = "35.0.0" -datafusion-sql = "35.0.0" -datafusion-substrait = { version = "35.0.0", optional = true } +datafusion = { version = "36.0.0", features = ["pyarrow", "avro"] } +datafusion-common = { version = "36.0.0", features = ["pyarrow"] } +datafusion-expr = "36.0.0" +datafusion-optimizer = "36.0.0" +datafusion-sql = "36.0.0" +datafusion-substrait = { version = "36.0.0", optional = true } prost = "0.12" prost-types = "0.12" uuid = { version = "1.3", features = ["v4"] } diff --git a/src/common/data_type.rs b/src/common/data_type.rs index 6059768e3..d3203fdcd 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -290,15 +290,21 @@ impl DataTypeMap { Ok(DataType::Interval(IntervalUnit::MonthDayNano)) } ScalarValue::List(arr) => Ok(arr.data_type().to_owned()), - ScalarValue::Struct(_, fields) => Ok(DataType::Struct(fields.to_owned())), + ScalarValue::Struct(_fields) => Err(py_datafusion_err( + DataFusionError::NotImplemented("ScalarValue::Struct".to_string()), + )), ScalarValue::FixedSizeBinary(size, _) => Ok(DataType::FixedSizeBinary(*size)), ScalarValue::FixedSizeList(_array_ref) => { // The FieldRef was removed from ScalarValue::FixedSizeList in // https://github.com/apache/arrow-datafusion/pull/8221, so we can no // longer convert back to a DataType here - todo!() + Err(py_datafusion_err(DataFusionError::NotImplemented( + "ScalarValue::FixedSizeList".to_string(), + ))) } - ScalarValue::LargeList(_) => todo!(), + ScalarValue::LargeList(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( + "ScalarValue::LargeList".to_string(), + ))), ScalarValue::DurationSecond(_) => Ok(DataType::Duration(TimeUnit::Second)), ScalarValue::DurationMillisecond(_) => Ok(DataType::Duration(TimeUnit::Millisecond)), ScalarValue::DurationMicrosecond(_) => Ok(DataType::Duration(TimeUnit::Microsecond)), diff --git a/src/expr.rs b/src/expr.rs index dbb56ee99..5224aa246 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -32,7 +32,7 @@ use datafusion_expr::{ }; use crate::common::data_type::{DataTypeMap, RexType}; -use crate::errors::{py_runtime_err, py_type_err, DataFusionError}; +use crate::errors::{py_datafusion_err, py_runtime_err, py_type_err, DataFusionError}; use crate::expr::aggregate_expr::PyAggregateFunction; use crate::expr::binary_expr::PyBinaryExpr; use crate::expr::column::PyColumn; @@ -294,6 +294,7 @@ impl PyExpr { | Expr::OuterReferenceColumn(_, _) | Expr::IsNotUnknown(_) => RexType::Call, Expr::ScalarSubquery(..) => RexType::ScalarSubquery, + Expr::Unnest(_) => todo!(), }) } @@ -306,49 +307,81 @@ impl PyExpr { /// Extracts the Expr value into a PyObject that can be shared with Python pub fn python_value(&self, py: Python) -> PyResult { match &self.expr { - Expr::Literal(scalar_value) => Ok(match scalar_value { - ScalarValue::Null => todo!(), - ScalarValue::Boolean(v) => v.into_py(py), - ScalarValue::Float32(v) => v.into_py(py), - ScalarValue::Float64(v) => v.into_py(py), - ScalarValue::Decimal128(v, _, _) => v.into_py(py), - ScalarValue::Decimal256(_, _, _) => todo!(), - ScalarValue::Int8(v) => v.into_py(py), - ScalarValue::Int16(v) => v.into_py(py), - ScalarValue::Int32(v) => v.into_py(py), - ScalarValue::Int64(v) => v.into_py(py), - ScalarValue::UInt8(v) => v.into_py(py), - ScalarValue::UInt16(v) => v.into_py(py), - ScalarValue::UInt32(v) => v.into_py(py), - ScalarValue::UInt64(v) => v.into_py(py), - ScalarValue::Utf8(v) => v.clone().into_py(py), - ScalarValue::LargeUtf8(v) => v.clone().into_py(py), - ScalarValue::Binary(v) => v.clone().into_py(py), - ScalarValue::FixedSizeBinary(_, _) => todo!(), - ScalarValue::LargeBinary(v) => v.clone().into_py(py), - ScalarValue::List(_) => todo!(), - ScalarValue::Date32(v) => v.into_py(py), - ScalarValue::Date64(v) => v.into_py(py), - ScalarValue::Time32Second(v) => v.into_py(py), - ScalarValue::Time32Millisecond(v) => v.into_py(py), - ScalarValue::Time64Microsecond(v) => v.into_py(py), - ScalarValue::Time64Nanosecond(v) => v.into_py(py), - ScalarValue::TimestampSecond(v, _) => v.into_py(py), - ScalarValue::TimestampMillisecond(v, _) => v.into_py(py), - ScalarValue::TimestampMicrosecond(v, _) => v.into_py(py), - ScalarValue::TimestampNanosecond(v, _) => v.into_py(py), - ScalarValue::IntervalYearMonth(v) => v.into_py(py), - ScalarValue::IntervalDayTime(v) => v.into_py(py), - ScalarValue::IntervalMonthDayNano(v) => v.into_py(py), - ScalarValue::DurationSecond(v) => v.into_py(py), - ScalarValue::DurationMicrosecond(v) => v.into_py(py), - ScalarValue::DurationNanosecond(v) => v.into_py(py), - ScalarValue::DurationMillisecond(v) => v.into_py(py), - ScalarValue::Struct(_, _) => todo!(), - ScalarValue::Dictionary(_, _) => todo!(), - ScalarValue::FixedSizeList(_) => todo!(), - ScalarValue::LargeList(_) => todo!(), - }), + Expr::Literal(scalar_value) => match scalar_value { + ScalarValue::Null => Err(py_datafusion_err( + datafusion_common::DataFusionError::NotImplemented( + "ScalarValue::Null".to_string(), + ), + )), + ScalarValue::Boolean(v) => Ok(v.into_py(py)), + ScalarValue::Float32(v) => Ok(v.into_py(py)), + ScalarValue::Float64(v) => Ok(v.into_py(py)), + ScalarValue::Decimal128(v, _, _) => Ok(v.into_py(py)), + ScalarValue::Decimal256(_, _, _) => Err(py_datafusion_err( + datafusion_common::DataFusionError::NotImplemented( + "ScalarValue::Decimal256".to_string(), + ), + )), + ScalarValue::Int8(v) => Ok(v.into_py(py)), + ScalarValue::Int16(v) => Ok(v.into_py(py)), + ScalarValue::Int32(v) => Ok(v.into_py(py)), + ScalarValue::Int64(v) => Ok(v.into_py(py)), + ScalarValue::UInt8(v) => Ok(v.into_py(py)), + ScalarValue::UInt16(v) => Ok(v.into_py(py)), + ScalarValue::UInt32(v) => Ok(v.into_py(py)), + ScalarValue::UInt64(v) => Ok(v.into_py(py)), + ScalarValue::Utf8(v) => Ok(v.clone().into_py(py)), + ScalarValue::LargeUtf8(v) => Ok(v.clone().into_py(py)), + ScalarValue::Binary(v) => Ok(v.clone().into_py(py)), + ScalarValue::FixedSizeBinary(_, _) => Err(py_datafusion_err( + datafusion_common::DataFusionError::NotImplemented( + "ScalarValue::FixedSizeBinary".to_string(), + ), + )), + ScalarValue::LargeBinary(v) => Ok(v.clone().into_py(py)), + ScalarValue::List(_) => Err(py_datafusion_err( + datafusion_common::DataFusionError::NotImplemented( + "ScalarValue::List".to_string(), + ), + )), + ScalarValue::Date32(v) => Ok(v.into_py(py)), + ScalarValue::Date64(v) => Ok(v.into_py(py)), + ScalarValue::Time32Second(v) => Ok(v.into_py(py)), + ScalarValue::Time32Millisecond(v) => Ok(v.into_py(py)), + ScalarValue::Time64Microsecond(v) => Ok(v.into_py(py)), + ScalarValue::Time64Nanosecond(v) => Ok(v.into_py(py)), + ScalarValue::TimestampSecond(v, _) => Ok(v.into_py(py)), + ScalarValue::TimestampMillisecond(v, _) => Ok(v.into_py(py)), + ScalarValue::TimestampMicrosecond(v, _) => Ok(v.into_py(py)), + ScalarValue::TimestampNanosecond(v, _) => Ok(v.into_py(py)), + ScalarValue::IntervalYearMonth(v) => Ok(v.into_py(py)), + ScalarValue::IntervalDayTime(v) => Ok(v.into_py(py)), + ScalarValue::IntervalMonthDayNano(v) => Ok(v.into_py(py)), + ScalarValue::DurationSecond(v) => Ok(v.into_py(py)), + ScalarValue::DurationMicrosecond(v) => Ok(v.into_py(py)), + ScalarValue::DurationNanosecond(v) => Ok(v.into_py(py)), + ScalarValue::DurationMillisecond(v) => Ok(v.into_py(py)), + ScalarValue::Struct(_) => Err(py_datafusion_err( + datafusion_common::DataFusionError::NotImplemented( + "ScalarValue::Struct".to_string(), + ), + )), + ScalarValue::Dictionary(_, _) => Err(py_datafusion_err( + datafusion_common::DataFusionError::NotImplemented( + "ScalarValue::Dictionary".to_string(), + ), + )), + ScalarValue::FixedSizeList(_) => Err(py_datafusion_err( + datafusion_common::DataFusionError::NotImplemented( + "ScalarValue::FixedSizeList".to_string(), + ), + )), + ScalarValue::LargeList(_) => Err(py_datafusion_err( + datafusion_common::DataFusionError::NotImplemented( + "ScalarValue::LargeList".to_string(), + ), + )), + }, _ => Err(py_type_err(format!( "Non Expr::Literal encountered in types: {:?}", &self.expr @@ -455,6 +488,7 @@ impl PyExpr { // Currently un-support/implemented Expr types for Rex Call operations Expr::GroupingSet(..) + | Expr::Unnest(_) | Expr::OuterReferenceColumn(_, _) | Expr::Wildcard { .. } | Expr::ScalarSubquery(..) @@ -585,6 +619,10 @@ impl PyExpr { | Operator::BitwiseAnd | Operator::BitwiseOr => DataTypeMap::map_from_arrow_type(&DataType::Binary), Operator::AtArrow | Operator::ArrowAt => todo!(), + Operator::LikeMatch + | Operator::ILikeMatch + | Operator::NotLikeMatch + | Operator::NotILikeMatch => todo!(), }, Expr::Cast(Cast { expr: _, data_type }) => DataTypeMap::map_from_arrow_type(data_type), Expr::Literal(scalar_value) => DataTypeMap::map_from_scalar_value(scalar_value), diff --git a/src/functions.rs b/src/functions.rs index 5a558de0e..a6c4e23a5 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -252,7 +252,6 @@ scalar_function!(factorial, Factorial); scalar_function!(floor, Floor); scalar_function!(gcd, Gcd); scalar_function!(initcap, InitCap, "Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters."); -scalar_function!(isnan, Isnan); scalar_function!(iszero, Iszero); scalar_function!(lcm, Lcm); scalar_function!(left, Left, "Returns first n characters in the string, or when n is negative, returns all but last |n| characters."); @@ -348,15 +347,11 @@ scalar_function!(trunc, Trunc); scalar_function!(upper, Upper, "Converts the string to all upper case."); scalar_function!(make_array, MakeArray); scalar_function!(array, MakeArray); -scalar_function!(nullif, NullIf); scalar_function!(uuid, Uuid); scalar_function!(r#struct, Struct); // Use raw identifier since struct is a keyword scalar_function!(from_unixtime, FromUnixtime); scalar_function!(arrow_typeof, ArrowTypeof); scalar_function!(random, Random); -//Binary String Functions -scalar_function!(encode, Encode); -scalar_function!(decode, Decode); // Array Functions scalar_function!(array_append, ArrayAppend); @@ -382,10 +377,6 @@ scalar_function!(list_position, ArrayPosition); scalar_function!(list_indexof, ArrayPosition); scalar_function!(array_positions, ArrayPositions); scalar_function!(list_positions, ArrayPositions); -scalar_function!(array_to_string, ArrayToString); -scalar_function!(array_join, ArrayToString); -scalar_function!(list_to_string, ArrayToString); -scalar_function!(list_join, ArrayToString); scalar_function!(array_ndims, ArrayNdims); scalar_function!(list_ndims, ArrayNdims); scalar_function!(array_prepend, ArrayPrepend); @@ -510,7 +501,6 @@ pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(grouping))?; m.add_wrapped(wrap_pyfunction!(in_list))?; m.add_wrapped(wrap_pyfunction!(initcap))?; - m.add_wrapped(wrap_pyfunction!(isnan))?; m.add_wrapped(wrap_pyfunction!(iszero))?; m.add_wrapped(wrap_pyfunction!(lcm))?; m.add_wrapped(wrap_pyfunction!(left))?; @@ -530,7 +520,6 @@ pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(min))?; m.add_wrapped(wrap_pyfunction!(nanvl))?; m.add_wrapped(wrap_pyfunction!(now))?; - m.add_wrapped(wrap_pyfunction!(nullif))?; m.add_wrapped(wrap_pyfunction!(octet_length))?; m.add_wrapped(wrap_pyfunction!(order_by))?; m.add_wrapped(wrap_pyfunction!(pi))?; @@ -597,10 +586,6 @@ pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(bool_and))?; m.add_wrapped(wrap_pyfunction!(bool_or))?; - //Binary String Functions - m.add_wrapped(wrap_pyfunction!(encode))?; - m.add_wrapped(wrap_pyfunction!(decode))?; - // Array Functions m.add_wrapped(wrap_pyfunction!(array_append))?; m.add_wrapped(wrap_pyfunction!(array_push_back))?; @@ -625,10 +610,6 @@ pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(list_indexof))?; m.add_wrapped(wrap_pyfunction!(array_positions))?; m.add_wrapped(wrap_pyfunction!(list_positions))?; - m.add_wrapped(wrap_pyfunction!(array_to_string))?; - m.add_wrapped(wrap_pyfunction!(array_join))?; - m.add_wrapped(wrap_pyfunction!(list_to_string))?; - m.add_wrapped(wrap_pyfunction!(list_join))?; m.add_wrapped(wrap_pyfunction!(array_ndims))?; m.add_wrapped(wrap_pyfunction!(list_ndims))?; m.add_wrapped(wrap_pyfunction!(array_prepend))?; diff --git a/src/udaf.rs b/src/udaf.rs index 0e7a8deab..9aea761cd 100644 --- a/src/udaf.rs +++ b/src/udaf.rs @@ -41,12 +41,12 @@ impl RustAccumulator { } impl Accumulator for RustAccumulator { - fn state(&self) -> Result> { + fn state(&mut self) -> Result> { Python::with_gil(|py| self.accum.as_ref(py).call_method0("state")?.extract()) .map_err(|e| DataFusionError::Execution(format!("{e}"))) } - fn evaluate(&self) -> Result { + fn evaluate(&mut self) -> Result { Python::with_gil(|py| self.accum.as_ref(py).call_method0("evaluate")?.extract()) .map_err(|e| DataFusionError::Execution(format!("{e}"))) } From a624abeb91718f0e2e6262f636b274ddfd7fdca2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 2 Mar 2024 12:25:19 -0700 Subject: [PATCH 2/7] add udf scalar functions --- Cargo.lock | 1 + Cargo.toml | 1 + src/functions.rs | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 037035fe8..31e7bad01 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -946,6 +946,7 @@ dependencies = [ "datafusion", "datafusion-common", "datafusion-expr", + "datafusion-functions-array", "datafusion-optimizer", "datafusion-sql", "datafusion-substrait", diff --git a/Cargo.toml b/Cargo.toml index ce8cb06d1..f8c6bf9f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ pyo3 = { version = "0.20", features = ["extension-module", "abi3", "abi3-py38"] datafusion = { version = "36.0.0", features = ["pyarrow", "avro"] } datafusion-common = { version = "36.0.0", features = ["pyarrow"] } datafusion-expr = "36.0.0" +datafusion-functions-array = "36.0.0" datafusion-optimizer = "36.0.0" datafusion-sql = "36.0.0" datafusion-substrait = { version = "36.0.0", optional = true } diff --git a/src/functions.rs b/src/functions.rs index a6c4e23a5..107433d6c 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -16,6 +16,7 @@ // under the License. use pyo3::{prelude::*, wrap_pyfunction}; +use std::sync::Arc; use crate::context::PySessionContext; use crate::errors::DataFusionError; @@ -23,6 +24,7 @@ use crate::expr::conditional_expr::PyCaseBuilder; use crate::expr::window::PyWindowFrame; use crate::expr::PyExpr; use datafusion::execution::FunctionRegistry; +use datafusion::functions; use datafusion_common::{Column, TableReference}; use datafusion_expr::expr::Alias; use datafusion_expr::{ @@ -33,6 +35,50 @@ use datafusion_expr::{ }, lit, BuiltinScalarFunction, Expr, WindowFunctionDefinition, }; +use datafusion_functions_array; + +#[pyfunction] +pub fn isnan(expr: PyExpr) -> PyExpr { + functions::expr_fn::isnan(expr.into()).into() +} + +#[pyfunction] +pub fn nullif(expr1: PyExpr, expr2: PyExpr) -> PyExpr { + functions::expr_fn::nullif(expr1.into(), expr2.into()).into() +} + +#[pyfunction] +pub fn encode(input: PyExpr, encoding: PyExpr) -> PyExpr { + functions::expr_fn::encode(input.into(), encoding.into()).into() +} + +#[pyfunction] +pub fn decode(input: PyExpr, encoding: PyExpr) -> PyExpr { + functions::expr_fn::decode(input.into(), encoding.into()).into() +} + +#[pyfunction] +pub fn array_to_string(expr: PyExpr, delim: PyExpr) -> PyExpr { + datafusion_functions_array::expr_fn::array_to_string(expr.into(), delim.into()).into() +} + +#[pyfunction] +pub fn array_join(expr: PyExpr, delim: PyExpr) -> PyExpr { + // alias for array_to_string + array_to_string(expr, delim) +} + +#[pyfunction] +pub fn list_to_string(expr: PyExpr, delim: PyExpr) -> PyExpr { + // alias for array_to_string + array_to_string(expr, delim) +} + +#[pyfunction] +pub fn list_join(expr: PyExpr, delim: PyExpr) -> PyExpr { + // alias for array_to_string + array_to_string(expr, delim) +} #[pyfunction] fn in_list(expr: PyExpr, value: Vec, negated: bool) -> PyExpr { @@ -195,6 +241,25 @@ macro_rules! scalar_function { }; } +macro_rules! udf_scalar_function { + ($NAME: ident, $FUNC: ident) => { + udf_scalar_function!($NAME, $FUNC, stringify!($NAME)); + }; + + ($NAME: ident, $FUNC: ident, $DOC: expr) => { + #[doc = $DOC] + #[pyfunction] + #[pyo3(signature = (*args))] + fn $NAME(args: Vec) -> PyExpr { + let expr = datafusion_expr::Expr::ScalarFunction(ScalarFunction { + func_def: datafusion_expr::ScalarFunctionDefinition::UDF($FUNC::new()), + args: args.into_iter().map(|e| e.into()).collect(), + }); + expr.into() + } + }; +} + macro_rules! aggregate_function { ($NAME: ident, $FUNC: ident) => { aggregate_function!($NAME, $FUNC, stringify!($NAME)); @@ -501,6 +566,7 @@ pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(grouping))?; m.add_wrapped(wrap_pyfunction!(in_list))?; m.add_wrapped(wrap_pyfunction!(initcap))?; + m.add_wrapped(wrap_pyfunction!(isnan))?; m.add_wrapped(wrap_pyfunction!(iszero))?; m.add_wrapped(wrap_pyfunction!(lcm))?; m.add_wrapped(wrap_pyfunction!(left))?; @@ -520,6 +586,7 @@ pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(min))?; m.add_wrapped(wrap_pyfunction!(nanvl))?; m.add_wrapped(wrap_pyfunction!(now))?; + m.add_wrapped(wrap_pyfunction!(nullif))?; m.add_wrapped(wrap_pyfunction!(octet_length))?; m.add_wrapped(wrap_pyfunction!(order_by))?; m.add_wrapped(wrap_pyfunction!(pi))?; @@ -586,6 +653,10 @@ pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(bool_and))?; m.add_wrapped(wrap_pyfunction!(bool_or))?; + //Binary String Functions + m.add_wrapped(wrap_pyfunction!(encode))?; + m.add_wrapped(wrap_pyfunction!(decode))?; + // Array Functions m.add_wrapped(wrap_pyfunction!(array_append))?; m.add_wrapped(wrap_pyfunction!(array_push_back))?; @@ -610,6 +681,10 @@ pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(list_indexof))?; m.add_wrapped(wrap_pyfunction!(array_positions))?; m.add_wrapped(wrap_pyfunction!(list_positions))?; + m.add_wrapped(wrap_pyfunction!(array_to_string))?; + m.add_wrapped(wrap_pyfunction!(array_join))?; + m.add_wrapped(wrap_pyfunction!(list_to_string))?; + m.add_wrapped(wrap_pyfunction!(list_join))?; m.add_wrapped(wrap_pyfunction!(array_ndims))?; m.add_wrapped(wrap_pyfunction!(list_ndims))?; m.add_wrapped(wrap_pyfunction!(array_prepend))?; From e6e56f6f7921b644c2262087e5c45da9e00d1f29 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 2 Mar 2024 12:29:48 -0700 Subject: [PATCH 3/7] clippy --- src/functions.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/functions.rs b/src/functions.rs index 107433d6c..f75782e37 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -16,7 +16,6 @@ // under the License. use pyo3::{prelude::*, wrap_pyfunction}; -use std::sync::Arc; use crate::context::PySessionContext; use crate::errors::DataFusionError; @@ -35,7 +34,6 @@ use datafusion_expr::{ }, lit, BuiltinScalarFunction, Expr, WindowFunctionDefinition, }; -use datafusion_functions_array; #[pyfunction] pub fn isnan(expr: PyExpr) -> PyExpr { From 1cbc27cde4255f6c37bdcdfe193fc1ad199c19cf Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 2 Mar 2024 12:56:58 -0700 Subject: [PATCH 4/7] changelog --- CHANGELOG.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d5e3abbe2..0a05429f4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,23 @@ # DataFusion Python Changelog +## [36.0.0](https://github.com/apache/arrow-datafusion-python/tree/36.0.0) (2024-03-02) + +**Implemented enhancements:** + +- feat: Add `flatten` array function [#562](https://github.com/apache/arrow-datafusion-python/pull/562) (mobley-trent) + +**Documentation updates:** + +- docs: Add ASF attribution [#580](https://github.com/apache/arrow-datafusion-python/pull/580) (simicd) + +**Merged pull requests:** + +- Allow PyDataFrame to be used from other projects [#582](https://github.com/apache/arrow-datafusion-python/pull/582) (andygrove) +- docs: Add ASF attribution [#580](https://github.com/apache/arrow-datafusion-python/pull/580) (simicd) +- Add array functions [#560](https://github.com/apache/arrow-datafusion-python/pull/560) (ongchi) +- feat: Add `flatten` array function [#562](https://github.com/apache/arrow-datafusion-python/pull/562) (mobley-trent) + ## [35.0.0](https://github.com/apache/arrow-datafusion-python/tree/35.0.0) (2024-01-20) **Merged pull requests:** From 6c8bfd77644c29c76cca0b770e331fa8311e3c97 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 2 Mar 2024 12:58:15 -0700 Subject: [PATCH 5/7] remove unused macro --- src/functions.rs | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/functions.rs b/src/functions.rs index f75782e37..757fb31d1 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -239,25 +239,6 @@ macro_rules! scalar_function { }; } -macro_rules! udf_scalar_function { - ($NAME: ident, $FUNC: ident) => { - udf_scalar_function!($NAME, $FUNC, stringify!($NAME)); - }; - - ($NAME: ident, $FUNC: ident, $DOC: expr) => { - #[doc = $DOC] - #[pyfunction] - #[pyo3(signature = (*args))] - fn $NAME(args: Vec) -> PyExpr { - let expr = datafusion_expr::Expr::ScalarFunction(ScalarFunction { - func_def: datafusion_expr::ScalarFunctionDefinition::UDF($FUNC::new()), - args: args.into_iter().map(|e| e.into()).collect(), - }); - expr.into() - } - }; -} - macro_rules! aggregate_function { ($NAME: ident, $FUNC: ident) => { aggregate_function!($NAME, $FUNC, stringify!($NAME)); From baac724596c837a7e20f231f25d89141ebc8f153 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 2 Mar 2024 13:01:03 -0700 Subject: [PATCH 6/7] fix a todo --- src/expr.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/expr.rs b/src/expr.rs index 5224aa246..0b6093bd7 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -607,7 +607,11 @@ impl PyExpr { | Operator::RegexMatch | Operator::RegexIMatch | Operator::RegexNotMatch - | Operator::RegexNotIMatch => DataTypeMap::map_from_arrow_type(&DataType::Boolean), + | Operator::RegexNotIMatch + | Operator::LikeMatch + | Operator::ILikeMatch + | Operator::NotLikeMatch + | Operator::NotILikeMatch => DataTypeMap::map_from_arrow_type(&DataType::Boolean), Operator::Plus | Operator::Minus | Operator::Multiply | Operator::Modulo => { DataTypeMap::map_from_arrow_type(&DataType::Int64) } @@ -618,11 +622,9 @@ impl PyExpr { | Operator::BitwiseXor | Operator::BitwiseAnd | Operator::BitwiseOr => DataTypeMap::map_from_arrow_type(&DataType::Binary), - Operator::AtArrow | Operator::ArrowAt => todo!(), - Operator::LikeMatch - | Operator::ILikeMatch - | Operator::NotLikeMatch - | Operator::NotILikeMatch => todo!(), + Operator::AtArrow | Operator::ArrowAt => { + Err(py_type_err(format!("Unsupported expr: ${op}"))) + } }, Expr::Cast(Cast { expr: _, data_type }) => DataTypeMap::map_from_arrow_type(data_type), Expr::Literal(scalar_value) => DataTypeMap::map_from_scalar_value(scalar_value), From 32ef3a2b762b6fa78cbf1821be9d6d850e8f0728 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 2 Mar 2024 13:01:58 -0700 Subject: [PATCH 7/7] fix a todo --- src/expr.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/expr.rs b/src/expr.rs index 0b6093bd7..c0e7019f0 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -292,9 +292,9 @@ impl PyExpr { | Expr::IsNotFalse(..) | Expr::Placeholder { .. } | Expr::OuterReferenceColumn(_, _) + | Expr::Unnest(_) | Expr::IsNotUnknown(_) => RexType::Call, Expr::ScalarSubquery(..) => RexType::ScalarSubquery, - Expr::Unnest(_) => todo!(), }) }