apache
diff --git a/‎core/pom.xml‎
Lines changed: 5 additions & 0 deletions b/‎core/pom.xml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎core/src/main/java/org/apache/datafusion/AvroReadOptions.java‎
Lines changed: 60 additions & 0 deletions b/‎core/src/main/java/org/apache/datafusion/AvroReadOptions.java‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎core/src/main/java/org/apache/datafusion/DataFrame.java‎
Lines changed: 120 additions & 0 deletions b/‎core/src/main/java/org/apache/datafusion/DataFrame.java‎
Lines changed: 120 additions & 0 deletions
diff --git a/‎core/src/main/java/org/apache/datafusion/JsonWriteOptions.java‎
Lines changed: 91 additions & 0 deletions b/‎core/src/main/java/org/apache/datafusion/JsonWriteOptions.java‎
Lines changed: 91 additions & 0 deletions
@@ -57,6 +57,11 @@ under the License.
             <groupId>com.google.protobuf</groupId>
             <artifactId>protobuf-java</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.apache.avro</groupId>
+            <artifactId>avro</artifactId>
+            <scope>test</scope>
+        </dependency>
     </dependencies>
 
     <build>
 
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datafusion;
+
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.datafusion.protobuf.AvroReadOptionsProto;
+
+/**
+ * Configuration knobs for Avro sources passed to {@link SessionContext#registerAvro(String, String,
+ * AvroReadOptions)} and {@link SessionContext#readAvro(String, AvroReadOptions)}.
+ *
+ * <p>Mirrors the subset of DataFusion's {@code AvroReadOptions} that maps onto the Java surface
+ * today: {@code fileExtension} (default {@code ".avro"}) and an explicit Arrow {@code schema} that
+ * bypasses on-read schema inference. {@code tablePartitionCols} is intentionally deferred -- no
+ * other Java reader exposes Hive-style partitioning yet.
+ *
+ * <p>Avro carries its own per-block compression (snappy, deflate, bzip2, xz, zstandard) inside the
+ * object container itself, negotiated when the file is written, so unlike CSV / NDJSON there is no
+ * {@code FileCompressionType} setter.
+ */
+public final class AvroReadOptions {
+
+  /** Extension serialized into {@link AvroReadOptionsProto}; never {@code null}. */
+  private String fileExtension = ".avro";
+
+  /** Optional explicit Arrow schema; {@code null} until {@link #schema(Schema)} is called. */
+  private Schema schema;
+
+  /**
+   * Set the file extension carried by these options. Defaults to {@code ".avro"}.
+   *
+   * @param ext the extension to use; must be non-null.
+   * @return {@code this}, for fluent chaining.
+   * @throws IllegalArgumentException if {@code ext} is {@code null}. Rejecting it here keeps the
+   *     failure at the call site instead of surfacing later as a {@code NullPointerException}
+   *     from the protobuf builder inside {@link #toBytes()}.
+   */
+  public AvroReadOptions fileExtension(String ext) {
+    if (ext == null) {
+      throw new IllegalArgumentException("fileExtension must be non-null");
+    }
+    this.fileExtension = ext;
+    return this;
+  }
+
+  /**
+   * Set an explicit Arrow schema. Passing {@code null} clears any previously supplied schema.
+   *
+   * @param schema the schema to use, or {@code null} for none.
+   * @return {@code this}, for fluent chaining.
+   */
+  public AvroReadOptions schema(Schema schema) {
+    this.schema = schema;
+    return this;
+  }
+
+  /**
+   * Serialize these options as an {@link AvroReadOptionsProto} message. Only the file extension is
+   * encoded; the schema is not part of the message and is exposed separately via {@link
+   * #schema()}.
+   */
+  byte[] toBytes() {
+    return AvroReadOptionsProto.newBuilder().setFileExtension(fileExtension).build().toByteArray();
+  }
+
+  /** Return the explicit schema, or {@code null} if none was supplied. */
+  Schema schema() {
+    return schema;
+  }
+}
@@ -19,10 +19,17 @@
 
 package org.apache.datafusion;
 
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.channels.Channels;
+
 import org.apache.arrow.c.ArrowArrayStream;
 import org.apache.arrow.c.Data;
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.vector.ipc.ArrowReader;
+import org.apache.arrow.vector.ipc.ReadChannel;
+import org.apache.arrow.vector.ipc.message.MessageSerializer;
+import org.apache.arrow.vector.types.pojo.Schema;
 
 /**
  * A lazy representation of a query plan, mirroring the Rust DataFusion {@code DataFrame}. Created
@@ -106,6 +113,77 @@ public ArrowReader executeStream(BufferAllocator allocator) {
     }
   }
 
+  /**
+   * Return the Arrow {@link Schema} of this DataFrame's output. Non-consuming: the receiver remains
+   * usable and must still be closed independently. Schema inspection does not execute the plan.
+   *
+   * <p>The schema is transferred via Arrow IPC; no {@link BufferAllocator} is required because a
+   * schema carries no buffer data.
+   */
+  public Schema schema() {
+    if (nativeHandle == 0) {
+      throw new IllegalStateException("DataFrame is closed or already collected");
+    }
+    byte[] ipcBytes = schemaIpc(nativeHandle);
+    try {
+      return MessageSerializer.deserializeSchema(
+          new ReadChannel(Channels.newChannel(new ByteArrayInputStream(ipcBytes))));
+    } catch (IOException e) {
+      throw new RuntimeException("Failed to deserialize IPC schema", e);
+    }
+  }
+
+  /**
+   * Return a new DataFrame whose rows describe the plan that would execute this DataFrame.
+   * Non-consuming: the receiver remains usable and must still be closed independently.
+   *
+   * <p>With {@code verbose=false} and {@code analyze=false} (the cheap, lazy variant), the result
+   * contains the logical plan only. {@code verbose=true} adds optimised-plan and physical-plan
+   * rows; {@code analyze=true} runs the plan and attaches per-operator metrics. Render via {@link
+   * #show()} or {@link #collect(BufferAllocator)}.
+   */
+  public DataFrame explain(boolean verbose, boolean analyze) {
+    if (nativeHandle == 0) {
+      throw new IllegalStateException("DataFrame is closed or already collected");
+    }
+    return new DataFrame(explainPlan(nativeHandle, verbose, analyze));
+  }
+
+  /**
+   * Materialise this DataFrame into an in-memory table and return a new DataFrame that scans it.
+   * Non-consuming: the receiver remains usable and must still be closed independently.
+   *
+   * <p>Executes the plan eagerly: the entire result set is held in native memory until the returned
+   * DataFrame is closed. Suitable for intermediate results that will be reused across multiple
+   * downstream queries.
+   *
+   * @throws RuntimeException if execution fails.
+   */
+  public DataFrame cache() {
+    if (nativeHandle == 0) {
+      throw new IllegalStateException("DataFrame is closed or already collected");
+    }
+    return new DataFrame(cachePlan(nativeHandle));
+  }
+
+  /**
+   * Compute summary statistics (count, null_count, mean, std, min, max, median) over this
+   * DataFrame's columns and return them as a new DataFrame. Non-consuming: the receiver remains
+   * usable and must still be closed independently.
+   *
+   * <p>Executes the plan: DataFusion runs seven aggregate sub-plans against this DataFrame to build
+   * the summary table. Numeric columns receive every statistic; non-numeric columns receive {@code
+   * count} / {@code null_count} / {@code min} / {@code max} where applicable.
+   *
+   * @throws RuntimeException if execution fails.
+   */
+  public DataFrame describe() {
+    if (nativeHandle == 0) {
+      throw new IllegalStateException("DataFrame is closed or already collected");
+    }
+    return new DataFrame(describePlan(nativeHandle));
+  }
+
   /** Execute the plan and return the number of rows. */
   public long count() {
     if (nativeHandle == 0) {
@@ -399,6 +477,38 @@ public void writeCsv(String path, CsvWriteOptions options) {
     writeCsvWithOptions(nativeHandle, path, options.toBytes());
   }
 
+  /**
+   * Materialize this DataFrame as newline-delimited JSON at {@code path}. The path is treated as a
+   * directory unless overridden via {@link JsonWriteOptions#singleFileOutput(boolean)}. The
+   * receiver remains usable and must still be closed independently.
+   *
+   * @throws RuntimeException if the write fails.
+   */
+  public void writeJson(String path) {
+    writeJson(path, new JsonWriteOptions());
+  }
+
+  /**
+   * Materialize this DataFrame as newline-delimited JSON at {@code path} with the supplied {@link
+   * JsonWriteOptions}. The receiver remains usable and must still be closed independently.
+   *
+   * @throws IllegalArgumentException if {@code path} or {@code options} is {@code null}.
+   * @throws RuntimeException if the write fails (path inaccessible, invalid compression spec,
+   *     etc.).
+   */
+  public void writeJson(String path, JsonWriteOptions options) {
+    if (nativeHandle == 0) {
+      throw new IllegalStateException("DataFrame is closed or already collected");
+    }
+    if (path == null) {
+      throw new IllegalArgumentException("writeJson path must be non-null");
+    }
+    if (options == null) {
+      throw new IllegalArgumentException("writeJson options must be non-null");
+    }
+    writeJsonWithOptions(nativeHandle, path, options.toBytes());
+  }
+
   @Override
   public void close() {
     if (nativeHandle != 0) {
@@ -415,6 +525,14 @@ public void close() {
 
   private static native long countRows(long handle);
 
+  private static native byte[] schemaIpc(long handle); // Arrow IPC bytes decoded by schema()
+  // Returns the native handle that explain(boolean, boolean) wraps in a new DataFrame.
+  private static native long explainPlan(long handle, boolean verbose, boolean analyze);
+  // Returns the native handle that cache() wraps in a new DataFrame.
+  private static native long cachePlan(long handle);
+  // Returns the native handle that describe() wraps in a new DataFrame.
+  private static native long describePlan(long handle);
+
   private static native void showDataFrame(long handle);
 
   private static native void showDataFrameWithLimit(long handle, int limit);
@@ -450,4 +568,6 @@ private static native void writeParquetWithOptions(
       boolean singleFileOutputValue);
 
   private static native void writeCsvWithOptions(long handle, String path, byte[] optionsBytes);
+  // optionsBytes is the output of JsonWriteOptions.toBytes(), passed by writeJson(String, JsonWriteOptions).
+  private static native void writeJsonWithOptions(long handle, String path, byte[] optionsBytes);
 }
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datafusion;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Configuration knobs for writing JSON, passed to {@link DataFrame#writeJson(String,
+ * JsonWriteOptions)}.
+ *
+ * <p>Mirrors a subset of DataFusion's {@code DataFrameWriteOptions} and the writer-side {@code
+ * JsonOptions}. All setters return {@code this} for fluent chaining. Defaults: every field {@code
+ * null} or empty (meaning the DataFusion default is used).
+ *
+ * <p>Path semantics: when {@link #singleFileOutput(boolean)} is {@code true}, the path passed to
+ * {@code writeJson} is the literal output filename. When left unset (the default) and there are no
+ * partition columns, the path is treated as a directory that DataFusion populates with one or more
+ * part-files.
+ *
+ * <p>The output is always newline-delimited JSON (NDJSON). DataFusion's JSON writer does not emit
+ * the bracketed array form, so there is no toggle for it here.
+ *
+ * <p>Compression reuses {@link FileCompressionType} -- the same codec set ({@code UNCOMPRESSED},
+ * {@code GZIP}, {@code BZIP2}, {@code XZ}, {@code ZSTD}) the read side and the CSV writer accept.
+ */
+public final class JsonWriteOptions {
+
+  /** {@code null} means "not set": the field is then omitted from the serialized message. */
+  private Boolean singleFileOutput;
+
+  /** Partition column names; empty when none are configured. Never holds {@code null}. */
+  private final List<String> partitionCols = new ArrayList<>();
+
+  /** {@code null} means "not set": the field is then omitted from the serialized message. */
+  private FileCompressionType fileCompressionType;
+
+  /**
+   * When {@code true}, write to a single file at the supplied path. When left unset (the default)
+   * and no partition columns are configured, the path is treated as a directory and DataFusion
+   * writes one or more part-files.
+   *
+   * @return {@code this}, for fluent chaining.
+   */
+  public JsonWriteOptions singleFileOutput(boolean v) {
+    this.singleFileOutput = v;
+    return this;
+  }
+
+  /**
+   * Hive-style partition columns. Each column listed here is removed from the data rows and encoded
+   * into the directory layout (one subdirectory per distinct value). Mutually exclusive with {@link
+   * #singleFileOutput(boolean)} -- DataFusion rejects the combination at write time.
+   *
+   * <p>Each call replaces the previously configured columns. Calling with no arguments clears
+   * them.
+   *
+   * @param cols the partition column names; neither the array nor any element may be {@code null}.
+   * @return {@code this}, for fluent chaining.
+   * @throws IllegalArgumentException if {@code cols} or any of its elements is {@code null}. The
+   *     previously configured columns are left untouched in that case.
+   */
+  public JsonWriteOptions partitionCols(String... cols) {
+    if (cols == null) {
+      throw new IllegalArgumentException("partitionCols must be non-null");
+    }
+    // Validate into a scratch list first so a bad argument never leaves this object half-updated.
+    List<String> validated = new ArrayList<>(cols.length);
+    for (String c : cols) {
+      if (c == null) {
+        throw new IllegalArgumentException("partitionCols must not contain null");
+      }
+      validated.add(c);
+    }
+    this.partitionCols.clear();
+    this.partitionCols.addAll(validated);
+    return this;
+  }
+
+  /**
+   * Output compression codec. Defaults to uncompressed. Passing {@code null} reverts to the
+   * unset state.
+   *
+   * @return {@code this}, for fluent chaining.
+   */
+  public JsonWriteOptions fileCompressionType(FileCompressionType t) {
+    this.fileCompressionType = t;
+    return this;
+  }
+
+  /**
+   * Serialize these options as a {@code JsonWriteOptionsProto} message. Unset optional fields
+   * ({@code singleFileOutput}, {@code fileCompressionType}) are omitted from the message; the
+   * partition columns are always added, possibly as an empty list.
+   */
+  byte[] toBytes() {
+    org.apache.datafusion.protobuf.JsonWriteOptionsProto.Builder b =
+        org.apache.datafusion.protobuf.JsonWriteOptionsProto.newBuilder();
+    if (singleFileOutput != null) {
+      b.setSingleFileOutput(singleFileOutput);
+    }
+    b.addAllPartitionCols(partitionCols);
+    if (fileCompressionType != null) {
+      b.setFileCompressionType(FileCompressionTypes.toProto(fileCompressionType));
+    }
+    return b.build().toByteArray();
+  }
+}