1919
2020package org .apache .datafusion ;
2121
22+ import java .io .ByteArrayInputStream ;
23+ import java .io .IOException ;
24+ import java .nio .channels .Channels ;
25+
2226import org .apache .arrow .c .ArrowArrayStream ;
2327import org .apache .arrow .c .Data ;
2428import org .apache .arrow .memory .BufferAllocator ;
2529import org .apache .arrow .vector .ipc .ArrowReader ;
30+ import org .apache .arrow .vector .ipc .ReadChannel ;
31+ import org .apache .arrow .vector .ipc .message .MessageSerializer ;
32+ import org .apache .arrow .vector .types .pojo .Schema ;
2633
2734/**
2835 * A lazy representation of a query plan, mirroring the Rust DataFusion {@code DataFrame}. Created
@@ -106,6 +113,77 @@ public ArrowReader executeStream(BufferAllocator allocator) {
106113 }
107114 }
108115
116+ /**
117+ * Return the Arrow {@link Schema} of this DataFrame's output. Non-consuming: the receiver remains
118+ * usable and must still be closed independently. Schema inspection does not execute the plan.
119+ *
120+ * <p>The schema is transferred via Arrow IPC; no {@link BufferAllocator} is required because a
121+ * schema carries no buffer data.
122+ */
123+ public Schema schema () {
124+ if (nativeHandle == 0 ) {
125+ throw new IllegalStateException ("DataFrame is closed or already collected" );
126+ }
127+ byte [] ipcBytes = schemaIpc (nativeHandle );
128+ try {
129+ return MessageSerializer .deserializeSchema (
130+ new ReadChannel (Channels .newChannel (new ByteArrayInputStream (ipcBytes ))));
131+ } catch (IOException e ) {
132+ throw new RuntimeException ("Failed to deserialize IPC schema" , e );
133+ }
134+ }
135+
136+ /**
137+ * Return a new DataFrame whose rows describe the plan that would execute this DataFrame.
138+ * Non-consuming: the receiver remains usable and must still be closed independently.
139+ *
140+ * <p>With {@code verbose=false} and {@code analyze=false} (the cheap, lazy variant), the result
141+ * contains the logical plan only. {@code verbose=true} adds optimised-plan and physical-plan
142+ * rows; {@code analyze=true} runs the plan and attaches per-operator metrics. Render via {@link
143+ * #show()} or {@link #collect(BufferAllocator)}.
144+ */
145+ public DataFrame explain (boolean verbose , boolean analyze ) {
146+ if (nativeHandle == 0 ) {
147+ throw new IllegalStateException ("DataFrame is closed or already collected" );
148+ }
149+ return new DataFrame (explainPlan (nativeHandle , verbose , analyze ));
150+ }
151+
152+ /**
153+ * Materialise this DataFrame into an in-memory table and return a new DataFrame that scans it.
154+ * Non-consuming: the receiver remains usable and must still be closed independently.
155+ *
156+ * <p>Executes the plan eagerly: the entire result set is held in native memory until the returned
157+ * DataFrame is closed. Suitable for intermediate results that will be reused across multiple
158+ * downstream queries.
159+ *
160+ * @throws RuntimeException if execution fails.
161+ */
162+ public DataFrame cache () {
163+ if (nativeHandle == 0 ) {
164+ throw new IllegalStateException ("DataFrame is closed or already collected" );
165+ }
166+ return new DataFrame (cachePlan (nativeHandle ));
167+ }
168+
169+ /**
170+ * Compute summary statistics (count, null_count, mean, std, min, max, median) over this
171+ * DataFrame's columns and return them as a new DataFrame. Non-consuming: the receiver remains
172+ * usable and must still be closed independently.
173+ *
174+ * <p>Executes the plan: DataFusion runs seven aggregate sub-plans against this DataFrame to build
175+ * the summary table. Numeric columns receive every statistic; non-numeric columns receive {@code
176+ * count} / {@code null_count} / {@code min} / {@code max} where applicable.
177+ *
178+ * @throws RuntimeException if execution fails.
179+ */
180+ public DataFrame describe () {
181+ if (nativeHandle == 0 ) {
182+ throw new IllegalStateException ("DataFrame is closed or already collected" );
183+ }
184+ return new DataFrame (describePlan (nativeHandle ));
185+ }
186+
109187 /** Execute the plan and return the number of rows. */
110188 public long count () {
111189 if (nativeHandle == 0 ) {
@@ -399,6 +477,38 @@ public void writeCsv(String path, CsvWriteOptions options) {
399477 writeCsvWithOptions (nativeHandle , path , options .toBytes ());
400478 }
401479
480+ /**
481+ * Materialize this DataFrame as newline-delimited JSON at {@code path}. The path is treated as a
482+ * directory unless overridden via {@link JsonWriteOptions#singleFileOutput(boolean)}. The
483+ * receiver remains usable and must still be closed independently.
484+ *
485+ * @throws RuntimeException if the write fails.
486+ */
487+ public void writeJson (String path ) {
488+ writeJson (path , new JsonWriteOptions ());
489+ }
490+
491+ /**
492+ * Materialize this DataFrame as newline-delimited JSON at {@code path} with the supplied {@link
493+ * JsonWriteOptions}. The receiver remains usable and must still be closed independently.
494+ *
495+ * @throws IllegalArgumentException if {@code path} or {@code options} is {@code null}.
496+ * @throws RuntimeException if the write fails (path inaccessible, invalid compression spec,
497+ * etc.).
498+ */
499+ public void writeJson (String path , JsonWriteOptions options ) {
500+ if (nativeHandle == 0 ) {
501+ throw new IllegalStateException ("DataFrame is closed or already collected" );
502+ }
503+ if (path == null ) {
504+ throw new IllegalArgumentException ("writeJson path must be non-null" );
505+ }
506+ if (options == null ) {
507+ throw new IllegalArgumentException ("writeJson options must be non-null" );
508+ }
509+ writeJsonWithOptions (nativeHandle , path , options .toBytes ());
510+ }
511+
402512 @ Override
403513 public void close () {
404514 if (nativeHandle != 0 ) {
@@ -415,6 +525,14 @@ public void close() {
415525
416526 private static native long countRows (long handle );
417527
/**
 * Native call behind {@code schema()}: returns the bytes that {@code schema()} decodes with
 * {@code MessageSerializer.deserializeSchema}. Callers pass a non-zero {@code nativeHandle}.
 */
528+ private static native byte [] schemaIpc (long handle );
529+
/**
 * Native call behind {@code explain(boolean, boolean)}: the returned {@code long} is wrapped in a
 * new {@code DataFrame}, so it is presumably a handle to the explain plan (confirm against the
 * native implementation).
 */
530+ private static native long explainPlan (long handle , boolean verbose , boolean analyze );
531+
/**
 * Native call behind {@code cache()}: the returned {@code long} is wrapped in a new
 * {@code DataFrame}.
 */
532+ private static native long cachePlan (long handle );
533+
/**
 * Native call behind {@code describe()}: the returned {@code long} is wrapped in a new
 * {@code DataFrame}.
 */
534+ private static native long describePlan (long handle );
535+
535+
418536 private static native void showDataFrame (long handle );
419537
420538 private static native void showDataFrameWithLimit (long handle , int limit );
@@ -450,4 +568,6 @@ private static native void writeParquetWithOptions(
450568 boolean singleFileOutputValue );
451569
452570 private static native void writeCsvWithOptions (long handle , String path , byte [] optionsBytes );
571+
/**
 * Native call behind {@code writeJson(String, JsonWriteOptions)}: receives the DataFrame handle,
 * the non-null destination path, and the options as produced by {@code JsonWriteOptions.toBytes()}.
 */
572+ private static native void writeJsonWithOptions (long handle , String path , byte [] optionsBytes );
453573}
0 commit comments