@@ -237,21 +237,45 @@ public static Schema infer(List<Object[]> data, String[] headers, int rowLimit)
237237 * (direct data, files, or URLs), creating tables from each source, and then inferring
238238 * schemas from those tables. All inferred schemas must be equal, otherwise an exception
239239 * is thrown.
240+ * This method can incur a significant performance penalty for large data sets; in that case, use the
241+ * overloaded method with a row limit.
240242 *
241243 * @param data Direct data source - can be a String containing table data or an ArrayNode
242244 * containing JSON representation of table data. May be null if using file or URL sources.
243- * @param charset The character encoding to use when reading from URLs. Used for URL streams only.
244- * @return The inferred Schema that is consistent across all provided data sources
245+ * @param charset The character encoding to use when reading from URLs. Used for URL streams only.
246+ *
247+ * @return The inferred Schema that is consistent across all provided data sources
248+ * @throws IllegalStateException if no valid data source is provided, if the data type is not supported,
249+ * or if schemas inferred from different sources are not equal
250+ * @throws RuntimeException if an IOException occurs while reading from files or URLs
251+ */
252+ public static Schema infer (Object data , Charset charset ) {
253+ return infer (data , charset , -1 );
254+ }
255+
256+ /**
257+ * Infers a table schema from various data sources.
258+ *
259+ * This method attempts to infer a schema by reading data from one or more sources
260+ * (direct data, files, or URLs), creating tables from each source, and then inferring
261+ * schemas from those tables. All inferred schemas must be equal, otherwise an exception
262+ * is thrown.
263+ *
264+ * @param data Direct data source - can be a String containing table data or an ArrayNode
265+ * containing JSON representation of table data. May be null if using file or URL sources.
266+ * @param charset The character encoding to use when reading from URLs. Used for URL streams only.
267+ * @param rowLimit The maximum number of rows to scan. Huge input files can take a considerable time to infer.
268+ * @return The inferred Schema that is consistent across all provided data sources
245269 * @throws IllegalStateException if no valid data source is provided, if the data type is not supported,
246270 * or if schemas inferred from different sources are not equal
247271 * @throws RuntimeException if an IOException occurs while reading from files or URLs
248272 */
249273 public static Schema infer (
250274 Object data ,
251- Charset charset ) {
275+ Charset charset ,
276+ int rowLimit ) {
252277 List <File > paths = new ArrayList <>();
253278 List <URL > urls = new ArrayList <>();
254- // Infer schema from data source
255279 List <String > s = new ArrayList <>();
256280 if (data != null ) {
257281 if (data instanceof String ) {
@@ -349,7 +373,7 @@ public static Schema infer(
349373 for (String str : s ) {
350374 Table table = Table .fromSource (str );
351375 String [] headers = table .getHeaders ();
352- Schema schema = table .inferSchema (headers , - 1 );
376+ Schema schema = table .inferSchema (headers , rowLimit );
353377 schemas .add (schema );
354378 }
355379 Schema lastSchema = null ;
0 commit comments