|
16 | 16 |
|
17 | 17 | def wrangle(x, return_dtype=False, **kwargs): |
18 | 18 | """ |
19 | | - Turn messy data into clean data |
| 19 | + Turn messy data into clean pandas DataFrames |
| 20 | +
|
| 21 | + Automatically detects and converts various data types into a consistent DataFrame format. |
| 22 | + Specializes in text processing using modern NLP models and handles mixed data types. |
20 | 23 |
|
21 | 24 | Parameters |
22 | 25 | ---------- |
23 | | - :param x: data in any format (text, numpy arrays, pandas dataframes, or a mixed list (or nested lists) of those |
24 | | - types). The following datatypes are supported: |
| 26 | + :param x: data in any format. Supported datatypes: |
25 | 27 | - Numpy Arrays, array-like objects, or paths to files that store array-like objects |
26 | | - - Pandas DataFrames, dataframe-like objects, or paths to files that store dataframe-like objects |
27 | | - - Images, or paths to files that store images |
28 | | - - Text, or paths to plain text files |
29 | | - - Mixed lists of the above |
30 | | - :param return_dtype: if True, also return the auto-detected datatype(s) of each dataset you wrangle |
31 | | - :param kwargs: used to control how data are wrangled (e.g., if you don't want to use the default options for each |
32 | | - data type): |
33 | | - - array_kwargs: passed to the datawrangler.zoo.array.wrangle_array function to control how arrays are handled |
34 | | - - dataframe_kwargs: passed to the datawrangler.zoo.dataframe.wrangle_dataframe function to control how |
35 | | - dataframes are handled |
36 | | - - image_kwargs: passed to the datawrangler.zoo.image.wrangle_image function to control how images are handled |
37 | | - - text_kwargs: passed to the datawrangler.zoo.text.wrangle_text function to control how text data are handled |
38 | | - any other keyword arguments are passed to *all* of the wrangle functions. |
| 28 | + - Pandas DataFrames, dataframe-like objects, or paths to files that store dataframe-like objects |
| 29 | + - Text strings, lists of strings, or paths to plain text files |
| 30 | + - Mixed lists or nested lists of the above types |
| 31 | + :param return_dtype: if True, also return the auto-detected datatype(s) of each dataset. Default: False |
| 32 | + :param kwargs: control how data are wrangled: |
| 33 | + - array_kwargs: passed to wrangle_array function to control how arrays are handled |
| 34 | + - dataframe_kwargs: passed to wrangle_dataframe function to control how dataframes are handled |
| 35 | + - text_kwargs: passed to wrangle_text function to control how text data are handled |
| 36 | + Common text_kwargs options: |
| 37 | + - {'model': 'all-MiniLM-L6-v2'} for sentence-transformers |
| 38 | + - {'model': ['CountVectorizer', 'LatentDirichletAllocation']} for an sklearn pipeline |
| 39 | + Any other keyword arguments are passed to all wrangle functions. |
39 | 40 |
|
40 | 41 | Returns |
41 | 42 | ------- |
42 | 43 | :return: a DataFrame, or a list of DataFrames, containing the wrangled data; if return_dtype is True, the auto-detected datatype(s) are also returned
| 44 | + |
| 45 | + Examples |
| 46 | + -------- |
| 47 | + >>> import datawrangler as dw |
| 48 | + >>> df = dw.wrangle([1, 2, 3]) # Convert array to DataFrame |
| 49 | + >>> text_df = dw.wrangle(["Hello", "World"], text_kwargs={'model': 'all-MiniLM-L6-v2'}) |
| 50 | + >>> mixed_df, dtypes = dw.wrangle([df, text_df], return_dtype=True) |
43 | 51 | """ |
44 | 52 |
|
45 | 53 | deep_kwargs = {} |
|
0 commit comments