@@ -64,9 +64,8 @@ df = session.read.parquet("hf://datasets/username/dataset_name/*.parquet", merge
6464
6565To read private datasets, you need to set your Hugging Face token as an environment variable:
6666
67- ``` python
68- import os
69- os.environ[" HF_TOKEN" ] = " your_hugging_face_token_here"
67+ ``` shell
68+ export HF_TOKEN=" your_hugging_face_token_here"
7069```
7170
7271### Path Format
@@ -113,16 +112,19 @@ Once loaded from Hugging Face, you can use fenic's full DataFrame API:
113112``` python
114113import fenic as fc
115114
116- # Load IMDB dataset from Hugging Face
117- df = session.read.parquet(" hf://datasets/imdb/plain_text/train-*.parquet" )
115+ # Create session
116+ session = fc.Session.get_or_create()
117+
118+ # Load IMDB dataset from Hugging Face
119+ df = session.read.parquet(" hf://datasets/imdb/plain_text/train-*.parquet" )
118120
119- # Filter and select
120- positive_reviews = df.filter(fc.api.functions .col(" label" ) == 1 ).select(" text" , " label" )
121+ # Filter and select
122+ positive_reviews = df.filter(fc.col(" label" ) == 1 ).select(" text" , " label" )
121123
122- # Group by and aggregate
123- label_counts = df.group_by(" label" ).agg(
124- fc.api.functions. count().alias(" count" )
125- )
124+ # Group by and aggregate
125+ label_counts = df.group_by(" label" ).agg(
126+ fc.count(" * " ).alias(" count" )
127+ )
126128```
127129
128130### AI-Powered Operations
@@ -132,55 +134,60 @@ To use semantic and embedding operations, configure language and embedding model
132134``` python
133135import fenic as fc
134136
135- # Load a text dataset from Hugging Face
136- df = session.read.parquet(" hf://datasets/imdb/plain_text/train-00000-of-00001.parquet" )
137-
138- # Add embeddings to text columns
139- df_with_embeddings = df.with_column(
140- " embedding" ,
141- fc.api.functions.embedding(" text" )
142- )
143-
144- # Apply semantic functions for sentiment analysis
145- df_analyzed = df.with_column(
146- " sentiment_score" ,
147- fc.api.functions.semantic(" Rate the sentiment from 1-10: {text} " )
148- )
137+ # Create session
138+ session = fc.Session.get_or_create()
139+
140+ # Load a text dataset from Hugging Face
141+ df = session.read.parquet(" hf://datasets/imdb/plain_text/train-00000-of-00001.parquet" )
142+
143+ # Add embeddings to text columns
144+ df_with_embeddings = df.select(
145+ " *" ,
146+ fc.semantic.embed(fc.col(" text" )).alias(" embedding" )
147+ )
148+
149+ # Apply semantic functions for sentiment analysis
150+ df_analyzed = df_with_embeddings.select(
151+ " *" ,
152+ fc.semantic.analyze_sentiment(
153+ fc.col(" text" ),
154+ model_alias = " gpt-4o-mini" # Optional: specify model
155+ ).alias(" sentiment" )
156+ )
149157```
150158
151159## Example: Analyzing MMLU Dataset
152160
153161``` python
154- import fenic as fc
155- import os
156-
157- # Set HF token if accessing private datasets
158- os.environ[" HF_TOKEN" ] = " your_token_here"
159-
160- # Load MMLU astronomy subset from Hugging Face
161- df = session.read.parquet(" hf://datasets/cais/mmlu/astronomy/*.parquet" )
162-
163- # Process the data
164- processed_df = (df
165- # Filter for specific criteria
166- .filter(fc.api.functions.col(" subject" ) == " astronomy" )
167- # Select relevant columns
168- .select(" question" , " choices" , " answer" )
169- # Add difficulty analysis (requires semantic configuration)
170- .with_column(" difficulty" ,
171- fc.api.functions.semantic(" Rate difficulty 1-5: {question} " ))
172- )
173-
174- # Show results
175- processed_df.show()
162+ import fenic as fc
163+
164+ # Create session
165+ session = fc.Session.get_or_create()
166+
167+ # Load MMLU astronomy subset from Hugging Face
168+ df = session.read.parquet(" hf://datasets/cais/mmlu/astronomy/*.parquet" )
169+
170+ # Process the data
171+ processed_df = (df
172+ # Filter for specific criteria
173+ .filter(fc.col(" subject" ) == " astronomy" )
174+ # Select relevant columns
175+ .select(" question" , " choices" , " answer" )
176+ # Add difficulty analysis using semantic.map
177+ .select(
178+ " *" ,
179+ fc.semantic.map(
180+ " Rate the difficulty of this question from 1-5: {{ question}} " ,
181+ question = fc.col(" question" ),
182+ model_alias = " gpt-4o-mini" # Optional: specify model
183+ ).alias(" difficulty" )
184+ )
185+ )
186+
187+ # Show results
188+ processed_df.show()
176189```
177190
178- ## Limitations
179-
180- - **Writing to Hugging Face Hub**: Currently not supported. fenic can only read from the Hub.
181- - **Supported Read Formats**: Limited to CSV and Parquet formats when reading from the Hub.
182- - **Semantic Operations**: Require configuring language/embedding models in SessionConfig.
183-
184191## Resources
185192
186193- [fenic GitHub Repository](https://github.com/typedef-ai/fenic)
0 commit comments