@@ -14,6 +14,16 @@ To get started, pip install `fenic`:
1414pip install fenic
1515```
1616
17+ ### Create a Session
18+
19+ Instantiate a fenic session with the default configuration (sufficient for reading datasets and other non-semantic operations):
20+
21+ ``` python
22+ import fenic as fc
23+
24+ session = fc.Session.get_or_create(fc.SessionConfig())
25+ ```
26+
1727## Overview
1828
1929fenic is an opinionated data processing framework that combines:
@@ -39,7 +49,8 @@ To read a dataset from the Hugging Face Hub:
3949``` python
4050import fenic as fc
4151
42- # Assuming session is already created
52+ session = fc.Session.get_or_create(fc.SessionConfig())
53+
4354# Read a CSV file from a public dataset
4455df = session.read.csv("hf://datasets/datasets-examples/doc-formats-csv-1/data.csv")
4556
@@ -112,19 +123,18 @@ Once loaded from Hugging Face, you can use fenic's full DataFrame API:
112123``` python
113124import fenic as fc
114125
115- # Create session
116- session = fc.Session.get_or_create()
126+ session = fc.Session.get_or_create(fc.SessionConfig())
117127
118- # Load IMDB dataset from Hugging Face
119- df = session.read.parquet(" hf://datasets/imdb/plain_text/train-*.parquet" )
128+ # Load IMDB dataset from Hugging Face
129+ df = session.read.parquet("hf://datasets/imdb/plain_text/train-*.parquet")
120130
121- # Filter and select
122- positive_reviews = df.filter(fc.col(" label" ) == 1 ).select(" text" , " label" )
131+ # Filter and select
132+ positive_reviews = df.filter(fc.col(" label" ) == 1 ).select(" text" , " label" )
123133
124- # Group by and aggregate
125- label_counts = df.group_by(" label" ).agg(
126- fc.count(" *" ).alias(" count" )
127- )
134+ # Group by and aggregate
135+ label_counts = df.group_by(" label" ).agg(
136+ fc.count(" *" ).alias(" count" )
137+ )
128138```
129139
130140### AI-Powered Operations
@@ -134,61 +144,92 @@ To use semantic and embedding operations, configure language and embedding model
134144``` python
135145import fenic as fc
136146
137- # Create session
138- session = fc.Session.get_or_create()
139-
140- # Load a text dataset from Hugging Face
141- df = session.read.parquet(" hf://datasets/imdb/plain_text/train-00000-of-00001.parquet" )
142-
143- # Add embeddings to text columns
144- df_with_embeddings = df.select(
145- " *" ,
146- fc.semantic.embed(fc.col(" text" )).alias(" embedding" )
147- )
148-
149- # Apply semantic functions for sentiment analysis
150- df_analyzed = df_with_embeddings.select(
151- " *" ,
152- fc.semantic.analyze_sentiment(
153- fc.col(" text" ),
154- model_alias = " gpt-4o-mini" # Optional: specify model
155- ).alias(" sentiment" )
156- )
147+ # Requires the OPENAI_API_KEY environment variable to be set for language and embedding model calls
148+ session = fc.Session.get_or_create(
149+ fc.SessionConfig(
150+ semantic = fc.SemanticConfig(
151+ language_models = {
152+ " gpt-4o-mini" : fc.OpenAILanguageModel(
153+ model_name = " gpt-4o-mini" ,
154+ rpm = 60 ,
155+ tpm = 60000 ,
156+ )
157+ },
158+ embedding_models = {
159+ " text-embedding-3-small" : fc.OpenAIEmbeddingModel(
160+ model_name = " text-embedding-3-small" ,
161+ rpm = 60 ,
162+ tpm = 60000 ,
163+ )
164+ },
165+ )
166+ )
167+ )
168+
169+ # Load a text dataset from Hugging Face
170+ df = session.read.parquet("hf://datasets/imdb/plain_text/train-00000-of-00001.parquet")
171+
172+ # Add embeddings to text columns
173+ df_with_embeddings = df.select(
174+ " *" ,
175+ fc.semantic.embed(fc.col(" text" )).alias(" embedding" )
176+ )
177+
178+ # Apply semantic functions for sentiment analysis
179+ df_analyzed = df_with_embeddings.select(
180+ " *" ,
181+ fc.semantic.analyze_sentiment(
182+ fc.col(" text" ),
183+ model_alias = " gpt-4o-mini" , # Optional: specify model
184+ ).alias(" sentiment" )
185+ )
157186```
158187
159188## Example: Analyzing MMLU Dataset
160189
161190``` python
162- import fenic as fc
163-
164- # Create session
165- session = fc.Session.get_or_create()
166-
167- # Load MMLU astronomy subset from Hugging Face
168- df = session.read.parquet(" hf://datasets/cais/mmlu/astronomy/*.parquet" )
169-
170- # Process the data
171- processed_df = (df
172- # Filter for specific criteria
173- .filter(fc.col(" subject" ) == " astronomy" )
174- # Select relevant columns
175- .select(" question" , " choices" , " answer" )
176- # Add difficulty analysis using semantic.map
177- .select(
178- " *" ,
179- fc.semantic.map(
180- " Rate the difficulty of this question from 1-5: {{ question}} " ,
181- question = fc.col(" question" ),
182- model_alias = " gpt-4o-mini" # Optional: specify model
183- ).alias(" difficulty" )
184- )
185- )
186-
187- # Show results
188- processed_df.show()
191+ import fenic as fc
192+
193+ # Requires the OPENAI_API_KEY environment variable to be set for semantic (language model) calls
194+ session = fc.Session.get_or_create(
195+ fc.SessionConfig(
196+ semantic = fc.SemanticConfig(
197+ language_models = {
198+ " gpt-4o-mini" : fc.OpenAILanguageModel(
199+ model_name = " gpt-4o-mini" ,
200+ rpm = 60 ,
201+ tpm = 60000 ,
202+ )
203+ },
204+ )
205+ )
206+ )
207+
208+ # Load MMLU astronomy subset from Hugging Face
209+ df = session.read.parquet("hf://datasets/cais/mmlu/astronomy/*.parquet")
210+
211+ # Process the data
212+ processed_df = (df
213+ # Filter for specific criteria
214+ .filter(fc.col(" subject" ) == " astronomy" )
215+ # Select relevant columns
216+ .select(" question" , " choices" , " answer" )
217+ # Add difficulty analysis using semantic.map
218+ .select(
219+ " *" ,
220+ fc.semantic.map(
221+ " Rate the difficulty of this question from 1-5: {{ question}} " ,
222+ question = fc.col(" question" ),
223+ model_alias = " gpt-4o-mini" # Optional: specify model
224+ ).alias(" difficulty" )
225+ )
226+ )
227+
228+ # Show results
229+ processed_df.show()
189230```
190231
191232## Resources
192233
193234- [ fenic GitHub Repository] ( https://github.com/typedef-ai/fenic )
194- - [ fenic Documentation] ( https://docs.fenic.ai/latest/ )
235+ - [ fenic Documentation] ( https://docs.fenic.ai/latest/ )
0 commit comments