Skip to content

Commit 8137796

Browse files
committed
Add article summary feature with OpenAI integration
- Introduce 'summary' query parameter in /api/content/v1/parser endpoint
- Integrate OpenAI API for generating article summaries
- Add OpenAIKey field to the UReadability struct and corresponding command-line flag
- Update extractArticleEmulateReadability to handle summary requests
- Add GenerateSummary method using OpenAI's GPT-4o model (which turned out to be faster than even GPT-4o mini)
- Add OpenAIClient interface and mock for testing
- Update README.md with new configuration options and API details

This feature allows users to request a summary of extracted articles using OpenAI's GPT-4o model. To ensure secure usage, summary generation requires a valid server token. The changes include comprehensive error handling and test coverage for various scenarios, including token validation and server misconfiguration.

# Conflicts:
#	backend/go.mod
#	backend/rest/server.go
1 parent c7e8d86 commit 8137796

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+6994
-2
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
| address | UKEEPER_ADDRESS | all interfaces | web server listening address |
1212
| port | UKEEPER_PORT | `8080` | web server port |
1313
| mongo_uri | MONGO_URI | none | MongoDB connection string, _required_ |
14+
| openai_key | OPENAI_KEY | none | OpenAI API key for summary generation |
1415
| frontend_dir | FRONTEND_DIR | `/srv/web` | directory with frontend files |
1516
| token | TOKEN | none | token for /content/v1/parser endpoint auth |
1617
| mongo-delay | MONGO_DELAY | `0` | mongo initial delay |
@@ -20,7 +21,7 @@
2021

2122
### API
2223

23-
GET /api/content/v1/parser?token=secret&url=http://aa.com/blah - extract content (emulate Readability API parse call)
24+
GET /api/content/v1/parser?token=secret&summary=true&url=http://aa.com/blah - extract content (emulate Readability API parse call); the `summary` parameter is optional and works only when both an OpenAI key and a server token are configured
2425
POST /api/v1/extract {url: http://aa.com/blah} - extract content
2526

2627
## Development

backend/extractor/openai_mock.go

Lines changed: 82 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backend/extractor/readability.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,17 @@ import (
1414
"github.com/PuerkitoBio/goquery"
1515
log "github.com/go-pkgz/lgr"
1616
"github.com/mauidude/go-readability"
17+
"github.com/sashabaranov/go-openai"
1718
"go.mongodb.org/mongo-driver/bson/primitive"
1819

1920
"github.com/ukeeper/ukeeper-redabilty/backend/datastore"
2021
)
2122

23+
//go:generate moq -out openai_mock.go . OpenAIClient
24+
// OpenAIClient is the subset of the OpenAI client this package relies on to
// request chat completions. Declaring it as an interface allows a generated
// mock (see the moq go:generate directive) to stand in for the real client.
type OpenAIClient interface {
25+
// CreateChatCompletion sends request and returns the completion response or an error.
CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error)
26+
}
27+
2228
// Rules interface with all methods to access datastore
2329
type Rules interface {
2430
Get(ctx context.Context, rURL string) (datastore.Rule, bool)
@@ -33,10 +39,14 @@ type UReadability struct {
3339
TimeOut time.Duration
3440
SnippetSize int
3541
Rules Rules
42+
OpenAIKey string
43+
44+
openAIClient OpenAIClient
3645
}
3746

3847
// Response from api calls
3948
type Response struct {
49+
Summary string `json:"summary,omitempty"`
4050
Content string `json:"content"`
4151
Rich string `json:"rich_content"`
4252
Domain string `json:"domain"`
@@ -68,6 +78,37 @@ func (f *UReadability) ExtractByRule(ctx context.Context, reqURL string, rule *d
6878
return f.extractWithRules(ctx, reqURL, rule)
6979
}
7080

81+
func (f *UReadability) GenerateSummary(ctx context.Context, content string) (string, error) {
82+
if f.OpenAIKey == "" {
83+
return "", fmt.Errorf("OpenAI key is not set")
84+
}
85+
if f.openAIClient == nil {
86+
f.openAIClient = openai.NewClient(f.OpenAIKey)
87+
}
88+
resp, err := f.openAIClient.CreateChatCompletion(
89+
ctx,
90+
openai.ChatCompletionRequest{
91+
Model: openai.GPT4o,
92+
Messages: []openai.ChatCompletionMessage{
93+
{
94+
Role: openai.ChatMessageRoleSystem,
95+
Content: "You are a helpful assistant that summarizes articles. Please summarize the main points in a few sentences as TLDR style (don't add a TLDR label). Then, list up to five detailed bullet points. Provide the response in plain text. Do not add any additional information. Do not add a Summary at the beginning of the response. If detailed bullet points are too similar to the summary, don't include them at all:",
96+
},
97+
{
98+
Role: openai.ChatMessageRoleUser,
99+
Content: content,
100+
},
101+
},
102+
},
103+
)
104+
105+
if err != nil {
106+
return "", err
107+
}
108+
109+
return resp.Choices[0].Message.Content, nil
110+
}
111+
71112
// ExtractWithRules is the core function that handles extraction with or without a specific rule
72113
func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
73114
log.Printf("[INFO] extract %s", reqURL)

backend/extractor/readability_test.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"testing"
1212
"time"
1313

14+
"github.com/sashabaranov/go-openai"
1415
"github.com/stretchr/testify/assert"
1516
"github.com/stretchr/testify/require"
1617
"go.mongodb.org/mongo-driver/bson/primitive"
@@ -207,3 +208,61 @@ func TestGetContentCustom(t *testing.T) {
207208
assert.Len(t, content, 6988)
208209
assert.Len(t, rich, 7169)
209210
}
211+
212+
func TestUReadability_GenerateSummary(t *testing.T) {
213+
mockOpenAI := &OpenAIClientMock{
214+
CreateChatCompletionFunc: func(ctx context.Context, request openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error) {
215+
return openai.ChatCompletionResponse{
216+
Choices: []openai.ChatCompletionChoice{
217+
{
218+
Message: openai.ChatCompletionMessage{
219+
Content: "This is a summary of the article.",
220+
},
221+
},
222+
},
223+
}, nil
224+
},
225+
}
226+
227+
tests := []struct {
228+
name string
229+
content string
230+
openAIKey string
231+
expectedResult string
232+
expectedError string
233+
}{
234+
{
235+
name: "Valid OpenAI Key and content",
236+
content: "This is a test article content.",
237+
openAIKey: "test-key",
238+
expectedResult: "This is a summary of the article.",
239+
expectedError: "",
240+
},
241+
{
242+
name: "No OpenAI Key",
243+
content: "This is a test article content.",
244+
openAIKey: "",
245+
expectedResult: "",
246+
expectedError: "OpenAI key is not set",
247+
},
248+
}
249+
250+
for _, tt := range tests {
251+
t.Run(tt.name, func(t *testing.T) {
252+
readability := UReadability{
253+
OpenAIKey: tt.openAIKey,
254+
openAIClient: mockOpenAI,
255+
}
256+
257+
result, err := readability.GenerateSummary(context.Background(), tt.content)
258+
259+
if tt.expectedError != "" {
260+
require.Error(t, err)
261+
assert.Contains(t, err.Error(), tt.expectedError)
262+
} else {
263+
require.NoError(t, err)
264+
assert.Equal(t, tt.expectedResult, result)
265+
}
266+
})
267+
}
268+
}

backend/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ require (
1212
github.com/jessevdk/go-flags v1.6.1
1313
github.com/kennygrant/sanitize v1.2.4
1414
github.com/mauidude/go-readability v0.0.0-20220221173116-a9b3620098b7
15+
github.com/sashabaranov/go-openai v1.38.2
1516
github.com/stretchr/testify v1.10.0
1617
go.mongodb.org/mongo-driver v1.17.3
1718
golang.org/x/net v0.38.0

backend/go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,8 @@ github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6So
168168
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
169169
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
170170
github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
171+
github.com/sashabaranov/go-openai v1.38.2 h1:akrssjj+6DY3lWuDwHv6cBvJ8Z+FZDM9XEaaYFt0Auo=
172+
github.com/sashabaranov/go-openai v1.38.2/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
171173
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
172174
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
173175
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=

backend/main.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ var opts struct {
2727
MongoURI string `short:"m" long:"mongo_uri" env:"MONGO_URI" required:"true" description:"MongoDB connection string"`
2828
MongoDelay time.Duration `long:"mongo-delay" env:"MONGO_DELAY" default:"0" description:"mongo initial delay"`
2929
MongoDB string `long:"mongo-db" env:"MONGO_DB" default:"ureadability" description:"mongo database name"`
30+
OpenAIKey string `long:"openai_key" env:"OPENAI_KEY" description:"OpenAI API key for summary generation"`
3031
Debug bool `long:"dbg" env:"DEBUG" description:"debug mode"`
3132
}
3233

@@ -47,7 +48,7 @@ func main() {
4748
log.Fatalf("[ERROR] can't connect to mongo %v", err)
4849
}
4950
srv := rest.Server{
50-
Readability: extractor.UReadability{TimeOut: 30 * time.Second, SnippetSize: 300, Rules: db.GetStores()},
51+
Readability: extractor.UReadability{TimeOut: 30 * time.Second, SnippetSize: 300, Rules: db.GetStores(), OpenAIKey: opts.OpenAIKey},
5152
Token: opts.Token,
5253
Credentials: opts.Credentials,
5354
Version: revision,

backend/rest/server.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"net/http"
1010
"os"
1111
"path/filepath"
12+
"strconv"
1213
"strings"
1314
"time"
1415

@@ -176,11 +177,27 @@ func (s *Server) extractArticle(w http.ResponseWriter, r *http.Request) {
176177
// if token is not set for application, it won't be checked
177178
func (s *Server) extractArticleEmulateReadability(w http.ResponseWriter, r *http.Request) {
178179
token := r.URL.Query().Get("token")
180+
summary, _ := strconv.ParseBool(r.URL.Query().Get("summary"))
181+
179182
if s.Token != "" && token == "" {
180183
rest.SendErrorJSON(w, r, log.Default(), http.StatusExpectationFailed, nil, "no token passed")
181184
return
182185
}
183186

187+
// Check if summary is requested but token is not provided, or OpenAI key is not set
188+
if summary {
189+
if s.Readability.OpenAIKey == "" {
190+
render.Status(r, http.StatusBadRequest)
191+
render.JSON(w, r, JSON{"error": "OpenAI key is not set"})
192+
return
193+
}
194+
if s.Token == "" {
195+
render.Status(r, http.StatusBadRequest)
196+
render.JSON(w, r, JSON{"error": "summary generation requires token, but token is not set for the server"})
197+
return
198+
}
199+
}
200+
184201
if s.Token != "" && s.Token != token {
185202
rest.SendErrorJSON(w, r, log.Default(), http.StatusUnauthorized, nil, "wrong token passed")
186203
return
@@ -198,6 +215,16 @@ func (s *Server) extractArticleEmulateReadability(w http.ResponseWriter, r *http
198215
return
199216
}
200217

218+
if summary {
219+
summaryText, err := s.Readability.GenerateSummary(r.Context(), res.Content)
220+
if err != nil {
221+
render.Status(r, http.StatusInternalServerError)
222+
render.JSON(w, r, JSON{"error": fmt.Sprintf("failed to generate summary: %v", err)})
223+
return
224+
}
225+
res.Summary = summaryText
226+
}
227+
201228
rest.RenderJSON(w, &res)
202229
}
203230

@@ -237,6 +264,13 @@ func (s *Server) handlePreview(w http.ResponseWriter, r *http.Request) {
237264
continue
238265
}
239266

267+
if s.Readability.OpenAIKey != "" {
268+
result.Summary, e = s.Readability.GenerateSummary(r.Context(), result.Content)
269+
if e != nil {
270+
log.Printf("[WARN] failed to generate summary for preview of %s: %v", url, e)
271+
}
272+
}
273+
240274
responses = append(responses, *result)
241275
}
242276

@@ -247,6 +281,7 @@ func (s *Server) handlePreview(w http.ResponseWriter, r *http.Request) {
247281
Excerpt string
248282
Rich template.HTML
249283
Content string
284+
Summary template.HTML
250285
}
251286

252287
results := make([]result, 0, len(responses))
@@ -258,6 +293,8 @@ func (s *Server) handlePreview(w http.ResponseWriter, r *http.Request) {
258293
//nolint:gosec // this content is escaped by Extractor, so it's safe to use it as is
259294
Rich: template.HTML(r.Rich),
260295
Content: r.Content,
296+
//nolint: gosec // we do not expect CSS from OpenAI response
297+
Summary: template.HTML(strings.ReplaceAll(r.Summary, "\n", "<br>")),
261298
})
262299
}
263300

0 commit comments

Comments
 (0)