Skip to content

Commit a649e3c

Browse files
fix: confluence page-body limit handling and selector warnings (#355)
What was wrong - When a page body was bigger than --max-page-body-megabytes, the client skipped it before the plugin saw it. - Because of that, the plugin never marked the page ID / space ID as “seen”. - Result: valid page IDs/space IDs could show up in the “don’t exist / no access” warning. What changed - Page-body size limit is now enforced in the plugin, not inside the client visitor. - We always mark the page as “seen” (returnedPageIDs / returnedSpaceIDs) before deciding to skip its content. - Warning about missing selectors now only reports IDs/keys that truly returned no pages or were invalid. Behaviour - Big pages are skipped for content, but not treated as missing/inaccessible. - Total scan / per-response limits behaviour is unchanged. Extra - Updated sort parameter to use modified date instead of created date
1 parent dd74671 commit a649e3c

File tree

3 files changed

+41
-188
lines changed

3 files changed

+41
-188
lines changed

plugins/confluence.go

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,8 @@ func (p *ConfluencePlugin) initialize(base, username, token string) error {
228228
token,
229229
WithMaxAPIResponseBytes(p.maxAPIResponseBytes),
230230
WithMaxTotalScanBytes(p.maxTotalScanBytes),
231-
WithMaxPageBodyBytes(p.maxPageBodyBytes),
231+
// Page-body size limit is enforced in the plugin layer so that
232+
// large-but-accessible pages still count as "seen".
232233
)
233234
if err != nil {
234235
return err
@@ -383,6 +384,26 @@ func (p *ConfluencePlugin) emitInChunks(page *Page) error {
383384
}
384385
}
385386

387+
// shouldSkipPageBody applies the per-page body size limit (if configured).
388+
// It logs a warning when the limit is exceeded and returns true to indicate
389+
// that the page's content (and history) should be skipped.
390+
func (p *ConfluencePlugin) shouldSkipPageBody(page *Page) bool {
391+
if p.maxPageBodyBytes == 0 || page == nil || page.Body.Storage == nil {
392+
return false
393+
}
394+
395+
bodySize := int64(len(page.Body.Storage.Value))
396+
if bodySize <= p.maxPageBodyBytes {
397+
return false
398+
}
399+
400+
log.Warn().
401+
Str("page_id", page.ID).
402+
Int64("body_bytes", bodySize).
403+
Msg("Skipping page content because the Confluence page body exceeded the configured size limit.")
404+
return true
405+
}
406+
386407
// emitUniquePage emits the current version of a page (and, if enabled, its historical versions)
387408
// ensuring each page ID is emitted only once. Also records the page SpaceID for selector warnings.
388409
func (p *ConfluencePlugin) emitUniquePage(ctx context.Context, page *Page) error {
@@ -395,6 +416,12 @@ func (p *ConfluencePlugin) emitUniquePage(ctx context.Context, page *Page) error
395416
p.returnedSpaceIDs[page.SpaceID] = struct{}{}
396417
}
397418

419+
// Enforce page body size limit after marking the page as seen, so that
420+
// large-but-accessible pages are not misreported as missing or unauthorized.
421+
if p.shouldSkipPageBody(page) {
422+
return nil
423+
}
424+
398425
// current version
399426
if err := p.emitInChunks(page); err != nil {
400427
return err
@@ -420,6 +447,9 @@ func (p *ConfluencePlugin) emitHistory(ctx context.Context, page *Page) error {
420447
if err != nil {
421448
return err
422449
}
450+
if p.shouldSkipPageBody(versionedPage) {
451+
return nil
452+
}
423453
return p.emitInChunks(versionedPage)
424454
})
425455
}

plugins/confluence_client.go

Lines changed: 10 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@ import (
1212
"strconv"
1313
"strings"
1414
"time"
15-
16-
"github.com/rs/zerolog/log"
1715
)
1816

1917
const (
@@ -72,7 +70,6 @@ type httpConfluenceClient struct {
7270
// Optional limits (0 means "no limit").
7371
maxAPIResponseBytes int64
7472
maxTotalScanBytes int64
75-
maxPageBodyBytes int64
7673

7774
totalScanBytes int64
7875

@@ -107,18 +104,6 @@ func WithMaxTotalScanBytes(n int64) ConfluenceClientOption {
107104
}
108105
}
109106

110-
// WithMaxPageBodyBytes sets an optional limit in bytes for individual page bodies.
111-
// Pages whose storage body exceeds this limit are skipped and logged as warnings.
112-
// A value of 0 disables the limit. Negative values are treated as 0.
113-
func WithMaxPageBodyBytes(n int64) ConfluenceClientOption {
114-
if n < 0 {
115-
n = 0
116-
}
117-
return func(c *httpConfluenceClient) {
118-
c.maxPageBodyBytes = n
119-
}
120-
}
121-
122107
// NewConfluenceClient constructs a ConfluenceClient for the given base input URL.
123108
// Behavior:
124109
// - Normalizes base wiki URL to "https://{host}/wiki".
@@ -243,7 +228,7 @@ func (c *httpConfluenceClient) walkPagesWithFilter(
243228
q := apiURL.Query()
244229
q.Set("limit", strconv.Itoa(limit))
245230
q.Set("body-format", "storage")
246-
q.Set("sort", "-created-date") // Newest created date to oldest
231+
q.Set("sort", "-modified-date") // Newest modified date to oldest
247232

248233
if filterKey != "" && len(filterValues) > 0 {
249234
q.Set(filterKey, strings.Join(filterValues, ","))
@@ -383,9 +368,8 @@ func (c *httpConfluenceClient) walkPagesPaginated(
383368
linkNext := nextURLFromLinkHeader(headers)
384369

385370
reader := c.wrapWithCountingReader(rc)
386-
limitedVisit := c.wrapWithPageBodyLimit(visit)
387371

388-
bodyNext, decodeErr := streamPagesFromBody(reader, limitedVisit)
372+
bodyNext, decodeErr := streamPagesFromBody(reader, visit)
389373
closeErr := rc.Close()
390374

391375
if decodeErr != nil {
@@ -435,30 +419,6 @@ func (c *httpConfluenceClient) wrapWithCountingReader(rc io.ReadCloser) io.Reade
435419
}
436420
}
437421

438-
// wrapWithPageBodyLimit decorates the visit callback to skip pages whose
439-
// storage body exceeds maxPageBodyBytes, logging a warning when that happens.
440-
func (c *httpConfluenceClient) wrapWithPageBodyLimit(
441-
visit func(*Page) error,
442-
) func(*Page) error {
443-
if c.maxPageBodyBytes == 0 {
444-
return visit
445-
}
446-
447-
return func(p *Page) error {
448-
if p.Body.Storage != nil {
449-
bodySize := int64(len(p.Body.Storage.Value))
450-
if bodySize > c.maxPageBodyBytes {
451-
log.Warn().
452-
Str("page_id", p.ID).
453-
Int64("body_bytes", bodySize).
454-
Msg("Skipping page content because the Confluence page body exceeded the configured size limit.")
455-
return nil
456-
}
457-
}
458-
return visit(p)
459-
}
460-
}
461-
462422
// handleStreamError centralizes handling for decode/close errors and decides
463423
// whether to stop, continue with a new cursor, or return an error.
464424
func (c *httpConfluenceClient) handleStreamError(
@@ -637,14 +597,14 @@ type PageBody struct {
637597

638598
// Page represents a Confluence page.
639599
type Page struct {
640-
ID string `json:"id"`
641-
Status string `json:"status"`
642-
Title string `json:"title"`
643-
SpaceID string `json:"spaceId"`
644-
Type string `json:"type"`
645-
Body PageBody `json:"body"`
646-
Links map[string]string `json:"_links"`
647-
Version PageVersion `json:"version"`
600+
ID string `json:"id"`
601+
Status string `json:"status"`
602+
Title string `json:"title"`
603+
SpaceID string `json:"spaceId"`
604+
Type string `json:"type"`
605+
Body PageBody `json:"body"`
606+
Links map[string]string `json:"_links"`
607+
Version PageVersion `json:"version"`
648608
}
649609

650610
// Space represents a Confluence space.

plugins/confluence_client_test.go

Lines changed: 0 additions & 137 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import (
99
"net/http"
1010
"net/http/httptest"
1111
"net/url"
12-
"reflect"
1312
"strings"
1413
"testing"
1514
"time"
@@ -1480,39 +1479,6 @@ func TestWithMaxTotalScanBytes(t *testing.T) {
14801479
}
14811480
}
14821481

1483-
func TestWithMaxPageBodyBytes(t *testing.T) {
1484-
tests := []struct {
1485-
name string
1486-
input int64
1487-
want int64
1488-
}{
1489-
{
1490-
name: "zero disables limit",
1491-
input: 0,
1492-
want: 0,
1493-
},
1494-
{
1495-
name: "positive sets limit",
1496-
input: 1024,
1497-
want: 1024,
1498-
},
1499-
{
1500-
name: "negative treated as zero",
1501-
input: -42,
1502-
want: 0,
1503-
},
1504-
}
1505-
1506-
for _, tt := range tests {
1507-
t.Run(tt.name, func(t *testing.T) {
1508-
c := &httpConfluenceClient{}
1509-
WithMaxPageBodyBytes(tt.input)(c)
1510-
1511-
assert.Equal(t, tt.want, c.maxPageBodyBytes)
1512-
})
1513-
}
1514-
}
1515-
15161482
func TestCountingReader(t *testing.T) {
15171483
tests := []struct {
15181484
name string
@@ -1615,109 +1581,6 @@ func TestWrapWithCountingReader(t *testing.T) {
16151581
}
16161582
}
16171583

1618-
func TestWrapWithPageBodyLimit(t *testing.T) {
1619-
mkStorage := func(content string) *struct {
1620-
Value string `json:"value"`
1621-
} {
1622-
return &struct {
1623-
Value string `json:"value"`
1624-
}{Value: content}
1625-
}
1626-
1627-
tests := []struct {
1628-
name string
1629-
maxPageBodyBytes int64
1630-
page *Page
1631-
visitErr error
1632-
expectVisitCalled bool
1633-
expectedErr error
1634-
expectSameFunc bool
1635-
}{
1636-
{
1637-
name: "no limit uses original visitor",
1638-
maxPageBodyBytes: 0,
1639-
page: &Page{
1640-
ID: "P1",
1641-
Body: PageBody{
1642-
Storage: mkStorage("some content that can be any size"),
1643-
},
1644-
},
1645-
visitErr: assert.AnError,
1646-
expectVisitCalled: true,
1647-
expectedErr: assert.AnError,
1648-
expectSameFunc: true,
1649-
},
1650-
{
1651-
name: "body below limit calls visitor",
1652-
maxPageBodyBytes: 10,
1653-
page: &Page{
1654-
ID: "P2",
1655-
Body: PageBody{
1656-
Storage: mkStorage("small"), // len = 5
1657-
},
1658-
},
1659-
visitErr: assert.AnError,
1660-
expectVisitCalled: true,
1661-
expectedErr: assert.AnError,
1662-
expectSameFunc: false,
1663-
},
1664-
{
1665-
name: "body above limit skips visitor",
1666-
maxPageBodyBytes: 5,
1667-
page: &Page{
1668-
ID: "P3",
1669-
Body: PageBody{
1670-
Storage: mkStorage("too-large-body"), // len > 5
1671-
},
1672-
},
1673-
visitErr: assert.AnError, // should never be returned
1674-
expectVisitCalled: false,
1675-
expectedErr: nil,
1676-
expectSameFunc: false,
1677-
},
1678-
{
1679-
name: "nil storage calls visitor",
1680-
maxPageBodyBytes: 5,
1681-
page: &Page{
1682-
ID: "P4",
1683-
Body: PageBody{
1684-
Storage: nil,
1685-
},
1686-
},
1687-
visitErr: assert.AnError,
1688-
expectVisitCalled: true,
1689-
expectedErr: assert.AnError,
1690-
expectSameFunc: false,
1691-
},
1692-
}
1693-
1694-
for _, tc := range tests {
1695-
t.Run(tc.name, func(t *testing.T) {
1696-
c := &httpConfluenceClient{
1697-
maxPageBodyBytes: tc.maxPageBodyBytes,
1698-
}
1699-
1700-
visited := false
1701-
visitErr := tc.visitErr
1702-
1703-
visit := func(p *Page) error {
1704-
visited = true
1705-
return visitErr
1706-
}
1707-
1708-
wrapped := c.wrapWithPageBodyLimit(visit)
1709-
1710-
sameFunc := reflect.ValueOf(visit).Pointer() == reflect.ValueOf(wrapped).Pointer()
1711-
1712-
err := wrapped(tc.page)
1713-
1714-
assert.Equal(t, tc.expectSameFunc, sameFunc)
1715-
assert.Equal(t, tc.expectVisitCalled, visited)
1716-
assert.ErrorIs(t, err, tc.expectedErr)
1717-
})
1718-
}
1719-
}
1720-
17211584
func TestHandleStreamError(t *testing.T) {
17221585
base, _ := url.Parse("https://x.test/wiki/api/v2/pages?limit=10")
17231586

0 commit comments

Comments
 (0)