Skip to content

Commit 88d4933

Browse files
orishoshan and omris94 authored
Fix rare issue concerning captures going stale on the AWS VPC CNI, as well as auto-resolve addresses with no currently seen DNS traffic in the cluster (#242)
Co-authored-by: omri.s <omri.s@otterize.com>
1 parent 35157de commit 88d4933

File tree

13 files changed

+305
-146
lines changed

13 files changed

+305
-146
lines changed

src/go.mod

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/go.sum

Lines changed: 0 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/mapper/pkg/clouduploader/cloud_upload.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ func (c *CloudUploader) NotifyIntents(ctx context.Context, intents []intentsstor
6363
toCloud.Intent.ServerAlias = &cloudclient.ServerAliasInput{Name: intent.Intent.Server.KubernetesService, Kind: lo.ToPtr(serviceidentity.KindService)}
6464
}
6565
// debug log all the fields of intent input one by one with their values
66-
logrus.Debugf("intent CleintName: %s\t Namespace: %s\t ServerName: %s\t ServerNamespace: %s\t ClientWorkloadKind: %s\t ServerWorkloadKind: %s\t ServerAlias: %v", lo.FromPtr(toCloud.Intent.ClientName), lo.FromPtr(toCloud.Intent.Namespace), lo.FromPtr(toCloud.Intent.ServerName), lo.FromPtr(toCloud.Intent.ServerNamespace), lo.FromPtr(toCloud.Intent.ClientWorkloadKind), lo.FromPtr(toCloud.Intent.ServerWorkloadKind), lo.FromPtr(toCloud.Intent.ServerAlias))
66+
logrus.Debugf("intent ClientName: %s\t Namespace: %s\t ServerName: %s\t ServerNamespace: %s\t ClientWorkloadKind: %s\t ServerWorkloadKind: %s\t ServerAlias: %v", lo.FromPtr(toCloud.Intent.ClientName), lo.FromPtr(toCloud.Intent.Namespace), lo.FromPtr(toCloud.Intent.ServerName), lo.FromPtr(toCloud.Intent.ServerNamespace), lo.FromPtr(toCloud.Intent.ClientWorkloadKind), lo.FromPtr(toCloud.Intent.ServerWorkloadKind), lo.FromPtr(toCloud.Intent.ServerAlias))
6767

6868
return toCloud
6969
})

src/mapper/pkg/config/config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ const (
3131
DNSCacheItemsMaxCapacityKey = "dns-cache-items-max-capacity"
3232
DNSCacheItemsMaxCapacityDefault = 100000
3333
DNSClientIntentsUpdateIntervalKey = "dns-client-intents-update-interval"
34-
DNSClientIntentsUpdateIntervalDefault = 1 * time.Second
34+
DNSClientIntentsUpdateIntervalDefault = 100 * time.Millisecond
3535
DNSClientIntentsUpdateEnabledKey = "dns-client-intents-update-enabled"
3636
DNSClientIntentsUpdateEnabledDefault = true
3737
ServiceCacheTTLDurationKey = "service-cache-ttl-duration"

src/mapper/pkg/dnscache/dns_cache.go

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,45 +2,44 @@ package dnscache
22

33
import (
44
"context"
5-
"github.com/jellydator/ttlcache/v3"
65
"github.com/otterize/network-mapper/src/mapper/pkg/config"
76
"github.com/sirupsen/logrus"
87
"github.com/spf13/viper"
8+
"net"
99
"time"
1010
)
1111

1212
type DNSCache struct {
13-
cache *ttlcache.Cache[string, string]
13+
cache *TTLCache[string, string]
14+
}
15+
16+
type Resolver interface {
17+
LookupIPAddr(ctx context.Context, host string) ([]net.IPAddr, error)
1418
}
1519

1620
func NewDNSCache() *DNSCache {
1721
capacity := viper.GetInt(config.DNSCacheItemsMaxCapacityKey)
18-
dnsRecordCache := ttlcache.New[string, string](ttlcache.WithCapacity[string, string](uint64(capacity)))
19-
go dnsRecordCache.Start()
20-
21-
lastCapacityReachedErrorPrint := time.Time{}
22-
dnsRecordCache.OnEviction(func(ctx context.Context, reason ttlcache.EvictionReason, item *ttlcache.Item[string, string]) {
23-
if reason == ttlcache.EvictionReasonCapacityReached && time.Since(lastCapacityReachedErrorPrint) > time.Minute {
24-
logrus.Warningf("DNS cache capacity reached entries are being dropped, consider increasing config '%s'",
25-
config.DNSCacheItemsMaxCapacityKey)
26-
lastCapacityReachedErrorPrint = time.Now()
27-
}
28-
})
22+
if capacity == 0 {
23+
logrus.Panic("Capacity cannot be 0")
24+
}
25+
dnsRecordCache := NewTTLCache[string, string](capacity)
2926

3027
return &DNSCache{
3128
cache: dnsRecordCache,
3229
}
3330
}
3431

35-
func (d *DNSCache) AddOrUpdateDNSData(dnsName string, ip string, ttlSeconds int) {
36-
ttl := time.Duration(ttlSeconds) * time.Second
37-
d.cache.Set(dnsName, ip, ttl)
32+
func (d *DNSCache) AddOrUpdateDNSData(dnsName string, ip string, ttl time.Duration) {
33+
d.cache.Insert(dnsName, ip, ttl)
3834
}
3935

40-
func (d *DNSCache) GetResolvedIP(dnsName string) (string, bool) {
36+
func (d *DNSCache) GetResolvedIPs(dnsName string) []string {
4137
entry := d.cache.Get(dnsName)
42-
if entry == nil {
43-
return "", false
44-
}
45-
return entry.Value(), true
38+
return entry
39+
}
40+
41+
// CacheValue holds the value and its expiration time
42+
type CacheValue[V any] struct {
43+
Value V
44+
Expiration time.Time
4645
}

src/mapper/pkg/dnscache/dns_cache_test.go

Lines changed: 33 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -19,28 +19,29 @@ type DNSCacheTestSuite struct {
1919
}
2020

2121
// TearDownTest restores the max-capacity setting so a per-test override
// cannot leak into the tests that follow.
func (s *DNSCacheTestSuite) TearDownTest() {
	viper.Set(config.DNSCacheItemsMaxCapacityKey, config.DNSCacheItemsMaxCapacityDefault)
}
2424

2525
// TestDNSCache exercises basic insert/lookup behavior: a single resolved IP,
// accumulation of multiple IPs for one name, and misses for unknown names.
func (s *DNSCacheTestSuite) TestDNSCache() {
	cache := NewDNSCache()

	cache.AddOrUpdateDNSData("good-news.com", IP1, 60*time.Second)
	resolved := cache.GetResolvedIPs("good-news.com")
	s.Require().Len(resolved, 1)
	s.Require().Equal(IP1, resolved[0])

	// A second IP for the same name is accumulated, not overwritten.
	cache.AddOrUpdateDNSData("good-news.com", IP2, 60*time.Second)
	resolved = cache.GetResolvedIPs("good-news.com")
	s.Require().Len(resolved, 2)
	s.Require().Contains(resolved, IP1)
	s.Require().Contains(resolved, IP2)

	// A name never inserted yields an empty result.
	resolved = cache.GetResolvedIPs("bad-news.de")
	s.Require().Len(resolved, 0)

	cache.AddOrUpdateDNSData("bad-news.de", IP1, 60*time.Second)
	resolved = cache.GetResolvedIPs("bad-news.de")
	s.Require().Len(resolved, 1)
	s.Require().Equal(IP1, resolved[0])
}
4546

4647
func (s *DNSCacheTestSuite) TestCapacityConfig() {
@@ -50,32 +51,35 @@ func (s *DNSCacheTestSuite) TestCapacityConfig() {
5051
names := make([]string, 0)
5152
for i := 0; i < capacityLimit+1; i++ {
5253
dnsName := fmt.Sprintf("dns-%d.com", i)
53-
cache.AddOrUpdateDNSData(dnsName, IP1, 60)
54+
cache.AddOrUpdateDNSData(dnsName, IP1, 60*time.Second)
5455
names = append(names, dnsName)
5556
}
5657

5758
for i, dnsName := range names {
58-
_, found := cache.GetResolvedIP(dnsName)
59+
vals := cache.GetResolvedIPs(dnsName)
5960
if i == 0 {
60-
s.Require().False(found)
61+
s.Require().Len(vals, 0)
6162
} else {
62-
s.Require().True(found)
63+
s.Require().Len(vals, 1)
6364
}
6465
}
6566
}
6667

6768
// TestTTL verifies expiry: once the TTL elapses and the expiry sweep runs,
// the name no longer resolves.
func (s *DNSCacheTestSuite) TestTTL() {
	cache := NewDNSCache()

	cache.AddOrUpdateDNSData("my-future-blog.de", IP1, 1*time.Second)
	resolved := cache.GetResolvedIPs("my-future-blog.de")
	s.Require().Len(resolved, 1)
	s.Require().Equal(IP1, resolved[0])

	// This is the only place where we sleep in the test, to make sure the TTL works as expected
	time.Sleep(2 * time.Second)

	// Run the sweep directly rather than waiting a minute for the background ticker.
	cache.cache.cleanupExpired()

	resolved = cache.GetResolvedIPs("my-future-blog.de")
	s.Require().Len(resolved, 0)
}
8185

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
package dnscache
2+
3+
import (
4+
"container/list"
5+
"sync"
6+
"time"
7+
)
8+
9+
// CacheEntry represents an entry in the cache, linking the key with its list element for LRU
10+
type CacheEntry[K comparable, V comparable] struct {
11+
Key K
12+
Value CacheValue[V]
13+
}
14+
15+
// TTLCache is a generic TTL cache that stores unique items with individual TTLs and LRU eviction
16+
type TTLCache[K comparable, V comparable] struct {
17+
items map[K]map[V]*list.Element // Key to map of values, each value points to an LRU element
18+
lru *list.List // List for LRU eviction, stores CacheEntry[K, V]
19+
maxSize int // Maximum size of the cache
20+
mu sync.Mutex
21+
cleanupCh chan struct{}
22+
}
23+
24+
// NewTTLCache creates a new TTL cache with the specified maxSize
25+
func NewTTLCache[K comparable, V comparable](maxSize int) *TTLCache[K, V] {
26+
cache := &TTLCache[K, V]{
27+
items: make(map[K]map[V]*list.Element),
28+
lru: list.New(),
29+
maxSize: maxSize,
30+
cleanupCh: make(chan struct{}),
31+
}
32+
33+
// Start the cleanup process
34+
go cache.startCleanup()
35+
36+
return cache
37+
}
38+
39+
// Insert adds a unique value to the cache under the specified key with its own TTL
40+
// and manages the LRU eviction when the cache exceeds the max size.
41+
func (c *TTLCache[K, V]) Insert(key K, value V, ttl time.Duration) {
42+
c.mu.Lock()
43+
defer c.mu.Unlock()
44+
45+
// If the key doesn't exist, create an entry for it
46+
if _, exists := c.items[key]; !exists {
47+
c.items[key] = make(map[V]*list.Element)
48+
}
49+
50+
// Check if the value already exists under this key and remove it from LRU if so
51+
if elem, exists := c.items[key][value]; exists {
52+
c.lru.Remove(elem)
53+
}
54+
55+
// Insert or update the value with its expiration time and add it to the LRU list
56+
cacheEntry := CacheEntry[K, V]{Key: key, Value: CacheValue[V]{Value: value, Expiration: time.Now().Add(ttl)}}
57+
lruElem := c.lru.PushFront(cacheEntry)
58+
c.items[key][value] = lruElem
59+
60+
// Manage the cache size, evict the least recently used item if needed
61+
if c.lru.Len() > c.maxSize {
62+
c.evict()
63+
}
64+
65+
}
66+
67+
// evict removes the least recently used item from the cache
68+
func (c *TTLCache[K, V]) evict() {
69+
// Remove the least recently used item (which is at the back of the LRU list)
70+
lruElem := c.lru.Back()
71+
if lruElem == nil {
72+
return
73+
}
74+
75+
cacheEntry := lruElem.Value.(CacheEntry[K, V])
76+
key, value := cacheEntry.Key, cacheEntry.Value
77+
78+
// Remove the value from the cache
79+
if _, exists := c.items[key]; exists {
80+
delete(c.items[key], value.Value)
81+
82+
// If no more values exist under this key, remove the key itself
83+
if len(c.items[key]) == 0 {
84+
delete(c.items, key)
85+
}
86+
}
87+
88+
// Remove from the LRU list
89+
c.lru.Remove(lruElem)
90+
}
91+
92+
// Get retrieves the values for a specific key and removes any expired values
93+
// Returns a slice of valid values for the given key
94+
func (c *TTLCache[K, V]) Get(key K) []V {
95+
c.mu.Lock()
96+
defer c.mu.Unlock()
97+
98+
// Check if the key exists
99+
if _, exists := c.items[key]; !exists {
100+
return make([]V, 0)
101+
}
102+
103+
// Filter out expired values and prepare the result
104+
var result []V
105+
for value, lruElem := range c.items[key] {
106+
cacheEntry := lruElem.Value.(CacheEntry[K, V])
107+
108+
// If the value has expired, remove it
109+
if time.Now().After(c.lruValueExpiration(lruElem)) {
110+
c.lru.Remove(lruElem)
111+
delete(c.items[key], value)
112+
continue
113+
}
114+
115+
// Add valid values to the result
116+
result = append(result, cacheEntry.Value.Value)
117+
118+
// Move the accessed item to the front of the LRU list (mark as recently used)
119+
c.lru.MoveToFront(lruElem)
120+
}
121+
122+
// If all values are expired, remove the key entirely
123+
if len(c.items[key]) == 0 {
124+
delete(c.items, key)
125+
}
126+
127+
return result
128+
}
129+
130+
// cleanupExpired removes expired values from the cache
131+
func (c *TTLCache[K, V]) cleanupExpired() {
132+
for key, values := range c.items {
133+
for value, lruElem := range values {
134+
// If a value has expired, remove it
135+
if time.Now().After(c.lruValueExpiration(lruElem)) {
136+
c.lru.Remove(lruElem)
137+
delete(values, value)
138+
}
139+
}
140+
141+
// If all values are expired, remove the key entirely
142+
if len(values) == 0 {
143+
delete(c.items, key)
144+
}
145+
}
146+
}
147+
148+
// lruValueExpiration gets the expiration time for a given LRU element
149+
func (c *TTLCache[K, V]) lruValueExpiration(elem *list.Element) time.Time {
150+
cacheEntry := elem.Value.(CacheEntry[K, V])
151+
return cacheEntry.Value.Expiration
152+
}
153+
154+
// startCleanup periodically cleans up expired items
155+
func (c *TTLCache[K, V]) startCleanup() {
156+
ticker := time.NewTicker(1 * time.Minute) // Cleanup interval
157+
defer ticker.Stop()
158+
159+
for {
160+
select {
161+
case <-ticker.C:
162+
c.mu.Lock()
163+
c.cleanupExpired()
164+
c.mu.Unlock()
165+
case <-c.cleanupCh:
166+
return
167+
}
168+
}
169+
}
170+
171+
// Stop stops the cache cleanup process
172+
func (c *TTLCache[K, V]) Stop() {
173+
close(c.cleanupCh)
174+
}

0 commit comments

Comments
 (0)