Skip to content

Commit 710ea4b

Browse files
authored
Improve the way we resolve services DNS to identities by going directly from the targetRef of endpoints instead of going through IP addresses (#229)
1 parent d63474e commit 710ea4b

File tree

4 files changed

+57
-43
lines changed

4 files changed

+57
-43
lines changed

.github/workflows/lint.yml

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@ permissions:
1313
# pull-requests: read
1414

1515
jobs:
16-
golangci:
17-
name: golangci-lint
18-
runs-on: ubuntu-20.04
16+
vet:
17+
# run vet in a separate job to avoid conflicts with golangci-lint pkg-cache
18+
name: vet
19+
runs-on: ubuntu-latest
1920
steps:
2021
- uses: actions/setup-go@v3
2122
with:
@@ -31,6 +32,16 @@ jobs:
3132
working-directory: src/
3233
- name: check git diff
3334
run: git diff --exit-code
35+
golangci:
36+
name: golangci-lint
37+
runs-on: ubuntu-20.04
38+
steps:
39+
- uses: actions/setup-go@v3
40+
with:
41+
go-version: '1.21.3'
42+
- uses: actions/checkout@v3
43+
- name: Install dependencies
44+
run: sudo apt update && sudo apt install libpcap-dev # required for the linter to be able to lint github.com/google/gopacket
3445
- name: golangci-lint
3546
uses: golangci/golangci-lint-action@v3
3647
with:
@@ -41,7 +52,7 @@ jobs:
4152
working-directory: src
4253

4354
# Optional: golangci-lint command line arguments.
44-
args: --timeout 5m
55+
args: --timeout 5m --out-format github-actions
4556

4657
# Optional: show only new issues if it's a pull request. The default value is `false`.
4758
# only-new-issues: true

src/mapper/pkg/kubefinder/kubefinder.go

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ func (k *KubeFinder) ResolveIstioWorkloadToPod(ctx context.Context, workload str
346346
return &podList.Items[0], nil
347347
}
348348

349-
func (k *KubeFinder) ResolveServiceAddressToIps(ctx context.Context, fqdn string) ([]string, types.NamespacedName, error) {
349+
func (k *KubeFinder) ResolveServiceAddressToPods(ctx context.Context, fqdn string) ([]corev1.Pod, types.NamespacedName, error) {
350350
clusterDomain := viper.GetString(config.ClusterDomainKey)
351351
if !strings.HasSuffix(fqdn, clusterDomain) {
352352
return nil, types.NamespacedName{}, errors.Errorf("address %s is not in the cluster", fqdn)
@@ -366,22 +366,28 @@ func (k *KubeFinder) ResolveServiceAddressToIps(ctx context.Context, fqdn string
366366
}
367367
namespace := fqdnWithoutClusterDomainParts[len(fqdnWithoutClusterDomainParts)-2]
368368
serviceName := fqdnWithoutClusterDomainParts[len(fqdnWithoutClusterDomainParts)-3]
369-
endpoints := &corev1.Endpoints{}
369+
service := &corev1.Service{}
370370
serviceNamespacedName := types.NamespacedName{Name: serviceName, Namespace: namespace}
371-
err := k.client.Get(ctx, serviceNamespacedName, endpoints)
371+
err := k.client.Get(ctx, serviceNamespacedName, service)
372372
if err != nil {
373373
return nil, types.NamespacedName{}, errors.Wrap(err)
374374
}
375-
ips := make([]string, 0)
376-
for _, subset := range endpoints.Subsets {
377-
for _, address := range subset.Addresses {
378-
ips = append(ips, address.IP)
379-
}
375+
pods, err := k.ResolveServiceToPods(ctx, service)
376+
if err != nil {
377+
return nil, types.NamespacedName{}, errors.Wrap(err)
380378
}
381-
return ips, serviceNamespacedName, nil
379+
380+
return pods, serviceNamespacedName, nil
382381
case "pod":
383382
// for address format of pods: 172-17-0-3.default.pod.cluster.local
384-
return []string{strings.ReplaceAll(fqdnWithoutClusterDomainParts[0], "-", ".")}, types.NamespacedName{}, nil
383+
ip := strings.ReplaceAll(fqdnWithoutClusterDomainParts[0], "-", ".")
384+
pod, err := k.ResolveIPToPod(ctx, ip)
385+
if err != nil {
386+
return make([]corev1.Pod, 0), types.NamespacedName{}, errors.Wrap(err)
387+
}
388+
389+
return []corev1.Pod{*pod}, types.NamespacedName{}, nil
390+
385391
default:
386392
return nil, types.NamespacedName{}, errors.Errorf("cannot resolve k8s address %s, type %s not supported", fqdn, fqdnWithoutClusterDomainParts[len(fqdnWithoutClusterDomainParts)-1])
387393
}

src/mapper/pkg/kubefinder/kubefinder_test.go

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"fmt"
66
"github.com/otterize/network-mapper/src/shared/testbase"
7+
"github.com/samber/lo"
78
"github.com/stretchr/testify/suite"
89
corev1 "k8s.io/api/core/v1"
910
"testing"
@@ -38,35 +39,40 @@ func (s *KubeFinderTestSuite) TestResolveIpToPod() {
3839
}
3940

4041
func (s *KubeFinderTestSuite) TestResolveServiceAddressToIps() {
41-
_, _, err := s.kubeFinder.ResolveServiceAddressToIps(context.Background(), "www.google.com")
42+
_, _, err := s.kubeFinder.ResolveServiceAddressToPods(context.Background(), "www.google.com")
4243
s.Require().Error(err)
4344

44-
_, _, err = s.kubeFinder.ResolveServiceAddressToIps(context.Background(), fmt.Sprintf("svc-service1.%s.svc.cluster.local", s.TestNamespace))
45+
_, _, err = s.kubeFinder.ResolveServiceAddressToPods(context.Background(), fmt.Sprintf("svc-service1.%s.svc.cluster.local", s.TestNamespace))
4546
s.Require().Error(err)
4647

4748
podIp0 := "1.1.1.1"
4849
podIp1 := "1.1.1.2"
4950
podIp2 := "1.1.1.3"
5051
s.Require().NoError(s.Mgr.GetClient().List(context.Background(), &corev1.EndpointsList{})) // Workaround: make then client start caching Endpoints, so when we do "WaitForCacheSync" it will actually sync cache"
5152
s.AddDeploymentWithService("service0", []string{podIp0}, map[string]string{"app": "service0"}, "10.0.0.10")
52-
s.AddDeploymentWithService("service1", []string{podIp1, podIp2}, map[string]string{"app": "service1"}, "10.0.0.11")
53+
_, _, retPods := s.AddDeploymentWithService("service1", []string{podIp1, podIp2}, map[string]string{"app": "service1"}, "10.0.0.11")
5354
s.Require().True(s.Mgr.GetCache().WaitForCacheSync(context.Background()))
5455

55-
ips, service, err := s.kubeFinder.ResolveServiceAddressToIps(context.Background(), fmt.Sprintf("svc-service1.%s.svc.cluster.local", s.TestNamespace))
56+
pods, service, err := s.kubeFinder.ResolveServiceAddressToPods(context.Background(), fmt.Sprintf("svc-service1.%s.svc.cluster.local", s.TestNamespace))
5657
s.Require().NoError(err)
5758
s.Require().Equal("svc-service1", service.Name)
58-
s.Require().ElementsMatch(ips, []string{podIp1, podIp2})
59+
s.Require().ElementsMatch(lo.Map(pods, func(p corev1.Pod, _ int) string { return p.Status.PodIP }), lo.Map(retPods, func(p *corev1.Pod, _ int) string { return p.Status.PodIP }))
5960

6061
// make sure we don't fail on the longer forms of k8s service addresses, listed on this page: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service
61-
ips, service, err = s.kubeFinder.ResolveServiceAddressToIps(context.Background(), fmt.Sprintf("4-4-4-4.svc-service1.%s.svc.cluster.local", s.TestNamespace))
62+
pods, service, err = s.kubeFinder.ResolveServiceAddressToPods(context.Background(), fmt.Sprintf("4-4-4-4.svc-service1.%s.svc.cluster.local", s.TestNamespace))
6263
s.Require().Equal("svc-service1", service.Name)
6364
s.Require().NoError(err)
64-
s.Require().ElementsMatch(ips, []string{podIp1, podIp2})
65+
s.Require().ElementsMatch(lo.Map(pods, func(p corev1.Pod, _ int) string { return p.Status.PodIP }), lo.Map(retPods, func(p *corev1.Pod, _ int) string { return p.Status.PodIP }))
6566

66-
ips, service, err = s.kubeFinder.ResolveServiceAddressToIps(context.Background(), fmt.Sprintf("4-4-4-4.%s.pod.cluster.local", s.TestNamespace))
67+
_, _, err = s.kubeFinder.ResolveServiceAddressToPods(context.Background(), fmt.Sprintf("4-4-4-4.%s.pod.cluster.local", s.TestNamespace))
68+
s.Require().Error(err)
69+
s.Require().True(s.Mgr.GetCache().WaitForCacheSync(context.Background()))
70+
71+
_, pods4444 := s.AddDeployment("depl", []string{"4.4.4.4"}, map[string]string{"app": "4444"})
72+
pods, service, err = s.kubeFinder.ResolveServiceAddressToPods(context.Background(), fmt.Sprintf("4-4-4-4.%s.pod.cluster.local", s.TestNamespace))
6773
s.Require().NoError(err)
6874
s.Require().Empty(service)
69-
s.Require().ElementsMatch(ips, []string{"4.4.4.4"})
75+
s.Require().ElementsMatch(lo.Map(pods, func(p corev1.Pod, _ int) string { return p.Status.PodIP }), lo.Map(pods4444, func(p *corev1.Pod, _ int) string { return p.Status.PodIP }))
7076
}
7177

7278
func TestKubeFinderTestSuite(t *testing.T) {

src/mapper/pkg/resolvers/schema.helpers.resolvers.go

Lines changed: 11 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ func (r *Resolver) handleDNSCaptureResultsAsKubernetesPods(ctx context.Context,
268268

269269
func (r *Resolver) resolveOtterizeIdentityForDestinationAddress(ctx context.Context, dest model.Destination) (*model.OtterizeServiceIdentity, bool, error) {
270270
destAddress := dest.Destination
271-
ips, serviceName, err := r.kubeFinder.ResolveServiceAddressToIps(ctx, destAddress)
271+
pods, serviceName, err := r.kubeFinder.ResolveServiceAddressToPods(ctx, destAddress)
272272
if err != nil {
273273
logrus.WithError(err).Warningf("Could not resolve service address %s", destAddress)
274274
// Intentionally no error return
@@ -282,30 +282,21 @@ func (r *Resolver) resolveOtterizeIdentityForDestinationAddress(ctx context.Cont
282282
}, true, nil
283283
}
284284

285-
if len(ips) == 0 {
286-
logrus.Debugf("Service address %s is currently not backed by any pod, ignoring", destAddress)
287-
return nil, false, nil
288-
}
289-
// Resolving the IP of the service's endpoints!
290-
destPod, err := r.kubeFinder.ResolveIPToPod(ctx, ips[0])
291-
if err != nil {
292-
if errors.Is(err, kubefinder.ErrFoundMoreThanOnePod) {
293-
logrus.WithError(err).Debugf("Ip %s belongs to more than one pod, ignoring", ips[0])
294-
} else {
295-
logrus.WithError(err).Debugf("Could not resolve %s to pod", ips[0])
285+
filteredPods := lo.Filter(pods, func(pod corev1.Pod, _ int) bool {
286+
lastCreationTimeForUsToTrustIt := dest.LastSeen
287+
if lo.IsEmpty(serviceName) {
288+
// In this case the DNS was a "pod" DNS - which contains IP - and therefore less reliable.
289+
lastCreationTimeForUsToTrustIt = lastCreationTimeForUsToTrustIt.Add(viper.GetDuration(config.TimeServerHasToLiveBeforeWeTrustItKey))
296290
}
297-
return nil, false, nil
298-
}
291+
return lastCreationTimeForUsToTrustIt.After(pod.CreationTimestamp.Time) && pod.DeletionTimestamp == nil
292+
})
299293

300-
if destPod.CreationTimestamp.After(dest.LastSeen) {
301-
logrus.Debugf("Pod %s was created after capture time %s, ignoring", destPod.Name, dest.LastSeen)
294+
if len(filteredPods) == 0 {
295+
logrus.Debugf("Service address %s is currently not backed by any valid pod, ignoring", destAddress)
302296
return nil, false, nil
303297
}
304298

305-
if destPod.DeletionTimestamp != nil {
306-
logrus.Debugf("Pod %s is being deleted, ignoring", destPod.Name)
307-
return nil, false, nil
308-
}
299+
destPod := &filteredPods[0]
309300

310301
dstService, err := r.serviceIdResolver.ResolvePodToServiceIdentity(ctx, destPod)
311302
if err != nil {

0 commit comments

Comments
 (0)