#!/bin/bash

# DOCUMENTATION:
#   ./s3-replication-scanner.sh
#
# Prints the buckets that have a replication configuration and, for each of
# them, the objects whose replication status matches a filter.
#
# Environment variables:
#   - REPLICATION_STATUS_FILTER: Filter objects by status (default: "PENDING")
#       Values: "PENDING", "FAILED", "COMPLETED".
#       NOTE(review): an empty value is treated like "unset" by the ':-'
#       default below, so it selects "PENDING" rather than all objects.
#   - HOST: Metastore host endpoint (default: "localhost:9000")
#   - MAX_KEYS: Maximum keys per API request (default: 1000)
#   - BUCKET_LIST: Comma-separated bucket names to process (optional; when
#       empty, every bucket returned by the metastore is processed)
#
# Requires: curl, jq

REPLICATION_STATUS_FILTER=${REPLICATION_STATUS_FILTER:-"PENDING"}

HOST=${HOST:-"localhost:9000"}
MAX_KEYS=${MAX_KEYS:-1000}
BUCKET_LIST=${BUCKET_LIST:-""}

# Counters for summary
BUCKETS_PROCESSED=0
BUCKETS_FAILED=0
OBJECTS_FOUND=0

# Fail early with a clear message instead of a confusing exit code later on.
for required_tool in curl jq; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "ERROR: required command '$required_tool' not found in PATH" >&2
    exit 1
  fi
done

# MAX_KEYS is interpolated into request URLs, so only accept a plain integer.
if ! [[ "$MAX_KEYS" =~ ^[1-9][0-9]*$ ]]; then
  echo "ERROR: MAX_KEYS must be a positive integer (got: '$MAX_KEYS')" >&2
  exit 1
fi

echo "Filtering objects with replication status: '$REPLICATION_STATUS_FILTER'"
echo "Using host: $HOST"
echo "Max keys per request: $MAX_KEYS"
if [[ -n "$BUCKET_LIST" ]]; then
  echo "Using provided bucket list: $BUCKET_LIST"
fi
echo "=================================="

# Get list of buckets - either from parameter or fetch all
if [[ -n "$BUCKET_LIST" ]]; then
  # Convert comma-separated list to space-separated
  buckets=${BUCKET_LIST//,/ }
  echo "Using provided buckets: $buckets"
else
  # Get list of all buckets
  echo "Fetching bucket list from metastore..."

  # Run curl on its own (not in a pipeline) so that the recorded status is
  # really curl's and not the status of the last pipeline stage.
  curl_exit_code=0
  metastore_response=$(curl -f "$HOST/default/metastore" 2>/dev/null) || curl_exit_code=$?

  if (( curl_exit_code != 0 )); then
    echo "ERROR: Failed to connect to metastore at $HOST (curl exit code: $curl_exit_code)" >&2
    echo "Partial results: Unable to fetch bucket list" >&2
    exit 1
  fi

  if ! buckets=$(jq -r '.[].key' <<<"$metastore_response" 2>/dev/null); then
    echo "ERROR: Unable to parse bucket list returned by metastore at $HOST" >&2
    echo "Partial results: Unable to fetch bucket list" >&2
    exit 1
  fi

  if [[ -z "$buckets" ]]; then
    echo "WARNING: No buckets found in metastore" >&2
    exit 0
  fi
  echo "Found buckets from metastore"
fi
55+
#######################################
# Check whether a bucket has a replication configuration.
# Globals:   HOST (read)
# Arguments: $1 - bucket name, optionally prefixed with "db/"
# Outputs:   warnings on stderr when the attributes cannot be read
# Returns:   0 if the bucket attributes contain a non-null
#            .replicationConfiguration, 1 if they do not,
#            2 if the attributes could not be fetched or parsed
#######################################
has_replication_enabled() {
  local bucket=$1
  # Remove db/ prefix if present
  local clean_bucket=${bucket#db/}
  local attributes replication_config
  local curl_exit_code=0

  # Declaration and assignment are kept separate: 'local var=$(cmd)' makes $?
  # the status of 'local' (always 0), which would hide every curl failure.
  attributes=$(curl -f "${HOST}/default/attributes/${clean_bucket}" 2>/dev/null) \
    || curl_exit_code=$?

  if (( curl_exit_code != 0 )); then
    echo "WARNING: Failed to fetch attributes for bucket $clean_bucket (curl exit code: $curl_exit_code)" >&2
    return 2 # error state
  fi

  if ! replication_config=$(jq '.replicationConfiguration' <<<"$attributes" 2>/dev/null); then
    echo "WARNING: Failed to parse attributes for bucket $clean_bucket" >&2
    return 2 # error state
  fi

  if [[ "$replication_config" != "null" && -n "$replication_config" ]]; then
    return 0 # has replication config
  fi
  return 1 # no replication config
}
76+
#######################################
# List the objects of a bucket whose replication status equals a filter,
# following pagination until the listing is no longer truncated.
# Globals:   HOST (read), MAX_KEYS (read)
# Arguments: $1 - bucket name, optionally prefixed with "db/"
#            $2 - replication status to match (compared for equality with
#                 .replicationInfo.status of each entry's JSON-encoded .value)
#            $3 - optional raw (not URL-encoded) key to use as start marker
# Outputs:   stdout: one JSON object per match with the fields
#                    bucket, key, versionId, replicationStatus
#            stderr: progress and error messages
# Returns:   0 on success (including "no objects"), 1 if a page could not be
#            fetched or understood
#######################################
get_objects_with_status() {
  local bucket=$1
  local status_filter=$2
  local marker=${3:-}

  # Remove db/ prefix if present
  local clean_bucket=${bucket#db/}
  local encoded_marker=""
  local url response page_info filtered
  local object_count is_truncated next_marker
  local curl_exit_code

  # The marker ends up in a query string, so percent-encode it; keys may
  # contain spaces, '&', '#', '%' and similar characters.
  if [[ -n "$marker" ]]; then
    if ! encoded_marker=$(jq -rn --arg m "$marker" '$m | @uri' 2>/dev/null); then
      echo "ERROR: Failed to encode marker for bucket $clean_bucket" >&2
      return 1
    fi
  fi

  while :; do
    # Build URL with pagination parameters
    url="${HOST}/default/bucket/${clean_bucket}?maxKeys=${MAX_KEYS}"
    if [[ -n "$encoded_marker" ]]; then
      url="${url}&marker=${encoded_marker}"
    fi

    echo "Fetching from: $url" >&2

    # Assignment is separate from 'local' so that curl's status is not
    # replaced by the (always zero) status of the 'local' builtin.
    curl_exit_code=0
    response=$(curl -f "$url" 2>/dev/null) || curl_exit_code=$?

    if (( curl_exit_code != 0 )); then
      echo "ERROR: Failed to fetch objects from bucket $clean_bucket (curl exit code: $curl_exit_code)" >&2
      return 1
    fi

    # One jq pass yields: object count, truncation flag, and the last key
    # already percent-encoded (encoding inside jq keeps bytes such as the
    # NUL version separator intact, which a shell variable cannot hold).
    if ! page_info=$(jq -r '
      [
        ((.Contents // []) | length),
        (.IsTruncated // false),
        ((((.Contents // []) | last | .key?) // "") | tostring | @uri)
      ] | map(tostring) | join("\t")
    ' <<<"$response" 2>/dev/null); then
      echo "ERROR: Unexpected response while listing bucket $clean_bucket" >&2
      return 1
    fi
    IFS=$'\t' read -r object_count is_truncated next_marker <<<"$page_info"

    if [[ "${object_count:-0}" == "0" ]]; then
      echo "No objects found in bucket $clean_bucket" >&2
      return 0
    fi

    # Filter objects by replication status. Values are passed with --arg
    # rather than spliced into the program text, and an entry whose .value is
    # not valid JSON is skipped instead of aborting the whole page.
    filtered=$(jq -r --arg status "$status_filter" --arg bucket "$clean_bucket" '
      .Contents[]?
      | (try (.value | fromjson | .replicationInfo.status) catch null) as $repl_status
      | select($repl_status == $status)
      | (.key | split("\u0000")) as $key_parts
      | {
          bucket: $bucket,
          key: ($key_parts[0] // .key),
          versionId: ($key_parts[1] // ""),
          replicationStatus: $repl_status
        }
    ' <<<"$response" 2>/dev/null)

    # Output filtered results if any
    if [[ -n "$filtered" ]]; then
      printf '%s\n' "$filtered"
    fi

    # Stop unless the listing is truncated and there is a key to resume from
    if [[ "$is_truncated" != "true" || -z "$next_marker" ]]; then
      return 0
    fi

    # A marker that does not advance would loop forever.
    if [[ "$next_marker" == "$encoded_marker" ]]; then
      echo "ERROR: Pagination did not advance for bucket $clean_bucket (marker: $next_marker)" >&2
      return 1
    fi

    echo "More results available, next marker: $next_marker" >&2
    encoded_marker=$next_marker
  done
}
138+
# Main logic: for every bucket with a replication configuration, print the
# objects matching REPLICATION_STATUS_FILTER, then print a summary based on
# the BUCKETS_PROCESSED / BUCKETS_FAILED / OBJECTS_FOUND counters.
echo "Buckets with replication enabled:"

# Split the whitespace-separated bucket list into an array without subjecting
# the names to pathname expansion. 'read -d ""' reports non-zero at end of
# input even though the array was filled, hence the '|| true'.
bucket_names=()
read -r -d '' -a bucket_names <<<"$buckets" || true

for bucket in "${bucket_names[@]}"; do
  has_replication_enabled "$bucket"
  case $? in
    0) ;;                                          # replication configured
    1) continue ;;                                 # no replication config
    *)                                             # attributes unavailable
      BUCKETS_FAILED=$((BUCKETS_FAILED + 1))
      continue
      ;;
  esac

  # Remove db/ prefix for display
  clean_bucket=${bucket#db/}
  echo "- $clean_bucket"

  # Get objects with the specified replication status. Results collected
  # before a failure are still printed, but the bucket is counted as failed.
  scan_failed=0
  filtered_objects=$(get_objects_with_status "$bucket" "$REPLICATION_STATUS_FILTER") \
    || scan_failed=1

  if [[ -n "$filtered_objects" ]]; then
    echo "Objects with replication status '$REPLICATION_STATUS_FILTER':"
    jq -r '
      "Bucket: " + .bucket + " | Key: " + .key + " | VersionId: " + .versionId + " | Status: " + .replicationStatus
    ' <<<"$filtered_objects"

    # filtered_objects is a stream of JSON objects; slurp it to count them.
    object_count=$(jq -s 'length' <<<"$filtered_objects" 2>/dev/null)
    OBJECTS_FOUND=$((OBJECTS_FOUND + ${object_count:-0}))
  else
    echo "No objects with replication status '$REPLICATION_STATUS_FILTER'"
  fi

  if (( scan_failed )); then
    echo "WARNING: Listing of bucket $clean_bucket was incomplete" >&2
    BUCKETS_FAILED=$((BUCKETS_FAILED + 1))
  else
    BUCKETS_PROCESSED=$((BUCKETS_PROCESSED + 1))
  fi
  echo ""
done

echo "=================================="
echo "Summary:"
echo "Buckets processed: $BUCKETS_PROCESSED"
echo "Buckets failed: $BUCKETS_FAILED"
echo "Objects found with replication status '$REPLICATION_STATUS_FILTER': $OBJECTS_FOUND"