70 changes: 61 additions & 9 deletions .circleci/config.yml
@@ -1,19 +1,38 @@
version: 2
version: 2.1

jobs:

build:
environment:
_JAVA_OPTIONS: "-Xms512m -Xmx1024m"
_JAVA_OPTIONS: "-Xms512m -Xmx1g"
TESTCONTAINERS_RYUK_DISABLED: "true"
working_directory: ~/workspace
docker:
- image: circleci/openjdk:8-jdk
- image: cimg/openjdk:17.0
steps:
- checkout
- setup_remote_docker:
docker_layer_caching: true
- restore_cache:
key: dependency-cache-{{ checksum "build.gradle" }}
- run:
command: ./gradlew -PmaxParallelForks=1 clean build
command: ./gradlew --no-daemon clean javadoc
- run:
# run static analysis tasks standalone to avoid OOME in CircleCI
command: ./gradlew --max-workers=1 --no-daemon analyze
- run:
no_output_timeout: 15m
command: |
export NETWORK_NAME="test_containers_network"

# Create a shared Docker network, required for communication between the CircleCI job container
# and TestContainers services.
docker network create $NETWORK_NAME

# Extract Docker host for helping CircleCI find TestContainer services.
export CONTAINER_HOST=$(docker network inspect "$NETWORK_NAME" --format='{{(index .IPAM.Config 0).Gateway}}')

./gradlew --no-daemon -PmaxParallelForks=1 build
- save_cache:
key: dependency-cache-{{ checksum "build.gradle" }}
paths:
@@ -29,30 +48,63 @@ jobs:
- store_artifacts:
path: build/libs

integration-test:
environment:
_JAVA_OPTIONS: "-Xms512m -Xmx1g"
working_directory: ~/workspace
docker:
- image: cimg/openjdk:17.0
steps:
- checkout
- restore_cache:
key: dependency-cache-{{ checksum "build.gradle" }}
- run:
command: ./gradlew --no-daemon -PmaxParallelForks=1 clean integrationTest
- save_cache:
key: dependency-cache-{{ checksum "build.gradle" }}
paths:
- ~/.gradle
- run:
command: mkdir ~/test-results
- run:
command: find ~/workspace -type f -regex ".*/test-results/.*xml" -exec ln {} ~/test-results/ \;
- store_test_results:
path: ~/test-results
- store_artifacts:
path: ~/test-results

publish:
working_directory: ~/workspace
docker:
- image: circleci/openjdk:8-jdk
- image: cimg/openjdk:17.0
steps:
- checkout
- restore_cache:
key: dependency-cache-{{ checksum "build.gradle" }}
- run:
command: ./gradlew distributeBuild
command: ./gradlew :artifactoryPublish :cruise-control:artifactoryPublish :cruise-control-core:artifactoryPublish :cruise-control-metrics-reporter:artifactoryPublish

workflows:
version: 2
version: 2.1
build-and-publish:
jobs:
- build:
filters:
tags:
only: /.*/
- publish:
- integration-test:
requires:
- build
filters:
branches:
ignore: /.*/
tags:
only: /^[0-9]+\.[0-9]+\.[0-9]+$/
only: /^[0-9]+\.[0-9]+\.[0-9]+(?:-[a-zA-Z0-9_]+)?$/
- publish:
requires:
- integration-test
filters:
branches:
ignore: /.*/
tags:
only: /^[0-9]+\.[0-9]+\.[0-9]+(?:-[a-zA-Z0-9_]+)?$/
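
The build job above creates a shared Docker network and reads its gateway address so the CircleCI job container can reach services started by TestContainers. Below is a minimal sketch of the same pattern outside CI; the network name is arbitrary, and `TESTCONTAINERS_HOST_OVERRIDE` is an assumption about how the gateway might be consumed, not something taken from this PR.

```sh
# Hedged sketch: create a throwaway network and read its gateway, mirroring the build step above.
docker network create test_containers_network
CONTAINER_HOST=$(docker network inspect test_containers_network --format='{{(index .IPAM.Config 0).Gateway}}')
echo "Gateway reachable from containers on this network: ${CONTAINER_HOST}"
# One possible way a test run could consume it (assumption, not part of this PR):
# export TESTCONTAINERS_HOST_OVERRIDE="${CONTAINER_HOST}"
docker network rm test_containers_network
```

Note that the widened tag filter `/^[0-9]+\.[0-9]+\.[0-9]+(?:-[a-zA-Z0-9_]+)?$/` also admits suffixed release tags such as `2.5.142-rc1` in addition to plain `x.y.z` tags.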
25 changes: 25 additions & 0 deletions .github/workflows/artifactory.yaml
@@ -0,0 +1,25 @@
name: Artifactory

on:
workflow_dispatch: # manual trigger
#release:
# types: [published]

jobs:
publish:
# if: startsWith(github.event.ref, 'release/')
name: publish
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # so gradle doesn't fail traversing the history
- uses: actions/setup-java@v4
with:
java-version: 17
distribution: microsoft
cache: gradle
- uses: gradle/actions/setup-gradle@v4 # v4.0.0
- name: publish
run: |
./gradlew :artifactoryPublish :cruise-control:artifactoryPublish :cruise-control-core:artifactoryPublish :cruise-control-metrics-reporter:artifactoryPublish
81 changes: 81 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,81 @@
name: CI

on:
push:
branches: ['main']
pull_request:
types: [ opened, synchronize, reopened ]

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
test:
name: "test with JDK=${{matrix.java-dist}}:${{matrix.java-ver}}"
runs-on: [ubuntu-latest]
strategy:
fail-fast: false
matrix:
java-ver: [17]
java-dist: ['microsoft', 'temurin']
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # so gradle doesn't fail traversing the history
- uses: actions/setup-java@v4
with:
java-version: ${{ matrix.java-ver }}
distribution: ${{ matrix.java-dist }}
cache: gradle
# see: https://github.com/gradle/actions/blob/main/setup-gradle/README.md
- uses: gradle/actions/setup-gradle@v4 # v4.0.0
- name: gradle build
run: ./gradlew --no-daemon -PmaxParallelForks=1 build

integration-test:
name: "integration-test with JDK=${{matrix.java-dist}}:${{matrix.java-ver}}"
runs-on: [ubuntu-latest]
strategy:
fail-fast: false
matrix:
java-ver: [17]
java-dist: ['microsoft', 'temurin']
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # so gradle doesn't fail traversing the history
- uses: actions/setup-java@v4
with:
java-version: ${{ matrix.java-ver }}
distribution: ${{ matrix.java-dist }}
cache: gradle
# see: https://github.com/gradle/actions/blob/main/setup-gradle/README.md
- uses: gradle/actions/setup-gradle@v4 # v4.0.0
- name: gradle integration test
run: ./gradlew --no-daemon -PmaxParallelForks=1 clean integrationTest

build-platform:
name: platform build with JDK=${{matrix.java-dist}}:${{matrix.java-ver}} on ${{matrix.hw_platform}}
strategy:
fail-fast: false
matrix:
java-ver: [17]
java-dist: ['temurin']
hw_platform: ['s390x', 'ppc64le']
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # so gradle doesn't fail traversing the history
- continue-on-error: true
run: |
# install required qemu libraries
docker run --rm --privileged tonistiigi/binfmt:latest --install all
# run docker container with qemu emulation
docker run --rm \
--platform ${{ matrix.hw_platform }} \
--name qemu-cross-${{ matrix.hw_platform }} \
--mount type=bind,source=${PWD},target=/workspace \
--workdir /workspace \
${{matrix.hw_platform}}/eclipse-temurin:${{matrix.java-ver}}-jdk /bin/sh -c "uname -a; ./gradlew --no-daemon -PmaxParallelForks=1 build"
11 changes: 10 additions & 1 deletion .gitignore
@@ -13,4 +13,13 @@ out/
.reviewboardrc
logs
*~

target/
access.log
*.egg
/bin/
.vertx
**openapi.json
fileStore/failedBrokers.txt
cruise-control/src/main/resources/webroot/.openapi-generator-ignore
cruise-control/src/main/resources/webroot/.openapi-generator/
cruise-control/src/main/resources/webroot/README.md
19 changes: 14 additions & 5 deletions CONTRIBUTING.md
@@ -14,11 +14,20 @@ Please do not file reports on Github for security issues.
Please review the guidelines at
https://www.linkedin.com/help/linkedin/answer/62924/security-vulnerabilities?lang=en

Tips for Getting Your Pull Request Accepted
Tips for Getting Your Pull Request (PR) Accepted
===========================================

1. Make sure all new features are tested and the tests pass.
1. Make sure all new features are tested and the tests pass -- i.e. a submitted PR should already pass both the
existing and the new unit tests.
2. Bug fixes must include a test case demonstrating the error they fix.
3. Open an issue first and seek advice for your change before submitting
a pull request. Large features which have never been discussed are
unlikely to be accepted.
3. Open an issue first and seek advice for your change before submitting a PR. Large features which have never been
discussed are unlikely to be accepted.
4. New contributors should create a CircleCI account before raising a PR.
5. Do not create a PR with "work-in-progress" (WIP) changes.
6. Use clear and concise titles for submitted PRs and issues.
7. Each PR should be linked to an existing issue corresponding to the PR
(see [PR template](./docs/pull_request_template.md)).
8. If there is no existing issue corresponding to your PR, create one before submitting the PR.
9. We strongly encourage the use of the recommended code style for the project
(see [code-style.xml](./docs/code-style.xml)).
10. A pre-commit CheckStyle hook can be run by adding `./checkstyle/checkstyle-pre-commit` to your `.git/hooks/pre-commit` script.
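
A minimal sketch of wiring up the hook from item 10, assuming the commands are run from the repository root and no `pre-commit` hook exists yet:

```sh
# Hedged sketch: install the CheckStyle pre-commit hook (run from the repository root).
printf '%s\n' '#!/bin/sh' './checkstyle/checkstyle-pre-commit' > .git/hooks/pre-commit
chmod +x .git/hooks/pre-commit
```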
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
BSD 2-CLAUSE LICENSE

Copyright 2017, 2018, 2019 LinkedIn Corporation.
Copyright 2017-2024 LinkedIn Corporation.
All Rights Reserved.

Redistribution and use in source and binary forms, with or without
63 changes: 46 additions & 17 deletions README.md
@@ -1,14 +1,16 @@
Cruise Control for Apache Kafka
===================

[![CI](https://github.com/linkedin/cruise-control/actions/workflows/ci.yaml/badge.svg)](https://github.com/linkedin/cruise-control/actions/workflows/ci.yaml)
[![CircleCI](https://circleci.com/gh/linkedin/cruise-control.svg?style=svg)](https://circleci.com/gh/linkedin/cruise-control)


### Introduction ###
Cruise Control is a product that helps run Apache Kafka clusters at large scale. Due to the popularity of
Apache Kafka, many companies have bigger and bigger Kafka clusters. At LinkedIn, we have 2.6K+ Kafka brokers,
Apache Kafka, many companies have increasingly large Kafka clusters with hundreds of brokers. At LinkedIn, we have 10K+ Kafka brokers,
which means broker deaths are an almost daily occurrence and balancing the workload of Kafka also becomes a big overhead.

Kafka Cruise Control is designed to address this operation scalability issue.
Kafka Cruise Control is designed to address this operational scalability issue.

### Features ###
Kafka Cruise Control provides the following features out of the box:
@@ -34,21 +36,35 @@ Cruise Control for Apache Kafka
* Broker failure detection
* Metric anomaly detection
* Disk failure detection (not available in `kafka_0_11_and_1_0` branch)
* Slow broker detection (not available in `kafka_0_11_and_1_0` branch)

* Admin operations, including:
* Add brokers
* Decommission brokers
* Remove brokers
* Demote brokers
* Rebalance the cluster
* Fix offline replicas (not available in `kafka_0_11_and_1_0` branch)
* Perform preferred leader election (PLE)
* Fix offline replicas

### Environment Requirements
* The current `master` branch of Cruise Control is compatible with Apache Kafka `1.1`, `2.0`, `2.1`, `2.2`, and `2.3` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.0.*`)
* The `kafka_0_11_and_1_0` branch of Cruise Control is compatible with Apache Kafka `0.11.0.0` and `1.0`(i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `0.1.*`)
* `message.format.version` `0.10.0` and above is needed
* The `master` and `kafka_0_11_and_1_0` branch compile with `Scala 2.11`
* Adjust replication factor

### Environment Requirements ###
* The `main` (previously `migrate_to_kafka_2_5`) branch of Cruise Control is compatible with Apache Kafka `2.5+` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.5.*`),
`2.6` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.5.11+`), `2.7` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.5.36+`),
`2.8` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.5.66+`), `3.0` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.5.85+`),
`3.1` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.5.85+`), `3.8` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.5.142+`),
`3.9` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.5.143+`), and `4.0` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.5.144+`)
* The `migrate_to_kafka_2_4` branch of Cruise Control is compatible with Apache Kafka `2.4` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.4.*`).
* The `kafka_2_0_to_2_3` branch (deprecated) of Cruise Control is compatible with Apache Kafka `2.0`, `2.1`, `2.2`, and `2.3` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `2.0.*`).
* The `kafka_0_11_and_1_0` branch (deprecated) of Cruise Control is compatible with Apache Kafka `0.11.0.0`, `1.0`, and `1.1` (i.e. [Releases](https://github.com/linkedin/cruise-control/releases) with `0.1.*`).
* `message.format.version` `0.10.0` and above is needed.
* The `kafka_2_0_to_2_3` and `kafka_0_11_and_1_0` branches compile with `Scala 2.11`.
* The branch `migrate_to_kafka_2_4` compiles with `Scala 2.12`.
* The branch `migrate_to_kafka_2_5` compiles with `Scala 2.13`.
* This project requires Java 17.

#### Known Compatibility Issues ####
* Support for Apache Kafka `2.0`, `2.1`, `2.2`, and `2.3` requires [KAFKA-8875](https://issues.apache.org/jira/browse/KAFKA-8875) hotfix.

### Quick Start ###
0. Get Cruise Control
@@ -62,10 +78,10 @@ Cruise Control for Apache Kafka
&& git tag -a 0.1.10 -m "Init local version."`
1. This step is required if `CruiseControlMetricsReporter` is used for metrics collection (i.e. the default for Cruise
Control). The metrics reporter periodically samples the Kafka raw metrics on the broker and sends them to a Kafka topic.
* `./gradlew jar`
* `./gradlew jar` (Note: This project requires Java 17)
* Copy `./cruise-control-metrics-reporter/build/libs/cruise-control-metrics-reporter-A.B.C.jar` (where `A.B.C` is
the version of Cruise Control) to your Kafka server dependency jar folder. For Apache Kafka, the folder would
be `core/build/dependant-libs-SCALA_VERSION/`
be `core/build/dependant-libs-SCALA_VERSION/` (for a Kafka source checkout) or `libs/` (for a Kafka release download).
* Modify Kafka server configuration to set `metric.reporters` to
`com.linkedin.kafka.cruisecontrol.metricsreporter.CruiseControlMetricsReporter`. For Apache Kafka, server
properties are located at `./config/server.properties`.
@@ -76,9 +92,13 @@ Control). The metrics reporter periodically samples the Kafka raw metrics on the
* If the default broker cleanup policy is `compact`, make sure that the topic to which Cruise Control metrics
reporter should send messages is created with the `delete` cleanup policy -- the default metrics reporter topic
is `__CruiseControlMetrics`.
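
A minimal sketch of the broker-side change described in step 1, as it might appear in `./config/server.properties`; only the `metric.reporters` class name comes from this README, and any reporters you already use would be listed comma-separated alongside it:

```properties
# Hedged sketch of the step-1 edit to the Kafka broker's server.properties.
metric.reporters=com.linkedin.kafka.cruisecontrol.metricsreporter.CruiseControlMetricsReporter
```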
2. Start ZooKeeper and Kafka server ([See tutorial](https://kafka.apache.org/quickstart)).
2. Start the Kafka server ([See tutorial](https://kafka.apache.org/quickstart)) and, if you're using a ZooKeeper-based Kafka cluster, also start a ZooKeeper server.
3. Modify `config/cruisecontrol.properties` of Cruise Control:
* (Required) fill in `bootstrap.servers` and `zookeeper.connect` to the Kafka cluster to be monitored.
* (Required) fill in `bootstrap.servers` to the Kafka cluster to be monitored.
* (Required) update `capacity.config.file` to the path of your capacity file.
* Capacity file is a JSON file that provides the capacity of the brokers
* You can start Cruise Control server with the default file (`config/capacityJBOD.json`), but it may not reflect the actual capacity of the brokers
* See [BrokerCapacityConfigurationFileResolver configurations](https://github.com/linkedin/cruise-control/wiki/Configurations#brokercapacityconfigurationfileresolver-configurations) for more information and examples
* (Optional) set `metric.sampler.class` to your implementation (the default sampler class is `CruiseControlMetricsReporterSampler`)
* (Optional) set `sample.store.class` to your implementation if you have one (the default `SampleStore` is `KafkaSampleStore`)
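
A minimal sketch of the two required step-3 edits to `config/cruisecontrol.properties`; the broker address is a placeholder, and `config/capacityJBOD.json` is the default capacity file shipped with Cruise Control:

```properties
# Hedged sketch: required Cruise Control settings from step 3 (placeholder broker address).
bootstrap.servers=localhost:9092
capacity.config.file=config/capacityJBOD.json
```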
4. Run the following command
@@ -146,6 +166,9 @@ The default Sample Store implementation produces metric samples back to Kafka.
The goals in Cruise Control are pluggable with different priorities. The default goals in order of decreasing priority are:
* **RackAwareGoal** - Ensures that all replicas of each partition are assigned in a rack aware manner -- i.e. no more than one replica of
each partition resides in the same rack.
* **RackAwareDistributionGoal** - A relaxed version of `RackAwareGoal`. Unlike `RackAwareGoal`, this goal allows multiple replicas of a partition
to be placed in the same rack, as long as the replicas of each partition can still achieve a perfectly even distribution across the racks.
* **MinTopicLeadersPerBrokerGoal** - Ensures that each alive broker has at least a certain number of leader replicas of each topic in a configured set of topics.
* **ReplicaCapacityGoal** - Ensures that the maximum number of replicas per broker is under the specified maximum limit.
* **DiskCapacityGoal** - Ensures that Disk space usage of each broker is below a given threshold.
* **NetworkInboundCapacityGoal** - Ensures that inbound network utilization of each broker is below a given threshold.
@@ -172,8 +195,14 @@ The anomaly notifier allows users to be notified when an anomaly is detected. An
* Goal violation
* Metric anomaly
* Disk failure (not available in `kafka_0_11_and_1_0` branch)
* Slow brokers (not available in `kafka_0_11_and_1_0` branch)
* Topic replication factor anomaly (not available in `kafka_0_11_and_1_0` branch)
* Topic partition size anomaly (not available in `kafka_0_11_and_1_0` branch)
* Maintenance Events (not available in `kafka_0_11_and_1_0` branch)

In addition to anomaly notifications users can specify actions to be taken in response to an anomaly. The following actions are supported:
* **fix** - fix the problem right away
* **check** - check the situation again after a given delay
* **ignore** - ignore the anomaly
In addition to anomaly notifications, users can enable actions to be taken in response to an anomaly by turning self-healing
on for the relevant anomaly detectors. Multiple anomaly detectors work in harmony using distinct mitigation mechanisms.
Their actions broadly fall into the following categories:
* **fix** - fix the problem right away (e.g. start a rebalance, fix offline replicas)
* **check** - check the situation again after a configurable delay (e.g. adopt a grace period before fixing broker failures)
* **ignore** - ignore the anomaly (e.g. self-healing is disabled)
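
As a rough illustration of "turning self-healing on", the sketch below shows the kind of property involved; the property name is an assumption here, so consult the Cruise Control configuration wiki for the authoritative list:

```properties
# Hedged sketch: enable self-healing for all anomaly detectors (property name assumed; verify against the wiki).
self.healing.enabled=true
```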