diff --git a/azurelinuxagent/ga/env.py b/azurelinuxagent/ga/env.py index 6ffca23bb..424221562 100644 --- a/azurelinuxagent/ga/env.py +++ b/azurelinuxagent/ga/env.py @@ -110,21 +110,29 @@ class FirewallState(object): class EnableFirewall(PeriodicOperation): - _REPORTING_PERIOD = datetime.timedelta(hours=24) # we set a limit on the number of messages logged within this period + _REPORTING_PERIOD = datetime.timedelta(hours=24) # We set limits on the number of reports for this period. Limits are reset after the period elapses. def __init__(self, wire_server_address): super(EnableFirewall, self).__init__(conf.get_enable_firewall_period()) self._wire_server_address = wire_server_address self._firewall_manager = None # initialized on demand in the _operation method self._firewall_state = FirewallState.OK # Initialized to OK to prevent turning on verbose mode on the initial invocation of _operation(). It is properly initialized as soon as we do the first check of the firewall. - self._report_count = 0 - self._next_report_time = datetime.datetime.now(UTC) + # + # We use the below members to limit the number of reports we emit during a reporting period (a report consist of all the telemetry events/logging emitted during 1 execution of the _operation() method). + # When the firewall state is OK we emit 1 single report per period, otherwise we emit up to 3 reports per period. The self._report_count member is used to set this limit. + # However, if the state of the firewall changes after the limit has been exceeded, we want to know immediately, not until the full reporting period has elapsed. So, when the state changes, we reset self._report_count. + # Now, if the state of the firewall changes multiple times during the same reporting period, the above strategy can produce to many reports, so we set an absolute limit per period. The self_period_report_count member + # is used to set that limit and is reset each reporting period. + # + self._reporting_period_end = datetime.datetime.now(UTC) + EnableFirewall._REPORTING_PERIOD + self._report_count = 0 # we can reset this count more than once per period depending on the state of the firewall + self._period_report_count = 0 # this count is reset only once per period self._should_report = True def _operation(self): try: # - # Each check of the firewall can produce a report, but we limit the number of reports we emit within a fix period of time. + # We check the reporting limits and set self._should_report at the beginning of the method. Note that self._report_count and self._should_report can be reset to their initial values if the state of the firewall changes. # self._update_reporting_state() @@ -140,49 +148,52 @@ def _operation(self): try: if self._firewall_manager.check(): self._update_firewall_state(FirewallState.OK) - self._report(event.info, WALAEventOperation.Firewall, "The firewall is configured correctly. Current state:\n{0}", self._firewall_manager.get_state()) + self._emit_event(event.info, WALAEventOperation.Firewall, "The firewall is configured correctly. Current state:\n{0}", self._firewall_manager.get_state()) return self._update_firewall_state(FirewallState.NotSet) - self._report(event.warn, WALAEventOperation.Firewall, "The firewall has not been setup. Will set it up.") + self._emit_event(event.warn, WALAEventOperation.Firewall, "The firewall has not been setup. Will set it up.") except IptablesInconsistencyError as e: self._update_firewall_state(FirewallState.Inconsistent) - self._report(event.warn, WALAEventOperation.FirewallInconsistency, "The results returned by iptables are inconsistent, will not change the current state of the firewall: {0}", ustr(e)) + self._emit_event(event.warn, WALAEventOperation.FirewallInconsistency, "The results returned by iptables are inconsistent, will not change the current state of the firewall: {0}", ustr(e)) return except FirewallStateError as e: self._update_firewall_state(FirewallState.Invalid) - self._report(event.warn, WALAEventOperation.ResetFirewall, "The firewall is not configured correctly. {0}. Will reset it. Current state:\n{1}", ustr(e), self._firewall_manager.get_state()) + self._emit_event(event.warn, WALAEventOperation.ResetFirewall, "The firewall is not configured correctly. {0}. Will reset it. Current state:\n{1}", ustr(e), self._firewall_manager.get_state()) self._firewall_manager.remove() self._firewall_manager.setup() self._update_firewall_state(FirewallState.OK) - self._report(event.info, WALAEventOperation.Firewall, "The firewall was setup successfully:\n{0}", self._firewall_manager.get_state()) + self._emit_event(event.info, WALAEventOperation.Firewall, "The firewall was setup successfully:\n{0}", self._firewall_manager.get_state()) except Exception as e: self._update_firewall_state(FirewallState.Unknown) - self._report(event.warn, WALAEventOperation.Firewall, "An error occurred while verifying the state of the firewall: {0}. Current state:\n{1}", textutil.format_exception(e), self._firewall_manager.get_state()) + self._emit_event(event.warn, WALAEventOperation.Firewall, "An error occurred while verifying the state of the firewall: {0}. Current state:\n{1}", textutil.format_exception(e), self._firewall_manager.get_state()) - def _report(self, report_function, operation, message, *args): + def _emit_event(self, event_function, operation, message, *args): if self._should_report: - report_function(operation, message, *args) + event_function(operation, message, *args) def _update_reporting_state(self): - if self._next_report_time > datetime.datetime.now(UTC): - self._should_report = False - return + if datetime.datetime.now(UTC) >= self._reporting_period_end: # Reset the all the report counts every time a reporting period has elapsed + self._report_count = 0 + self._period_report_count = 0 + self._reporting_period_end = datetime.datetime.now(UTC) + EnableFirewall._REPORTING_PERIOD + # if the state of the firewall does not change within a reporting period, we report only once when the state is OK and max 3 times when it is not. + # if the state changes, we report at most 8 times total. self._report_count += 1 + self._period_report_count += 1 max_reports = 1 if self._firewall_state == FirewallState.OK else 3 - if self._report_count <= max_reports: - self._should_report = True - else: - self._report_count = 0 - self._next_report_time = datetime.datetime.now(UTC) + self._REPORTING_PERIOD - self._should_report = False + self._should_report = self._report_count <= max_reports def _update_firewall_state(self, firewall_state): - if (self._firewall_state == FirewallState.OK) != (firewall_state == FirewallState.OK): # reset the reporting limits if the firewall state goes from OK to not-OK or vice versa - self._report_count = 0 - self._next_report_time = datetime.datetime.now(UTC) - self._should_report = True + if (self._firewall_state == FirewallState.OK) != (firewall_state == FirewallState.OK): + # Reset the self._report_count and self._should_report to their initial values if the firewall state goes from OK to not-OK or vice versa. We do this because we want to know about state changes + # immediately, even if the report count for the current period has already been exceeded. Otherwise, we would need to wait for the entire reporting period to report those changes. + # Now, if the state of the firewall changes too often, this strategy may emit too many events, so we use self._period_report_count, to set an absolute limit that is only reset when the full + # period elapses. + if self._period_report_count <= 8: + self._report_count = 0 + self._should_report = True self._firewall_state = firewall_state diff --git a/tests/ga/test_env.py b/tests/ga/test_env.py index 6266d9c0b..6021905d3 100644 --- a/tests/ga/test_env.py +++ b/tests/ga/test_env.py @@ -22,7 +22,7 @@ from azurelinuxagent.ga.env import MonitorDhcpClientRestart, EnableFirewall from tests.lib.event import get_events_from_mock -from tests.lib.tools import AgentTestCase, patch, DEFAULT +from tests.lib.tools import AgentTestCase, patch, DEFAULT, Mock from tests.lib.mock_firewall_command import MockIpTables @@ -220,8 +220,8 @@ def test_it_should_not_modify_the_firewall_rules_when_the_check_command_is_incon def test_it_should_log_the_state_of_the_firewall_once_per_reporting_period(self): with MockIpTables() as mock_iptables: + EnableFirewall._REPORTING_PERIOD = datetime.timedelta(milliseconds=500) enable_firewall = EnableFirewall('168.63.129.16') - enable_firewall._REPORTING_PERIOD = datetime.timedelta(milliseconds=500) with patch.multiple("azurelinuxagent.ga.firewall_manager.event", info=DEFAULT, warn=DEFAULT, error=DEFAULT) as patches: info = patches["info"] @@ -262,8 +262,8 @@ def test_it_should_log_errors_thrice_per_reporting_period(self): with MockIpTables(check_matches_list=False) as mock_iptables: mock_iptables.set_return_values("-C", accept_dns=0, accept=0, drop=1, legacy=0) + EnableFirewall._REPORTING_PERIOD = datetime.timedelta(milliseconds=500) enable_firewall = EnableFirewall('168.63.129.16') - enable_firewall._REPORTING_PERIOD = datetime.timedelta(milliseconds=500) firewall_manager_in_verbose_mode = [] @@ -324,3 +324,46 @@ def test_it_should_log_errors_thrice_per_reporting_period(self): self.assertEqual(0, error.call_count, "No errors should have been reported. Got: {0}". format(error.call_args_list)) + def test_it_should_set_a_limit_on_the_number_of_reports_when_the_firewall_state_changes(self): + EnableFirewall._REPORTING_PERIOD = datetime.timedelta(milliseconds=500) + enable_firewall = EnableFirewall('168.63.129.16') + + call_count = [0] + + def mock_check(*_, **__): + call_count[0] += 1 + return call_count[0] % 2 == 0 # flip between False (the firewall has not been set up) and True (the firewall is OK) and on each call + + enable_firewall._firewall_manager = Mock() + enable_firewall._firewall_manager.check = Mock(side_effect=mock_check) + enable_firewall._firewall_manager.get_state = Mock(return_value='*** mock state***') + + # E0601: Using variable 'add_event_patch' before assignment (used-before-assignment) + get_firewall_events = lambda: [(kwargs["is_success"], kwargs['message']) for _, kwargs in add_event_patch.call_args_list if 'Firewall' in kwargs["op"]] # pylint: disable=E0601 + + # We expect a maximum of 8 reports per reporting period. + # The mock for check() flips between False and True; the former produces a report with a WARNING (is_success == False) and an INFO (is_success == True); the latter produces 1 INFO + expected = 4 * [ + (False, '[WARNING] The firewall has not been setup. Will set it up.'), (True, 'The firewall was setup successfully:\n*** mock state***'), + (True, 'The firewall is configured correctly. Current state:\n*** mock state***') + ] + + # First reporting period + with patch("azurelinuxagent.common.event.add_event") as add_event_patch: + for _ in range(0, 12): + enable_firewall._operation() + + self.assertEqual(12, call_count[0], "Expected 12 calls to FirewallManager.check() during the first reporting period") + firewall_events = get_firewall_events() + self.assertEqual(expected, firewall_events, "First reporting period: Expected 1 WARNING (is_success == False) and 1 INFOs (is_success == True), then 1 more INFO, repeated 4 times") + + time.sleep(0.5) + + # Second reporting period + with patch("azurelinuxagent.common.event.add_event") as add_event_patch: + for _ in range(0, 10): + enable_firewall._operation() + + self.assertEqual(22, call_count[0], "Expected a total of 22 calls to FirewallManager.check() after the second reporting period") + firewall_events = get_firewall_events() + self.assertEqual(expected, firewall_events, "Second reporting period: Expected 1 WARNING (is_success == False) and 1 INFOs (is_success == True), then 1 more INFO, repeated 4 times")