Improve and fix duration calculation and server status

alvarolopez · alvarolopez · commit 0e8cadb162a2 · 2019-07-08T09:01:31.000Z
This change introduces several changes and fixes, namely: - We now use properties for the duration calculation. When creating a record, the durations are calculated using properties, therefore no calculations are needed outside the record. - The duration reported for a server was negative in some cases. This was due to a couple of things. First of all, when some actions are performed on a server (resize, rebuild, evacuate), the server's start time is reset to the action's timestamp. Secondly, the way the nova API handles the changes-since and servers.list() calls and filters was not returing some servers, we now handle this as well. - When we were requesting old records (i.e. a republish) the server's status was set to "complete", however this was wrong, it has to be started. This change requires a complete republishing of all records. Fixes #62 Fixes #61
diff --git a/caso/extract/nova.py b/caso/extract/nova.py
@@ -49,7 +49,11 @@ def _get_glance_client(self, project):
         return glanceclient.client.Client(2, session=session)
 
     def build_record(self, server, vo, images, flavors, users):
+        server_start = self._get_server_start(server)
+        server_end = self._get_server_end(server)
+
         status = self.vm_status(server.status)
+
         image_id = None
         if server.image:
             image = images.get(server.image['id'])
@@ -62,8 +66,12 @@ def build_record(self, server, vo, images, flavors, users):
         if flavor:
             bench_name = flavor["extra"].get(CONF.benchmark_name_key)
             bench_value = flavor["extra"].get(CONF.benchmark_value_key)
+            memory = flavor["ram"]
+            cpu_count = flavor["vcpus"]
+            disk = flavor["disk"] + flavor["OS-FLV-EXT-DATA:ephemeral"]
         else:
             bench_name = bench_value = None
+            memory = cpu_count = disk = None
 
         if not all([bench_name, bench_value]):
             if any([bench_name, bench_value]):
@@ -76,24 +84,40 @@ def build_record(self, server, vo, images, flavors, users):
                           "file or set the correct properties in the "
                           "flavor." % flavor)
 
-        server_start = dateutil.parser.parse(server.created)
-        server_start = server_start.replace(tzinfo=None)
-
         r = record.CloudRecord(server.id,
                                CONF.site_name,
                                server.name,
                                server.user_id,
                                server.tenant_id,
                                vo,
                                start_time=server_start,
+                               end_time=server_end,
                                compute_service=CONF.service_name,
                                status=status,
                                image_id=image_id,
                                user_dn=users.get(server.user_id, None),
                                benchmark_type=bench_name,
-                               benchmark_value=bench_value)
+                               benchmark_value=bench_value,
+                               memory=memory,
+                               cpu_count=cpu_count,
+                               disk=disk)
         return r
 
+    @staticmethod
+    def _get_server_start(server):
+        # We use created, as the start_time may change upon certain actions!
+        server_start = dateutil.parser.parse(server.created)
+        server_start = server_start.replace(tzinfo=None)
+        return server_start
+
+    @staticmethod
+    def _get_server_end(server):
+        server_end = server.__getattr__('OS-SRV-USG:terminated_at')
+        if server_end:
+            server_end = dateutil.parser.parse(server_end)
+            server_end = server_end.replace(tzinfo=None)
+        return server_end
+
     def extract_for_project(self, project, extract_from, extract_to):
         """Extract records for a project from given date querying nova.
 
@@ -128,18 +152,30 @@ def extract_for_project(self, project, extract_from, extract_to):
 
         vo = self.voms_map.get(project)
 
-        # We cannot use 'changes-since' in the servers.list() API query, as it
-        # will only include changes that have changed its status after that
-        # date. However we cannot just get all the usages and then query
-        # server by server, as deleted servers are not returned  by this
-        # servers.get() call. What we do is the following:
+        # We cannot use just 'changes-since' in the servers.list() API query,
+        # as it will only include servers that have changed its status after
+        # that date. However we cannot just get all the usages and then query
+        # server by server, as deleted servers are not returned  by the usages
+        # call. Moreover, Nova resets the start_time after performing some
+        # actions on the server (rebuild, resize, rescue). If we use that time,
+        # we may get a drop in the wall time, as a server that has been resized
+        # in the middle of its lifetime will suddenly change its start_time
         #
-        # 1.- List all the deleted servers that changed after the start date
-        # 2.- Build the records for the period [start, end]
-        # 3.- Get all the usages
+        # Therefore, what we do is the following (hackish approach)
+        #
+        # 1.- List all the servers that changed its status after the start time
+        #     for the reporting period
+        # 2.- Build the records for the period [start, end] using those servers
+        # 3.- Get all the usages, being aware that the start time may be wrong
         # 4.- Iter over the usages and:
-        # 4.1.- get information for non deleted servers
-        # 4.2.- do nothing with deleted servers, as we collected in in step (2)
+        # 4.1.- get information for servers that are not returned by the query
+        #       in (1), for instance servers that have not changed it status.
+        #       We build then the records for those severs
+        # 4.2.- For all the servers, adjust the CPU, memory and disk resources
+        #       as the flavor may not exist, but we can get those resources
+        #       from the usages API.
+
+        # Lets start
 
         # 1.- List all the deleted servers from that period.
         servers = []
@@ -148,8 +184,7 @@ def extract_for_project(self, project, extract_from, extract_to):
         # Use a marker and iter over results until we do not have more to get
         while True:
             aux = nova.servers.list(
-                search_opts={"changes-since": extract_from,
-                             "deleted": True},
+                search_opts={"changes-since": extract_from},
                 limit=limit,
                 marker=marker
             )
@@ -165,20 +200,40 @@ def extract_for_project(self, project, extract_from, extract_to):
         # (we do this manually as we cannot limit the query to a period, only
         # changes after start date).
         for server in servers:
-            server_start = dateutil.parser.parse(server.created)
-            server_start = server_start.replace(tzinfo=None)
+
+            server_start = self._get_server_start(server)
+            server_end = self._get_server_end(server)
+
             # Some servers may be deleted before 'extract_from' but updated
             # afterwards
-            server_end = server.__getattr__('OS-SRV-USG:terminated_at')
-            if server_end:
-                server_end = dateutil.parser.parse(server_end)
-                server_end = server_end.replace(tzinfo=None)
             if (server_start > extract_to or
                     (server_end and server_end < extract_from)):
                 continue
+
             records[server.id] = self.build_record(server, vo, images,
                                                    flavors, users)
 
+            # Wall and CPU durations are absolute values, not deltas for the
+            # reporting period. The nova API only gives use the usages for the
+            # requested period, therefore we need to calculate the wall
+            # duration by ourselves, then multiply by the nr of CPUs to get the
+            # CPU duration.
+
+            # If the machine has not ended, report consumption until
+            # extract_to, otherwise get its consuption by substracting ended -
+            # started (done by the record).
+            if server_end is None or server_end > extract_to:
+                wall = extract_to - server_start
+                wall = int(wall.total_seconds())
+                records[server.id].wall_duration = wall
+                # If we are republishing, the machine reports status completed,
+                # but it is not True for this period, so we need to fake the
+                # status and remove the end time for the server
+                records[server.id].end_time = None
+
+                if records[server.id].status == "completed":
+                    records[server.id].status = self.vm_status("active")
+
         # 3.- Get all the usages for the period
         start = extract_from
         aux = nova.usage.get(project_id, start, extract_to)
@@ -190,46 +245,50 @@ def extract_for_project(self, project, extract_from, extract_to):
             if usage["instance_id"] not in records:
                 server = nova.servers.get(usage["instance_id"])
 
-                server_start = dateutil.parser.parse(server.created)
-                server_start = server_start.replace(tzinfo=None)
+                server_start = self._get_server_start(server)
                 if server_start > extract_to:
                     continue
-                records[server.id] = self.build_record(server, vo, images,
-                                                       flavors, users)
-            instance_id = usage["instance_id"]
-            records[instance_id].memory = usage["memory_mb"]
-            records[instance_id].cpu_count = usage["vcpus"]
-            records[instance_id].disk = usage["local_gb"]
-
-            # Start time must be the time when the machine was created
-            started = records[instance_id].start_time
-
-            # End time must ben the time when the machine was ended, but it may
-            # be none
-            if usage.get('ended_at', None) is not None:
-                ended = dateutil.parser.parse(usage["ended_at"])
-                records[instance_id].end_time = ended
-            else:
-                ended = None
-
-            # Wall and CPU durations are absolute values, not deltas for the
-            # reporting period. The nova API only gives use the usages for the
-            # requested period, therefore we need to calculate the wall
-            # duration by ourselves, then multiply by the nr of CPUs to get the
-            # CPU duration.
-
-            # If the machine has not ended, report consumption until
-            # extract_to, otherwise get its consuption by substracting ended -
-            # started.
-            if ended is not None and ended < extract_to:
-                wall = ended - started
-            else:
-                wall = extract_to - started
-
-            wall = int(wall.total_seconds())
-            records[instance_id].wall_duration = wall
-
-            cput = wall * usage["vcpus"]
-            records[instance_id].cpu_duration = cput
+                record = self.build_record(server, vo, images,
+                                           flavors, users)
+
+                server_start = record.start_time
+
+                # End time must ben the time when the machine was ended, but it
+                # may be none
+                if usage.get('ended_at', None) is not None:
+                    server_end = dateutil.parser.parse(usage["ended_at"])
+                    record.end_time = server_end
+                else:
+                    server_end = None
+
+                # Wall and CPU durations are absolute values, not deltas for
+                # the reporting period. The nova API only gives use the usages
+                # for the requested period, therefore we need to calculate the
+                # wall duration by ourselves, then multiply by the nr of CPUs
+                # to get the CPU duration.
+
+                # If the machine has not ended, report consumption until
+                # extract_to, otherwise get its consuption by substracting
+                # ended - started (done by the record).
+                if server_end is None or server_end > extract_to:
+                    wall = extract_to - server_start
+                    wall = int(wall.total_seconds())
+                    record.wall_duration = wall
+                    # If we are republishing, the machine reports status
+                    # completed, but it is not True for this period, so we need
+                    # to fake the status
+                    if records[server.id].status == "completed":
+                        records[server.id].status = self.vm_status("active")
+
+                cput = wall * usage["vcpus"]
+                record.cpu_duration = cput
+
+                records[server.id] = record
+
+            # Adjust resources that may not be
+            record = records[usage["instance_id"]]
+            record.memory = usage["memory_mb"]
+            record.cpu_count = usage["vcpus"]
+            record.disk = usage["local_gb"]
 
         return records
diff --git a/caso/record.py b/caso/record.py
@@ -130,6 +130,36 @@ def as_dict(self, version=None):
         return {k: v for k, v in self.map.items()
                 if k in self._version_field_map[version]}
 
+    @property
+    def wall_duration(self):
+        duration = None
+        if self._wall_duration is not None:
+            duration = self._wall_duration
+        elif None not in (self._start_time, self._end_time):
+            duration = (self.end_time - self.start_time).total_seconds()
+        return int(duration) if duration else duration
+
+    @wall_duration.setter
+    def wall_duration(self, value):
+        if value and not isinstance(value, (int, float)):
+            raise ValueError("Duration must be a number")
+        self._wall_duration = value
+
+    @property
+    def cpu_duration(self):
+        duration = None
+        if self._cpu_duration is not None:
+            duration = self._cpu_duration
+        elif self.wall_duration is not None and self.cpu_count:
+            duration = self.wall_duration * self.cpu_count
+        return int(duration) if duration else duration
+
+    @cpu_duration.setter
+    def cpu_duration(self, value):
+        if value and not isinstance(value, (int, float)):
+            raise ValueError("Duration must be a number")
+        self._cpu_duration = value
+
     @property
     def start_time(self):
         return self._start_time