Skip to content

Commit bc6ec83

Browse files
Merge pull request #142 from matyasselmeci/pr/estimated_cpucount.SOFTWARE-6197
Add estimated_cpucount (SOFTWARE-6197)
2 parents e92ff42 + c9f5ee8 commit bc6ec83

File tree

6 files changed

+39
-10
lines changed

6 files changed

+39
-10
lines changed

README.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -432,15 +432,16 @@ Valid values for the **os** option are: `rhel6`, `rhel7`, `rhel8`, or `ubuntu18`
432432

433433
The following attributes are optional:
434434

435-
| Option | Values Accepted | Explanation |
436-
|---------------------|----------------------|------------------------------------------------------------------------------------------------------------------------------------------|
437-
| **cpucount** | Positive Integer | Number of cores that a job using this type of pilot can get. Default `1`; ignored if **whole\_node** is `True` |
438-
| **ram\_mb** | Positive Integer | Maximum amount of memory (in MB) that a job using this type of pilot can get. Default `2500`; ignored if **whole\_node** is `True` |
439-
| **whole\_node** | `True`, `False` | Whether this type of pilot can use all the resources on a node. Default `False`; **cpucount** and **ram\_mb** are ignored if this is `True` |
440-
| **gpucount** | Non-negative Integer | The number of GPUs to request. Default `0` |
441-
| **max\_wall\_time** | Positive Integer | Maximum wall-clock time, in minutes, that a job is allowed to run on this resource. Default `1440`, i.e. 24 hours |
442-
| **queue** | String | The queue or partition which jobs should be submitted to in order to run on this resource (see note). Not set by default |
443-
| **send\_tests** | `True`, `False` | Send test pilots. Default `False`; set it to `True` for testing job routes or pilot types |
435+
| Option | Values Accepted | Explanation |
436+
|---------------------|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
437+
| **cpucount** | Positive Integer | Number of cores that a job using this type of pilot can get. Default `1`; ignored if **whole\_node** is `True` |
438+
| **ram\_mb** | Positive Integer | Maximum amount of memory (in MB) that a job using this type of pilot can get. Default `2500`; ignored if **whole\_node** is `True` |
439+
| **whole\_node** | `True`, `False` | Whether this type of pilot can use all the resources on a node. Default `False`; **cpucount** and **ram\_mb** are ignored if this is `True`; **estimated\_cpucount** is ignored if this is `False` |
440+
| **estimated\_cpucount** | Positive Integer | The number of CPUs we expect on average when requesting a whole node. Ignored if **whole\_node** is `False` |
441+
| **gpucount** | Non-negative Integer | The number of GPUs to request. Default `0` |
442+
| **max\_wall\_time** | Positive Integer | Maximum wall-clock time, in minutes, that a job is allowed to run on this resource. Default `1440`, i.e. 24 hours |
443+
| **queue** | String | The queue or partition which jobs should be submitted to in order to run on this resource (see note). Not set by default |
444+
| **send\_tests** | `True`, `False` | Send test pilots. Default `False`; set it to `True` for testing job routes or pilot types |
444445

445446
**Note:** **queue** is equivalent to the HTCondor grid universe classad attribute **remote\_queue**.
446447

config/35-pilot.ini

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
;ram_mb = 2500
2424
;; This is a whole node pilot; cpucount and ram_mb are ignored if this is true.
2525
;whole_node = false
26+
;; This is the number of CPUs available on a whole node pilot; this is ignored if whole_node is false.
27+
;estimated_cpucount = 8
2628
;; The number of GPUs available
2729
;gpucount = 0
2830
;; The maximum number of pilots of this type that can be sent

osg_configure/modules/resourcecatalog.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class RCAttribute(namedtuple("RCAttribute", "rce_field classad_attr format_fn"))
2929
RCAttribute("name", "Name", utilities.classad_quote),
3030
RCAttribute("cpus", "CPUs", int),
3131
RCAttribute("memory", "Memory", int),
32+
RCAttribute("estimated_cpucount", "EstimatedCPUs", int),
3233
RCAttribute("allowed_vos", "AllowedVOs", _to_classad_list),
3334
RCAttribute("max_wall_time", "MaxWallTime", int),
3435
RCAttribute("queue", "Queue", utilities.classad_quote),
@@ -65,6 +66,7 @@ def __init__(self, **kwargs):
6566
self.os = kwargs.get('os', None)
6667
self.send_tests = kwargs.get('send_tests', None)
6768
self.is_pilot = kwargs.get('is_pilot', None)
69+
self.estimated_cpucount = kwargs.get('estimated_cpucount', None)
6870

6971
def get_requirements(self, attributes):
7072
if self.is_pilot:

osg_configure/modules/subcluster.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
"subclusters": (OPTIONAL, LIST),
3434
"vo_tag": (OPTIONAL, STRING),
3535
# added for Pilots
36+
"estimated_cpucount": (OPTIONAL, POSITIVE_INT),
3637
"gpucount": (OPTIONAL, POSITIVE_INT),
3738
"max_pilots": (REQUIRED_FOR_PILOT, POSITIVE_INT),
3839
"os": (OPTIONAL, STRING),
@@ -72,6 +73,7 @@
7273
'maxmemory': (512, 8388608),
7374
'cores_per_node': (1, 8192),
7475
'cpucount': (1, 8192),
76+
'estimated_cpucount': (1, 8192),
7577
}
7678

7779
CPUCOUNT_DEFAULT = 1
@@ -219,7 +221,7 @@ def rce_section_get_name(config: ConfigParser, section: str) -> Optional[str]:
219221
"""
220222
m = re.search(r"(?i:subcluster|resource entry|pilot)\s+(.+)", section)
221223
if not m:
222-
return
224+
return None
223225
default_name = m.group(1)
224226
return config[section].get("name", default_name).strip()
225227

@@ -235,6 +237,7 @@ def format_value(self) -> str:
235237
def resource_catalog_from_config(config: ConfigParser, default_allowed_vos: List[str] = None) -> ResourceCatalog:
236238
"""
237239
Create a ResourceCatalog from the subcluster entries in a config
240+
:param config: The config to pull the subcluster information from
238241
:param default_allowed_vos: The allowed_vos to use if the user specified "*"
239242
"""
240243
logger = logging.getLogger(__name__)
@@ -321,9 +324,12 @@ def safegetbool(option: str, default=None) -> bool:
321324
if is_pilot(section):
322325
rcentry.max_pilots = safeget("max_pilots")
323326
rcentry.whole_node = safegetbool("whole_node", False)
327+
rcentry.estimated_cpucount = safeget("estimated_cpucount")
324328
if rcentry.whole_node:
325329
rcentry.cpus = None
326330
rcentry.memory = None
331+
else:
332+
rcentry.estimated_cpucount = None
327333
rcentry.require_singularity = safegetbool("require_singularity", True)
328334
rcentry.os = safeget("os")
329335
rcentry.send_tests = safegetbool("send_tests", True)

tests/configs/subcluster/pilots_example.ini

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@ max_wall_time = 1440
1313
allowed_vos = atlas
1414
max_pilots = 1000
1515

16+
[Pilot WholeNode_with_estimated_cpucount]
17+
whole_node = true
18+
estimated_cpucount = 32
19+
max_wall_time = 1440
20+
allowed_vos = atlas
21+
max_pilots = 1000
22+
1623
[Pilot default]
1724
ram_mb = 32768
1825
os = rhel6

tests/test_resourcecatalog.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,17 @@ def testPilotExample(self):
326326
SendTests = True; \
327327
WholeNode = True; \
328328
], \
329+
[ \
330+
AllowedVOs = { "atlas" }; \
331+
EstimatedCPUs = 32; \
332+
IsPilotEntry = True; \
333+
MaxPilots = 1000; \
334+
MaxWallTime = 1440; \
335+
Name = "WholeNode_with_estimated_cpucount"; \
336+
RequireSingularity = True; \
337+
SendTests = True; \
338+
WholeNode = True; \
339+
], \
329340
[ \
330341
AllowedVOs = { "osg", "cms" }; \
331342
CPUs = 8; \

0 commit comments

Comments
 (0)