% Conference paper in the ICPE 2021 proceedings (see the doi field).
% Maintenance notes:
%   - Braced words in the title (Courier, SLOs, BigDL) keep their capitalisation
%     under styles that convert titles to sentence case.
%   - Authors use the "Last, First" form; the accent in the first surname sits
%     at the top brace level so BibTeX treats it as a single letter when
%     sorting and building labels.
%   - No series field: the previous value was a verbatim copy of booktitle,
%     which made styles print the proceedings title twice.
%   - address holds the publisher's location, not the conference venue.
%     NOTE(review): New York is ACM's usual publisher location; confirm.
%   - Event details (conference name and dates) remain in the note field.
@inproceedings{3f88f184a366400aa252cf47ac2d782d,
  author    = {Albo Mart{\'i}nez, Diego and Bobde, Sharwin and Motyka, Tomasz and Chen, Lydia},
  title     = {{Courier}: Real-Time Optimal Batch Size Prediction for Latency {SLOs} in {BigDL}},
  booktitle = {{ICPE} 2021 - Proceedings of the {ACM/SPEC} International Conference on Performance Engineering},
  pages     = {133--144},
  publisher = {Association for Computing Machinery (ACM)},
  address   = {New York, NY, USA},
  year      = {2021},
  doi       = {10.1145/3427921.3450233},
  language  = {English},
  keywords  = {deep learning, distributed systems, hyperparameter optimization, provisioning, resource management, scheduling},
  abstract  = {Distributed machine learning has seen immense rise in popularity in recent years. Many companies and universities are utilizing computational clusters to train and run machine learning models. Unfortunately, operating such a cluster imposes large costs. It is therefore crucial to attain as high system utilization as possible. Moreover, those who offer computational clusters as a service, apart from keeping high utilization, also have to meet the required Service Level Agreements (SLAs) for the system response time. This becomes increasingly more complex in multitenant scenarios, where the time dedicated to each task has to be limited to achieve fairness. In this work, we analyze how different parameters of the machine learning job influence the response time as well as system utilization and propose Courier. Courier is a model that, based on the type of machine learning job, can select a batch size such that the response time adheres to the Service Level Objectives (SLOs) specified, while also rendering the highest possible accuracy. We gather the data by conducting real-world experiments on a BigDL cluster. Later on, we study the influence of the factors and build several predictive models which lead us to the proposed Courier model.},
  note      = {2021 ACM/SPEC International Conference on Performance Engineering, ICPE 2021 ; Conference date: 19-04-2021 Through 21-04-2021},
}