@inproceedings{1746d8c60e30439cb5b33bfa0cc64416,
title = "Performance engineering for a tall & skinny matrix multiplication kernels on GPUs",
abstract = "General matrix-matrix multiplications (GEMM) in vendor-supplied BLAS libraries are best optimized for square matrices but often show bad performance for tall & skinny matrices, which are much taller than wide. Nvidia{\textquoteright}s current CUBLAS implementation delivers only a fraction of the potential performance (as given by the roofline model) in this case. We describe the challenges and key properties of an implementation that can achieve perfect performance. We further evaluate different approaches of parallelization and thread distribution, and devise a flexible, configurable mapping scheme. A code generation approach enables a simultaneously flexible and specialized implementation with autotuning. This results in perfect performance for a large range of matrix sizes in the domain of interest, and at least 2/3 of maximum performance for the rest on an Nvidia Volta GPGPU.",
keywords = "GPU, Matrix multiplication, Tall & skinny",
author = "Dominik Ernst and Georg Hager and Jonas Thies and Gerhard Wellein",
year = "2020",
doi = "10.1007/978-3-030-43229-4_43",
language = "English",
isbn = "9783030432287",
series = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
publisher = "Springer Nature",
pages = "505--515",
editor = "Roman Wyrzykowski and Konrad Karczewski and Ewa Deelman and Jack Dongarra",
booktitle = "Parallel Processing and Applied Mathematics - 13th International Conference, PPAM 2019, Revised Selected Papers",
note = "13th International Conference on Parallel Processing and Applied Mathematics, PPAM 2019 ; Conference date: 08-09-2019 Through 11-09-2019",
}