void* printingFunction(void* args) {
// We have to know what type is being passed.
size_t thread_id = *(size_t*)(args);
printf("Thread %lu is running!\n", thread_id);
return NULL;
}
int main(void) {
// Let's make some threads!
thrd_t threads[10];
size_t numbers[10];
// Start some threads.
for (int i = 0; i < 10; ++i) {
numbers[i] = i;
int result = thrd_create(&threads[i], (thrd_start_t)printingFunction, &numbers[i]);
if (result == thrd_error) {
printf("Error creating thread %lu\n", numbers[i]);
}
}
// Wait for all threads to finish.
for (int i = 0; i < 10; ++i) {
int res = thrd_join(threads[i], NULL);
if (res == thrd_error) {
printf("Error: thread %lu failed to join\n", numbers[i]);
}
}
return 0;
}
```
---
## Output
$ ./a.out
Thread 1 is running!
Thread 2 is running!
Thread 4 is running!
Thread 0 is running!
Thread 3 is running!
Thread 6 is running!
Thread 5 is running!
Thread 7 is running!
Thread 8 is running!
Thread 9 is running!
---
## Thread Execution
* Notice that the order of the threads is different from when we started them
* Any synchronization is up to the programmer
* This makes threaded programming tricky
---
## Memory Safety
* There are some ways to coordinate between threads
* But I'll leave those for your OS or parallel computing class
* Just know that we'll need to avoid having threads write to the same place
* Reading is okay though
---
## Parallelizing Matrix Multiply
* Let's make a thread for each row
* So the outer loop will go away
* Each thread operates on a row of the left matrix and output
* That will replace the `r` variable and loop in the code
```c
size_t num_elem = left.height;
for (size_t r = 0; r < product.height; ++r) {
for (size_t c = 0; c < product.width; ++c) {
double sum = 0.0;
for (size_t e = 0; e < num_elem; ++e) {
sum += left.rows[r][e] * right.rows[e][c];
}
product.rows[r][c] = sum;
}
}
return product;
```
---
## Avoid Contention
* The threads are going to all be writing simultaneously
* So we need to pick a version of matrix multiply that only writes into a single index of the destination at a time
```c
size_t num_elem = left.height;
for (size_t r = 0; r < product.height; ++r) {
for (size_t c = 0; c < product.width; ++c) {
double sum = 0.0;
for (size_t e = 0; e < num_elem; ++e) {
sum = left.rows[r][e] * right.rows[e][c];
}
product.rows[r][c] += sum;
}
}
```
---
## Contention Free
* Each thread will write to `product.rows[r][c]`
* One thread per `r`
* Each thread will loop over `c`
* Forget about locality for a moment
---
## Thread Function
```C
void* sum_into_ce(void* void_args) {
void ** args = (void**)void_args;
size_t thread_id = (size_t)(args[0]);
double* row = (double*)(args[1]);
double* left_row = (double*)(args[2]);
Matrix* right = (Matrix*)(args[3]);
int width = *(int*)(args[4]);
int height = *(int*)(args[5]);
for (size_t c = 0; c < width; ++c) {
double sum = 0.0;
for (size_t e = 0; e < height; ++e) {
sum += left_row[e] * right->rows[e][c];
}
row[c] = sum;
}
return NULL;
}
```
---
## Calling Function
```c
Matrix dotProduct_thread_rce(Matrix left, Matrix right) {
// If the sizes don't match, return a matrix with NULL data and 0 size.
if (left.width != right.height) {
Matrix empty = newMatrix(0, 0);
return empty;
}
Matrix product = newMatrix(left.height, right.width);
// Go row by row on the left and column by column on the right
// Assume that all matrices are square, so the number of ops is either the height or the width
size_t num_elem = left.height;
size_t r, c, e;
// We must send the outer loop to the threads, so the outer loop must be 'e'
// Will create one thread per row
thrd_t threads[product.height];
void* args[product.height][6];
for (size_t r = 0; r < product.height; ++r) {
args[r][0] = (void*)r;
args[r][1] = (void*)(product.rows[r]);
args[r][2] = (void*)(left.rows[r]);
args[r][3] = (void*)(&right);
args[r][4] = (void*)(&product.width);
args[r][5] = (void*)(&product.height);
int res = thrd_create(&threads[r], (thrd_start_t)sum_into_ce, args[r]);
if (res == thrd_error) {
printf("Error: thread create failed on %lu\n", r);
}
}
for (size_t r = 0; r < product.height; ++r) {
int res = thrd_join(threads[r], NULL);
if (res == thrd_error) {
printf("Error: thread %lu failed to join\n", r);
}
}
return product;
}
```
---
### CPU Time
Slower? Sure, in total CPU time. The random access is causing cache misses.
---
## Wall Time
But the pipeline is so full, we run faster in wall time.
---
### CPU Time (Ryzen9)
---
## Wall Time (Ryzen9)
---
## Threading Vs Cache
* There is a tension between parallelism and locality
* It is harder to guess access patterns and prefetch when lots of threads hit the memory
* Instructions coming from multiple places
* data access all over the place
* But if a thread is waiting for data to load, we can just load another thread
* And if the threads share data, it should end up in the shared L3 cache
---
## Where is L3?
L3 isn't per-CPU, it's shared. Notice how it doesn't appear in the CPU block diagram.
$^*$ Source: https://docs.amd.com/v/u/en-US/58455_1.00
---
## L3 Cache
* In a modern, multi-core CPU the L3 cache is optimized for threading
* The L3 is populated by L2 victims (lines removed after conflict misses)
* In Zen5, L3 hits move the line back to L2 on writes or if hits are from a single core
* Lines remain in L3 after reads or hits from multiple cores
* In addition, if there is an L2 and L3 miss, L3 will grab the data from another L2 if possible
$^*$ Source: https://docs.amd.com/v/u/en-US/58455_1.00
---
## Improving Thread Locality
* We can get the best of both worlds if we are clever
* Notice that reducing the number of threads improved cache miss rates
* At least until the matrix was too large and the threads operated on distant data
* Aruond 1250x1250 matrix
* How could we improve locality?
* Maybe with blocking
* Or how about transposing the matrix first?
---
## Threaded Transposition
```c
void* sum_into_transposed(void* void_args) {
void ** args = (void**)void_args;
size_t thread_id = (size_t)(args[0]);
double* row = (double*)(args[1]);
double* left_row = (double*)(args[2]);
Matrix* right = (Matrix*)(args[3]);
size_t width = *(int*)(args[4]);
size_t elements = *(int*)(args[5]);
for (size_t c = 0; c < width; ++c) {
double sum = 0.0;
for (size_t e = 0; e < elements; ++e) {
// Unthreaded: sum += left.rows[r][e] * transpose_right.rows[c][e];
sum += left_row[e] * right->rows[c][e];
}
// Unthreaded: product.rows[r][c] += sum;
row[c] += sum;
}
return NULL;
}
Matrix dotProduct_thread_transpose_rce(Matrix left, Matrix right) {
// If the sizes don't match, return a matrix with NULL data and 0 size.
if (left.width != right.height) {
Matrix empty = newMatrix(0, 0);
return empty;
}
// Make a transposed matrix
thrd_t threads[max_threads];
void* t_args[max_threads][5];
Matrix transpose_right = newMatrix(right.width, right.height);
for (size_t r = 0; r < right.height; r+=max_threads) {
// Making this threaded: transpose_right.rows[r][c] = right.rows[c][r];
// Start a new batch of threads
for (size_t t = 0; t < max_threads && r+t
---
## Threaded Transpose Wall
---
## Threaded Transpose CPU (Ryzen9)
---
## Threaded Transpose Wall (Ryzen9)
---
## Threads and Multicore
* Desktop and datacenters architectures are all multicore
* To get value from them
* Run multiple processes
* Run multithreaded programs
---
## Parallelism
* It's come up before!
* Speedups from longer pipelines eventually reached their limits
* Then architectures became more parallel
---
## Not Just GPUs
* Currently, GPUs are the obvious example of parallel computation
* However, CPUs also have parallel features
* L3 came out after multi-core chips
* Some instructions are explicitly parallel
* We'll go over some more examples next time