#include "hw3_matrix_lib.h"
// Converts the timespect to milliseconds
typedef struct timespec timespec;
double timespecToMs(timespec t) {
return 1000.0 * t.tv_sec + 1e-6 * t.tv_nsec;
}
int main(int argc, char** argv) {
if (argc != 3 && argc != 4) {
printf("Provide an option for which loop to run. Options are 1-12.\n");
printf("Provide a matrix size greater than or equal to 10.\n");
printf("For algorithm 11, also provide a number of threads (defaults to 6).\n");
return 0;
}
int opt = atoi(argv[1]);
if (opt < 1 || opt > 13) {
printf("Provide an option for which loop to run. Options are 1-12.\n");
return 0;
}
int size = atoi(argv[2]);
if (size < 10) {
printf("Provide a matrix size greater than or equal to 10.\n");
return 0;
}
if (argc > 3) {
// This variable is declared in hw3_matrix_lib.h
// Global variables like this should generally be avoided as they aren't clear.
max_threads = atoi(argv[3]);
}
// This is a function type. Don't worry about it.
typedef Matrix (*mm_func)(Matrix, Matrix);
mm_func mmul = NULL;
switch (opt) {
case 1:
mmul = dotProduct_rce;
printf("Testing rce\n");
break;
case 2:
mmul = dotProduct_rec;
printf("Testing rec\n");
break;
case 3:
mmul = dotProduct_cre;
printf("Testing cre\n");
break;
case 4:
mmul = dotProduct_cer;
printf("Testing cer\n");
break;
case 5:
mmul = dotProduct_erc;
printf("Testing erc\n");
break;
case 6:
mmul = dotProduct_ecr;
printf("Testing ecr\n");
break;
case 7:
mmul = dotProduct_block;
printf("Testing block\n");
break;
case 8:
mmul = dotProduct_transpose_rce;
printf("Testing transpose rce\n");
break;
case 9:
mmul = dotProduct_thread_rce;
printf("Testing thread rce\n");
break;
case 10:
mmul = dotProduct_thread_cre;
printf("Testing thread cre\n");
printf("Note! CRE should be using a mutex, but currently is not.\n");
break;
case 11:
mmul = dotProduct_thread_rce_max;
printf("Testing thread rce_max\n");
break;
case 12:
mmul = dotProduct_thread_transpose_rce;
printf("Testing thread_transpose rce\n");
break;
case 13:
mmul = dotProduct_thread_transpose_asm_rce;
printf("Testing thread_transpose_asm rce\n");
break;
}
Matrix m1 = newMatrix(size, size);
Matrix m2 = newMatrix(size, size);
for (int i = 0; i < size*size; ++i) {
// Fill the matrices with random numbers. Be sure to avoid dividing by 0.
m1.data[i] = rand()/ ((double)rand()+1);
m2.data[i] = rand()/ ((double)rand()+1);
}
// Set up timers and run
struct timespec tw_begin;
clock_gettime(CLOCK_MONOTONIC, &tw_begin);
struct timespec ts_begin;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts_begin);
Matrix out = mmul(m1, m2);
struct timespec tw_end;
clock_gettime(CLOCK_MONOTONIC, &tw_end);
struct timespec ts_end;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts_end);
double posix_wall = timespecToMs(tw_end) - timespecToMs(tw_begin);
double cpu_time = timespecToMs(ts_end) - timespecToMs(ts_begin);
// Print the result
switch (opt) {
case 1:
printf("rce wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("rce CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 2:
printf("rec wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("rec CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 3:
printf("cre wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("cre CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 4:
printf("cer wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("cer CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 5:
printf("erc wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("erc CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 6:
printf("ecr wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("ecr CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 7:
printf("block wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("block CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 8:
printf("transpose rce wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("transpose rce CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 9:
printf("thread rce wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("thread rce CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 10:
printf("thread cre wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("thread cre CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 11:
printf("thread rce_max wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("thread rce_max CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 12:
printf("thread_transpose rce wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("thread_transpose rce CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
case 13:
printf("thread_transpose_asm rce wall time used for size %i is %.2f ms\n", size, posix_wall);
printf("thread_transpose_asm rce CPU time used for size %i is %.2f ms\n", size, cpu_time);
break;
}
freeMatrix(m1);
freeMatrix(m2);
freeMatrix(out);
return 0;
}
```
---
## Where we started
---
## With transposed matrix
---
## With threads (CPU time)
---
## With threads (wall time)
---
## With SIMD (CPU time)
---
## With SIMD (wall time)
---
## Final comparison (wall time)
---
## Final comparison (wall time)
---
## With gcc's -O2 (wall time)
---
## With gcc's -O2 (wall time)
---
## With gcc's -O2 (wall time)
---
## Moral
* The moral of the story is that optimization is hard, but rewarding!
* We could probably take some of gcc's optimization to squeeze out a little more
* Hardware is generally adapted to allow better implementations
* But you may need to know that those optimizations exist
* And if you ever need to design hardware, you'll have to know all the most common use cases!
* This is why all of the trig functions are supported in hardware
---
## Today's Workloads
* There is a special instruction, called a fused matrix multiply
* Multiply one vector (or matrix) by another and then add a constant
* $w^Tx + b$
* It's used in machine learning
* And, since ML is everywhere, so is that instruction
---
## Result
* You may think that ML is only on GPUs
* But everyone wants to run them
* From your phone, your laptop, to your smart fridge
* Inefficient implementations cost energy and time
* So we are seeing wider support for vector and matrix operations in hardware
---
## Next Topics
* There are two non-CPU architectures that are increasingly important for modern data workflows
* GPUs (Graphics Processing Units)
* TPUs (Tensor Processing Units)
* And we have a quiz next recitation!
---
## Example Questions
* Note, in the initially uploaded version I somehow copied the first question twice and half changed each version, leading to confusion
* The following version is corrected
---
## Example Questions
Which of the following statements about cache misses is always **true**?
a. If two addresses have the same tag, they will not cause a conflict miss.
b. If two addresses map onto different sets in a directly mapped cache, they can still cause a conflict miss.
c. If two addresses map onto the same block, they will cause a conflict miss.
d. None of the above.
---
Which of the following statements about cache misses is always **true**?
a. **If two addresses have the same tag, they will not cause a conflict miss.**
b. If two addresses map onto different sets in a directly mapped cache, they can still cause a conflict miss.
c. If two addresses map onto the same block, they will cause a conflict miss.
d. None of the above.
* For (a), two addresses with the same tag are either: in the same block of memory if the set is the same or in different sets, so they cannot conflict. This is true.
* For (b), two addresses mapped into different sets can never conflict
* For (c), two address that map onto the same block will not conflict. This is a case of locality, and is good, as loading one address fetching the data for the other.
---
In a four-way associative cache, which of the following is definitely true?
a. There are four blocks per line.
b. There are four sets per block.
c. There are four blocks per set.
d. There are four sets in the cache.
---
In a four-way associative cache, which of the following is definitely true?
a. There are four blocks per line.
b. There are four sets per block.
c. **There are four blocks per set.**
d. There are four sets in the cache.
---
A load instruction copies a value from memory into a register. If the L1 hit latency is 4 cycles and the miss penalty if we find our data in the L2 cache is 10 cycles, which of the following is true?
a. If the data is in the L1 cache then it will be available after four cycles have passed.
b. If the data is in the L2 cache, then it must have already been requested at the L1 cache.
c. Data cannot be copied from memory into registers.
d. None of the above.
---
A load instruction copies a value from memory into a register. If the L1 hit latency is 4 cycles and the miss penalty if we find our data in the L2 cache is 10 cycles, which of the following is true?
a. **If the data is in the L1 cache then it will be available after four cycles have passed.**
b. If the data is in the L2 cache, then it must have already been requested at the L1 cache.
c. Data cannot be copied from memory into registers.
d. None of the above.
---
L1 cache stores data using what technology?
a. SRAM
b. DRAM
c. Laser activated magnetic platters.
d. Non-volatile flash
---
L1 cache stores data using what technology?
a. **SRAM**
b. DRAM
c. Laser activated magnetic platters.
d. Non-volatile flash
---
In a cold cache, what is the latency of the instruction
add $rax $rbx
in the ID cycle? Latency means the cycles before it completes. L1 hit latency is 4 cycle, miss penalty to go to L2 is 10 cycles, and the miss penalty for L3 is 40 cycles.
a. 1 cycle.
b. 10 cycles.
c. 40 cycles.
d. Longer than the above options.
---
In a cold cache, what is the latency of the instruction
add $rax $rbx
in the ID cycle? Latency means the cycles before it completes. L1 hit latency is 4 cycle, miss penalty to go to L2 is 10 cycles, and the miss penalty for L3 is 40 cycles.
a. **1 cycle**.
b. 10 cycles.
c. 40 cycles.
d. Longer than the above options.
---
Without threading, Which method is **slowest**?
a. r in the outer loop.
b. r in the middle loop.
c. r in the inner loop.
d. All are the same.
Given
product.rows[r][c] += left.rows[r][e] * right.rows[e][c];
and
for (size_t r = 0; r < product.height; ++r)
for (size_t e = 0; e < num_elem; ++e)
for (size_t c = 0; c < product.width; ++c)
---
Without threading, Which method is **slowest**?
a. r in the outer loop.
b. r in the middle loop.
c. **r in the inner loop.**
d. All are the same.
Given
product.rows[r][c] += left.rows[r][e] * right.rows[e][c];
and
for (size_t r = 0; r < product.height; ++r)
for (size_t e = 0; e < num_elem; ++e)
for (size_t c = 0; c < product.width; ++c)
---
* We are using a 2-way associative cache with block size 2
* Memory address 1010 is requested.
* What are the Tag, Set and Block?
* How will the current state of the cache change?
| Request |
| Tag | Set | Block |
| ? | ? | ? |
| valid | Tag | Block | valid | Tag | Block |
| Set 0 | 1 | 00 | m[0] | m[1] | 1 | 10 | m[8] | m[9] |
| Set 1 | 1 | 01 | m[6] | m[7] | 0 | | | |
---
* We are using a 2-way associative cache with block size 2
* Memory address 1010 is requested.
* What are the Tag, Set and Block?
* How will the current state of the cache change?
| Request |
| Tag | Set | Block |
| 10 | 1 | 0 |
| valid | Tag | Block | valid | Tag | Block |
| Set 0 | 1 | 00 | m[0] | m[1] | 1 | 10 | m[8] | m[9] |
| Set 1 | 1 | 01 | m[6] | m[7] | 1 | 10 | m[10] | m[11] |
---
## Cache Simulator
* There is a nifty online simulator
* [https://courses.cs.washington.edu/courses/cse351/cachesim/](https://courses.cs.washington.edu/courses/cse351/cachesim/)
* There are others, but that one is easy to use
* If the cache is confusing, try out