Sample Final Answers

1. Expressions

QUESTION 1A. Here are eight expressions. Group the expressions into four pairs so that the two expressions in each pair have the same value, and each pair has a different value from every other pair. There is one unique answer that meets these constraints. m has the same type and value everywhere it appears (there’s one unique value for m that meets the problem’s constraints). Assume an x86 machine.

sizeof(&m)
-1
m & -m
m + ~m + 1
16 >> 2
m & ~m
m
1

1—5; 2—7; 3—8; 4—6

1—5 is easy. m + ~m + 1 == m + (-m) == 0, and m & ~m == 0, giving us 3—8. Now what about the others? m & -m is, as we know, either 0 or a power of 2. This eliminates -1, leaving either m or 1. If m & -m matched with m, then the remaining pair would be 1 and -1, which clearly doesn’t work. Thus m & -m matches with 1, and m == -1.

2. Hello program

Recall from the sample midterm questions that Shintaro Tsuji is representing images such as this one in computer memory:

(Zoomed in:)

He stores each image in an array of 16-bit unsigned integers:

uint16_t cute[16];

Row Y of the image is stored in integer cute[Y]. Given this structure, Shintaro can manipulate the image using C. For example, here’s a function.

void swap(void) {
   for (int i = 0; i < 16; ++i)
      cute[i] = (cute[i] << 8) | (cute[i] >> 8);
}

Running swap produces the following image:

Shintaro has written several other functions. Here are some images (A is the original):

File:hello-f0.gif	File:hello-f7.gif	File:hello-f2.gif	File:hello-f3.gif	File:hello-f4.gif
A	B	C	D	E

File:hello-f5.gif	File:hello-f6.gif	File:hello-f1.gif	File:hello-f8.gif	File:hello-f9.gif
F	G	H	I	J

For each function, what image does that function create?

QUESTION 2A.

void f1() {
   for (int i = 0; i < 16; ++i)
      cute[i] = ~cute[i];
}

H. The code flips all bits in the input.

QUESTION 2B.

void f2() {
   for (int i = 0; i < 16; ++i) {
      cute[i] = ((cute[i] >> 1) & 0x5555) | ((cute[i] << 1) & 0xAAAA);
      cute[i] = ((cute[i] >> 2) & 0x3333) | ((cute[i] << 2) & 0xCCCC);
      cute[i] = ((cute[i] >> 4) & 0x0F0F) | ((cute[i] << 4) & 0xF0F0);
      cute[i] =  (cute[i] >> 8)           |  (cute[i] << 8);
   }
}

QUESTION 2C.

void f3() {
   char *x = (char *) cute;
   for (int i = 0; i < 16; ++i)
      x[2*i] = i;
}

For “fun”

The following programs generated the other images. Can you match them with their images?

void f4() {
   for (int i = 0; i < 16; ++i)
      cute[i] &= ~(7 << i);
}

void f5() {
   swap();
   for (int i = 0; i < 16; ++i)
      cute[i] <<= i/4;
   swap();
}

void f6() {
   for (int i = 0; i < 16; ++i)
      cute[i] = -1 * !!(cute[i] & 64);
}

void f7() {
   for (int i = 0; i < 16; ++i) {
      int tmp = cute[15-i];
      cute[15-i] = cute[i];
      cute[i] = tmp;
   }
}

void f8() {
   for (int i = 0; i < 16; ++i)
      cute[i] = cute[i] & -cute[i];
}

void f9() {
   for (int i = 0; i < 16; ++i)
      cute[i] ^= cute[i] ^ cute[i];
}

void f10() {
   for (int i = 0; i < 16; ++i)
      cute[i] = cute[i] ^ 4080;
}

f4—I; f5—B; f6—C; f7—F; f8—G; f9—A; f10—E

3. Data structures

Here are four assembly functions, f1 through f4.

f1:
    movl    4(%esp), %eax
    movl    8(%esp), %ecx
    testl   %ecx, %ecx
    jle .L2
    xorl    %edx, %edx
.L3:
    movl    4(%eax), %eax
    incl    %edx
    cmpl    %ecx, %edx
    jne .L3
.L2:
    movl    (%edx), %eax
    ret

f2:
    movl    8(%esp), %edx
    leal    0(,%edx,4), %ecx
    movl    4(%esp), %eax
    movl    (%eax,%ecx), %eax
    addl    %ecx, %eax
    movl    (%eax), %eax
    ret

f3:
    pushl   %esi
    pushl   %ebx
    movl    12(%esp), %ecx
    movl    16(%esp), %esi
    movl    20(%esp), %eax
    testl   %esi, %esi
    jle .L9
    xorl    %edx, %edx
.L10:
    movl    %eax, %ebx
    andl    $1, %ebx
    movl    4(%ecx,%ebx,4), %ecx
    incl    %edx
    sarl    %eax
    cmpl    %esi, %edx
    jne .L10
.L9:
    movl    (%ecx), %eax
    popl    %ebx
    popl    %esi
    ret

f4:
    movl    8(%esp), %edx
    movl    4(%esp), %eax
    movl    (%eax,%edx,4), %eax
    ret

QUESTION 3A. Each function returns a value loaded from some data structure. Which function uses which data structure?

Array
Array of pointers to arrays
Linked list
Binary tree

Array—f4; Array of pointers to arrays—f2; Linked list—f1; Binary tree—f3

QUESTION 3B. The array data structure is an array of type T. Considering the code for the function that manipulates the array, which of the following types are likely possibilities for T? Circle all that apply.

char
int
unsigned long
unsigned long long
char*
None of the above

int`, `unsigned long`, `char *

4. Disassemble (14 points)

Here’s some assembly produced by compiling a C program with gcc.

.LC1:
    .string "%d %d\n"

    .globl  f
    .type   f, @function
f:
    pushl   %ebp
    movl    $1, %ecx
    movl    %esp, %ebp
    pushl   %edi
    pushl   %esi
    pushl   %ebx
    subl    $12, %esp
.L13:
    movl    $1, %eax
.L9:
    movl    %eax, %ebx
    imull   %eax, %ebx
    movl    %ecx, %esi
    imull   %ecx, %esi
    movl    $1, %edx
.L4:
    movl    %edx, %edi
    imull   %edx, %edi
    addl    %ebx, %edi
    cmpl    %esi, %edi
    je  .L11
    incl    %edx
    cmpl    %eax, %edx
    jle .L4
    incl    %eax
    cmpl    %ecx, %eax
    jle .L9
    incl    %ecx
    jmp .L13
.L11:
    pushl   %ecx
    pushl   %eax
    pushl   %edx
    pushl   $.LC1
    call    printf
    leal    -12(%ebp), %esp
    movl    $1, %eax
    popl    %ebx
    popl    %esi
    popl    %edi
    popl    %ebp
    ret

QUESTION 4A. How many arguments might this function have? Circle all that apply.

0
1
2
3 or more

All

QUESTION 4B. What might this function return? Circle all that apply.

0
1
−1
Its first argument, whatever that argument is
A square number other than 0 or 1
None of the above

Choice #2 (1)

QUESTION 4C. Which callee-saved registers does this function save and restore? Circle all that apply.

%eax
%ebx
%ecx
%edx
%ebp
%esi
%edi
None of the above

%ebx, %ebp, %esi, %edi

QUESTION 4D. This function handles signed integers. If we changed the C source to use unsigned integers instead, which instructions would change? Circle all that apply.

movl
imull
addl
cmpl
je
jge
popl
None of the above

jge

We accepted circled imull or not! Although x86 imull is signed, in fact as used in C it’s equivalent to mull, and gcc does use imull for unsigned multiplication here. From the Intel manuals:

“[These] forms [of imul] may also be used with unsigned operands because the lower half of the product is the same regardless if the operands are signed or unsigned. The CF and OF flags, however, cannot be used to determine if the upper half of the result is non-zero.”

QUESTION 4E. What might this function print? Circle all that apply.

0 0
1 1
3 4
4 5
6 8
None of the above

Choice #3 (3 4) only. The function searches for a solution to x² + y² == z², under the constraint that x ≤ y. When it finds one, it prints x and y and then returns. It always starts from 1 1 and increments x and y one at a time, so it can only print 3 4.

5. Machine programming (20 points)

Intel really messed up this time. They’ve released a processor, the Fartium Core Trio, where every instruction is broken except the ones on this list.


1.	`cmpl %ecx, %edx`
2.	`decl %edx`
3.	`incl %eax`
4.	`je L1`
5.	`jl L2`
6.	`jmp L3`
7.	`movl 4(%esp), %ecx` `[movc]`
8.	`movl 8(%esp), %edx` `[movd]`
9.	`movl (%ecx,%eax,4), %ecx` `[movx]`
10.	`ret`
11.	`xchgl %eax, %ecx`
12.	`xorl %eax, %eax`

(In case you forgot, xchgl swaps two values—here, the values in two registers—without modifying condition codes.)

“So what if it’s buggy,” says Andy Grove; “it can still run programs.” For instance, he argues convincingly that this function:

 void do_nothing(void) {
 }

is implemented correctly by this Fartium instruction sequence:

ret

Your job is to implement more complex functions using only Fartium instructions. Your implementations must have the same semantics as the C functions, but may perform much worse than one might expect. You may leave off arguments and write instruction numbers (#1–12) or instruction names (for mov, use the bracketed abbreviations). Indicate where labels L1–L3 point (if you need them). Assume that on function entry, the stack is set up as on a normal x86.

QUESTION 5A.

 int return_zero(void) {
     return 0;
 }

xorl %eax, %eax; ret. (#12, #10)

%eax has unknown value when a function begins, so we need to clear it.

QUESTION 5B.

 int identity(int a) {
     return a;
 }

movl 4(%esp), %ecx; xchgl %eax, %ecx; ret. (#7, #11, #10)

At function entry, the value on the top of the stack, at (%esp) = 0(%esp), is the return address. Arguments are stored immediately above that, so 4(%esp) is the first argument.

QUESTION 5C.

 void infinite_loop(void) {
     while (1)
         /* do nothing */;
 }

L3: jmp L3. (L3: #6)

QUESTION 5D.

 typedef struct point {
     int x;
     int y;
     int z;
 } point;
 
 int extract_z(point *p) {
     return p->z;
 }

 movl 4(%esp), %ecx
 xorl %eax, %eax
 incl %eax
 incl %eax
 movl (%ecx,%eax,4), %ecx
 xchgl %eax, %ecx
 ret

(#7 #12 #3 #3 #9 #11 #10)

The value we want is located 8 bytes after the p pointer. In x86 assembly, this is written 8(%register_containing_p). Only one Fartium instruction could work, namely #9, movl (%ecx,%eax,4), %ecx. (The other indirect loads use %esp as a base, but we aren’t given an instruction that could modify %esp the way we need to.) This format uses the address %ecx + 4*%eax, so we must load %eax with 2.

So much for the easy ones. Now complete one out of the following 3 questions, or more than one for extra credit. (Question 5G is worth more than the others.)

QUESTION 5E. [Reminder: Complete at least one of 5E–5G for full credit.]

 int add(int a, int b) {
     return a + b;
 }

     movl 4(%esp), %ecx      # %ecx := a
     movl 8(%esp), %edx      # %edx := b
     xorl %eax, %eax         # %eax := 0
     xchgl %eax, %ecx        # now %eax == a and %ecx == 0
 L3: cmpl %ecx, %edx         # compare %edx and %ecx (which is 0)
     je L1                   # "if %edx == 0 goto L1"
     incl %eax               # ++%eax
     decl %edx               # --%edx
     jmp L3
 L1: ret

(#7 #8 #12 #11 L3: #1 #4 #3 #2 #6 L1: #10)

The loop at L3 executes b times, incrementing %eax each time. Here’s morally equivalent C:

 int add(int a, int b) {
     while (b != 0) {
         ++a; --b;
     }
     return a;
 }

This takes a long time if b < 0, but it does work! We saw many other correct answers. Common errors included comparing incrementing a and decrementing b, something like this:

 int add(int a, int b) {
     int result = a;
     while (b < a) {
         ++result; ++a; --b;
     }
     return a;
 }

But in this design a and b can pass each other, and result is incremented half as many times as it should be.

QUESTION 5F. [Reminder: Complete at least one of 5E–5G for full credit.]

 int array_dereference(int *a, int i) {
     return a[i];
 }

     movl 8(%esp), %edx        # %edx := i
     xorl %eax, %eax           # %eax := 0
 L3: xchgl %eax, %ecx
     cmpl %ecx, %edx           # compare %edx and %ecx
     xchgl %eax, %ecx
     je L1                     # "if %eax == i goto L1"
     incl %eax                 # ++%eax
     jmp L3
 L1: movl 4(%esp), %ecx        # %ecx := a
     movl (%ecx,%eax,4), %ecx  # %ecx := a[i]
     xchgl %eax, %ecx
     ret

(#8 #12 L3: #11 #1 #11 #4 #3 #6 L1: #7 #9 #11 #10)

QUESTION 5G. [Reminder: Complete at least one of 5E–5G for full credit.]

 int traverse_array_tree(int *a, int x) {
     int i = 0;
     while (1) {
         if (x == a[i])
             return i;
         else if (x < a[i])
             i = a[i+1];
         else
             i = a[i+2];
     }
 }

(This funky function traverses a binary tree that’s represented as an array of ints. It returns the position of the x argument in this “tree.” For example, given the following array:

 int a[] = {100, 3, 6,  50, 9, 12,  150, 0, 0,  25, 0, 0,  80, 0, 0};

the call traverse_array_tree(a, 100) returns 0, because that’s the position of 100 in a. The call traverse_array_tree(a, 80) first examines position 0; since a[0] == 100 and 80 < 100, it jumps to position a[0+1] == 3; since a[3] == 50 and 80 > 50, it jumps to position a[3+2] == 12; and then it returns 12, since a[12] == 80. The code breaks if x isn’t in the tree; don’t worry about that.)

     movl 8(%esp), %edx        # %edx := x
`      xorl %eax, %eax           # %eax := 0 (holds `i`) `
 L3: movl 4(%esp), %ecx        # %ecx := a
     movl (%ecx,%eax,4), %ecx  # %ecx := a[i]
     cmpl %ecx, %edx           # compare x and a[i]
     je L1                     # "if x == a[i] goto L1"
     jl L2                     # "if x < a[i] goto L2"
 
     incl %eax
     movl 4(%esp), %ecx
     movl (%ecx,%eax,4), %ecx
     xchgl %eax, %ecx          # i := a[i+1]
     jmp L3
 
 L2: incl %eax
     incl %eax
     movl 4(%esp), %ecx
     movl (%ecx,%eax,4), %ecx
     xchgl %eax, %ecx          # i := a[i+2]
     jmp L3
 
 L1: ret                       # return i

(#8 #12 L3: #7 #9 #1 #4 #5 #3 #7 #9 #11 #6 L2: #3 #3 #7 #9 #11 #6 L1: #10)

We accepted solutions that misinterpreted the order of arguments for cmpl. (In AT&T syntax, “cmpl %ecx, %edx” performs the subtraction %edx − %ecx, so after such a comparison, jl will branch if %edx < %ecx.)

6. Virtual memory (15 points)

QUESTION 6A. What is the x86 page size? Circle all that apply.

4096 bytes
64 cache lines
512 words
0x1000 bytes
2¹⁶ bits
None of the above

#1, #2, #4. The cache line size is 64 = 2⁶, and 2⁶×2⁶ = 2¹². The word size is 4; 512×4 = 2048, not 4096. There are 8 bits per byte; 2¹⁶/8 = 2¹³, not 2¹².

The following questions concern the sizes of page tables. Answer the questions in units of pages. For instance, the page directories in WeensyOS (Assignment 3) each contained one level-1 page table page and one level-2 page table page, for a total size of 2 pages per page table.

QUESTION 6B. What is the maximum size (in pages) of an x86 page table?

2¹⁰ page table pages + 1 page directory page = 1025.

Despite the example above, many people misinterpreted this question as including the physical pages referenced by a page directory, and came up with answers like 2²⁰.

QUESTION 6C. What is the minimum size (in pages) of an x86 page table that would allow a process to access 2²² distinct physical addresses?

One.

Whaaat?! Think about a page directory page where one of the entries referred to the page directory page itself, and the other entries referred to different pages. Like this PDP:

Physical Page	Index	(Physical Address)	Contents
`0x1000`	0	`(0x1000)`	`0x1007`
	1	`(0x1004)`	`0x2007`
	2	`(0x1008)`	`0x3007`
	3	`(0x100C)`	`0x4007`
...
	1023	`(0x1FFC)`	`0x400007`

With this page directory in force, the 2²² virtual addresses 0x0 through 0x3FFFFF (which all have PDI 0) access the 2²² distinct physical addresses 0x1000 through 0x400FFF.

The x86 architecture we discussed in class has 32-bit virtual addresses and 32-bit physical addresses. Extensions to the x86 architecture have increased both these limits.

Physical Address Extensions (PAE) allow 32-bit machines to access up to 2⁵² bytes of physical memory (which is about 4000000 GB). That is, virtual addresses are 32 bits, and physical addresses are 52 bits.
The x86-64 architecture evolves the x86 architecture to a 64-bit word size. x86-64 pointers are 64 bits wide instead of 32. However, only 48 of those bits are meaningful: the upper 16 bits of each virtual address are ignored. Thus, virtual addresses are 48 bits. As with PAE, physical addresses are 52 bits.

QUESTION 6D. Which of these two machines would support a higher number of concurrent processes?

x86 with PAE with 100 GB of physical memory.
x86-64 with 20 GB of physical memory.

#1 x86 with PAE. Each concurrent process occupies some space in physical memory, and #1 has more physical memory.

(Real operating systems swap, so either machine could support more processes than fit in virtual memory, but this would cause thrashing. #1 supports more processes before it starts thrashing.)

QUESTION 6E. Which of these two machines would support a higher maximum number of threads per process?

x86 with PAE with 100 GB of physical memory.
x86-64 with 20 GB of physical memory.

#2 x86-64. Each thread in a process needs some address space for its stack, and an x86-64 process address space is much bigger than an x86’s.

7. Cost expressions (15 points)

In the following questions, you will reason about the abstract costs of various operations, using the following tables of constants.

Table of Basic Costs


S	System call overhead (i.e., entering and exiting the kernel)
F	Page fault cost (i.e., entering and exiting the kernel)
P	Cost of allocating a new physical page
M	Cost of installing a new page mapping
B	Cost of copying a byte

Table of Sizes


n_k		Number of memory pages allocated to the kernel
Per-process sizes (defined for each process p)
n_p		Number of memory pages allocated to p
r_p		Number of read-only memory pages allocated to p
w_p	= n_p − r_p	Number of writable memory pages allocated to p
m_p		Number of memory pages actually modified by p after the previous `fork()`

One of our tiny operating systems from class (OS02) included a program that called a recursive function. When the recursive function’s argument grew large enough, the stack pointer moved beyond the memory actually allocated for the stack, and the program crashed.

QUESTION 7A. In our first solution for this problem, the process called the sys_page_alloc(void *addr) system call, which allocated and mapped a single new page at address addr (the new stack page). Write an expression for the cost of this sys_page_alloc() system call in terms of the constants above.

S + P + M

QUESTION 7B. Our second solution for this problem changed the operating system’s page fault handler. When a fault occurred in a process’s stack region, the operating system allocated a new page to cover the corresponding address and restarted the process. Write an expression for the cost of such a fault in terms of the constants above.

F + P + M

QUESTION 7C. Design a revised version of sys_page_alloc that supports batching. Give its signature and describe its behavior.

Example: sys_page_alloc(void *addr, int npages)

QUESTION 7D. Write an expression for the cost of a call to your batching allocation API.

Can vary; for this example, S + npages*(P + M)

In the remaining questions, a process p calls fork(), which creates a child process, c.

Assume that the base cost of performing a fork() system call is Φ. This cost includes the fork() system call overhead (S), the overhead of allocating a new process, the overhead of allocating a new page directory with kernel mappings, and the overhead of copying registers. But it does not include overhead from allocating, copying, or mapping other memory.

QUESTION 7E. Consider the following implementations of fork():


A.	Naive fork: Copy all process memory (Assignment 3, Step 5).
B.	Eager fork: Copy all writable process memory; share read-only process memory, such as code (Assignment 3, Step 6).
C.	Copy-on-write fork: initially share all memory as read-only. Create writable copies later, on demand, in response to write faults (Assignment 3 extra credit).

Which expression best represents the total cost of the fork() system call in process p, for each of these fork implementations? Only consider the system call itself, not later copy-on-write faults.

(Note: Per-process variables, such as n, are defined for each process. So, for example, n_p is the number of pages allocated to the parent process p, and n_c is the number of pages allocated to the child process c.)

Φ
Φ + n_p × M
Φ + (n_p + w_p) × M
Φ + n_p × 2¹² × (B + F)
Φ + n_p × (2¹²B + P + M)
Φ + n_p × (P + M)
Φ + w_p × (2¹²B + P + M)
Φ + n_p × (2¹²B + P + M) − r_p × (2¹²B + P)
Φ + n_p × M + m_c × (P + F)
Φ + n_p × M + m_c × (2¹²B + F + P)
Φ + n_p × M + (m_p+m_c) × (P + F)
Φ + n_p × M + (m_p+m_c) × (2¹²B + F + P)

A: #5, B: #8 (good partial credit for #7), C: #2

QUESTION 7F. When would copy-on-write fork be more efficient than eager fork (meaning that the sum of all fork-related overheads, including faults for pages that were copied on write, would be less for copy-on-write fork than eager fork)? Circle the best answer.

When n_p < n_k.
When w_p × F < w_p × (M + P).
When m_c × (F + M + P) < w_p × (M + P).
When (m_p+m_c) × (F + M + P + 2¹²B) < w_p × (P + 2¹²B).
When (m_p+m_c) × (F + P + 2¹²B) < w_p × (P + M + 2¹²B).
When m_p < m_c.
None of the above.

8. Networking (10 points)

QUESTION 8A. Which of the following system calls should a programmer expect to sometimes block (i.e., to return after significant delay)? Circle all that apply.


1. `socket`		5. `connect`
2. `read`		6. `write`
3. `accept`		7. `usleep`
4. `listen`		8. None of these

#2 read, #3 accept, #5 connect, (#6 write), #7 usleep. (write can definitely block if the reading end hasn’t caught up, but we didn’t emphasize this in class, so no points off.)

QUESTION 8B. Below are seven message sequence diagrams demonstrating the operation of a client–server RPC protocol. A request such as “get(X)” means “fetch the value of the object named X”; the response contains that value. Match each network property or programming strategy below with the diagram with which it best corresponds. You will use every diagram once.


1. Loss	4. Duplication	7. Exponential backoff
2. Delay	5. Batching
3. Reordering	6. Prefetching

#1—B, #2—C, #3—F, #4—D, #5—G, #6—A, #7—E (A—#6, B—#1, C—#2, D—#4, E—#7, F—#3, G—#5)

While G could also represent prefetching, A definitely does not represent batching at the RPC level—each RPC contains one request—so under the rule that each diagram is used once, we must say G is batching and A is prefetching.

Finalfig2012_1.gif	Finalfig2012_2.gif	Finalfig2012_3.gif	Finalfig2012_4.gif
A	B	C	D

Finalfig2012_5.gif	Finalfig2012_6.gif	Finalfig2012_7.gif
E	F	G

9. Threads (15 points)

The following code performs a matrix multiplication, c = ab, where a, b, and c are all square matrices of dimension sz. It uses the cache-friendly ikj index ordering.

 #define MELT(matrix, sz, row, col) (matrix)[(row)*(sz) + (col)]
 
 void matrix_multiply(double *c, const double *a, const double *b, size_t sz) {
     for (size_t i = 0; i < sz; ++i)
         for (size_t j = 0; j < sz; ++j)
             MELT(c, sz, i, j) = 0;
     for (size_t i = 0; i < sz; ++i)
         for (size_t k = 0; k < sz; ++k)
             for (size_t j = 0; j < sz; ++j)
                 MELT(c, sz, i, j) += MELT(a, sz, i, k) * MELT(b, sz, k, j);
 }

But matrix multiplication is a naturally parallelizable problem. Here’s some code that uses threads to multiply even faster on a multicore machine. We use sz parallel threads, one per row of c.

     typedef struct matrix_args {
         double *c;
         const double *a;
         const double *b;
         size_t sz;
         size_t i;
     } matrix_args;
 
     void *matrix_multiply_ikj_thread(void *arg) {
 (α)     matrix_args *m = (matrix_args *) arg;
 (β)     for (size_t j = 0; j < m->sz; ++j)
 (γ)         MELT(m->c, m->sz, m->i, j) = 0;
 (δ)     for (size_t k = 0; k < m->sz; ++k)
 (ε)         for (size_t j = 0; j < m->sz; ++j)
 (ζ)             MELT(m->c, m->sz, m->i, j) += MELT(m->a, m->sz, m->i, k) * MELT(m->b, m->sz, k, j);
 (η)     return NULL;
      }
 
     void matrix_multiply_ikj(double *c, const double *a, const double *b, size_t sz) {
 (1)     pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * sz);
 (2)     for (size_t i = 0; i < sz; ++i) {
 (3)         matrix_args m = { c, a, b, sz, i };
 (4)         int r = pthread_create(&threads[i], NULL, &matrix_multiply_ikj_thread, &m);
 (5)         assert(r == 0);
 (6)     }
 (7)     for (size_t i = 0; i < sz; ++i)
 (8)         pthread_join(threads[i], NULL);
 (9)     free(threads);
     }

But when run, this code gives wildly incorrect results.

QUESTION 9A. What is wrong? Describe why the problem is a synchronization issue.

The matrix_multiply_ikj function starts many threads, each with its own logically different set of matrix_args. But matrix_multiply_ikj allocates these matrix_args structures on the stack! The m variable is initialized on each loop iteration, but then the variable’s stack space is immediately reused for the next loop iteration. It is very likely that during a thread’s execution, its matrix_args have been replaced by other arguments. This is a synchronization issue because the code would work correctly if matrix_multiply_ikj_thread was called serially, rather than concurrently. The matrix_multiply_ikj_thread function must synchronize with matrix_multiply_ikj’s reuse of m.

QUESTION 9B. Write C code showing how the problem could be fixed with changes only to matrix_multiply_ikj. Refer to the numbered lines to indicate replacements and/or insertions. Use one or more additional heap allocations and no additional calls to pthread functions. Free any memory you allocate once it is safe to do so.

There are many solutions, but here’s one: we place each thread’s arguments in different, heap-allocated memory. This solves the synchronization issue by eliminating the memory reuse. New and changed lines are marked with ***.

     void matrix_multiply_ikj(double *c, const double *a, const double *b, size_t sz) {
 (1)     pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * sz);
   *     matrix_args *marr = (matrix_args *) malloc(sizeof(matrix_args) * sz);         
 (2)     for (size_t i = 0; i < sz; ++i) {
 (3)         matrix_args m = { c, a, b, sz, i };
   *         marr[i] = m;
   *         int r = pthread_create(&threads[i], NULL, &matrix_multiply_ikj_thread,
   *                                &marr[i]);
 (5)         assert(r == 0);
 (6)     }
 (7)     for (size_t i = 0; i < sz; ++i)
 (8)         pthread_join(threads[i], NULL);
 (9)     free(threads);
   *     free(marr);
     }

On single-core machines, the kij order performs almost as fast as the ikj order. Here’s a version of the parallel matrix multiplication code that uses kij.

     typedef struct matrix_args_kij {
         double *c;
         const double *a;
         const double *b;
         size_t sz;
         size_t k;
     } matrix_args_kij;
 
     void *matrix_multiply_kij_thread(void *arg) {
 (α)     matrix_args_kij *m = (matrix_args_kij *) arg;
 (β)     for (size_t i = 0; i < m->sz; ++i)
 (γ)         for (size_t j = 0; j < m->sz; ++j)
 (δ)             MELT(m->c, m->sz, i, j) += MELT(m->a, m->sz, i, m->k) * MELT(m->b, m->sz, m->k, j);
 (ε)     return NULL;
      }
 
     void matrix_multiply_kij(double *c, const double *a, const double *b, size_t sz) {
 (1)     pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * sz);
 (2)     for (size_t i = 0; i < sz; ++i)
 (3)         for (size_t j = 0; j < sz; ++j)
 (4)             MELT(c, sz, i, j) = 0;
 (5)     for (size_t k = 0; k < sz; ++k) {
 (6)         matrix_args_kij m = { c, a, b, sz, k };
 (7)         int r = pthread_create(&threads[k], NULL, &matrix_multiply_kij_thread, &m);
 (8)         assert(r == 0);
 (9)     }
(10)     for (size_t k = 0; k < sz; ++k)
(11)         pthread_join(threads[k], NULL);
(12)     free(threads);
     }

This problem has the same problem as the previous version, plus another problem. Even after your fix from 8A–8B is applied, this version produces incorrect results.

QUESTION 9C. What is the new problem? Describe why it is a synchronization issue.

The new problem is that now, different matrix_multiply_kij_thread functions might try to modify the same matrix element at the same time. This is a synchronization issue because the concurrent access to a matrix element might cause some updates to get lost. In the previous version, each matrix_multiply_ikj_thread thread worked on a different matrix row, so no synchronization was required.

QUESTION 9D. Write pseudocode or C code that fixes this problem. You should refer to pthread functions. For full credit your solution should have low contention.

The simplest solutions introduce mutual exclusion locking. This means different threads can’t modify the same matrix element simultaneously. To reduce contention, the locking should be fine-grained—for instance, there could be one lock per matrix element. But other tradeoffs are possible; one lock per matrix element is a lot; you could instead have one lock per matrix row or column.

Here’s working code, including the fix for the matrix_args reuse problem. (We didn’t require working code.)

     typedef struct matrix_args_kij {
         double *c;
         const double *a;
         const double *b;
  *      pthread_mutex_t *locks;
         size_t sz;
         size_t k;
     } matrix_args;
 
     void *matrix_multiply_kij_thread(void *arg) {
         matrix_args_kij *m = (matrix_args_kij *) arg;
         for (size_t i = 0; i < m->sz; ++i)
             for (size_t j = 0; j < m->sz; ++j) {
  *              pthread_mutex_lock(&m->locks[i * m->sz + j]);
                 MELT(m->c, m->sz, i, j) += MELT(m->a, m->sz, i, m->k) * MELT(m->b, m->sz, m->k, j);
  *              pthread_mutex_unlock(&m->locks[i * m->sz + j]);
             }
         return NULL;
      }
 
     void matrix_multiply_kij(double *c, const double *a, const double *b, size_t sz) {
         pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * sz);
  *      matrix_args_kij *marr = (matrix_args_kij *) malloc(sizeof(matrix_args_kij) * sz);
  *      pthread_mutex_t *locks = (pthread_mutex_t *) malloc(sizeof(pthread_mutex_t) * sz * sz);
         for (size_t i = 0; i < sz; ++i)
             for (size_t j = 0; j < sz; ++j) {
                 MELT(c, sz, i, j) = 0;
  *              pthread_mutex_init(&locks[i * sz + j], NULL);
             }
         for (size_t k = 0; k < sz; ++k) {
  *          matrix_args_kij m = { c, a, b, locks, sz, k };
  *          marr[k] = m;
  *          int r = pthread_create(&threads[k], NULL, &matrix_multiply_kij_thread,
                                    &marr[k]);
             assert(r == 0);
         }
         for (size_t k = 0; k < sz; ++k)
             pthread_join(threads[k], NULL);
         free(threads);
  *      free(marr);
  *      free(locks);
     }