diff --git a/openmp/01_openmp-demo.ipynb b/openmp/01_openmp-demo.ipynb new file mode 100644 index 0000000..031dc69 --- /dev/null +++ b/openmp/01_openmp-demo.ipynb @@ -0,0 +1,317 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2bf44eba-b903-40f1-9ef6-67fb8e2d9cc8", + "metadata": {}, + "source": [ + "# Introduction to OpenMP\n", + "\n", + "OpenMP (Open Multi-Processing) allows you to use multiple threads of the CPU when programming. It is an industry-standard API for shared-memory parallel programming, designed to help C++ developers take full advantage of multi-core processors without needing to manage low-level thread creation.\n", + "\n", + "## How it works\n", + "\n", + "1. The Fork-Join Model\n", + "OpenMP operates on a simple execution pattern:\n", + "\n", + "- The Master Thread: The program begins as a single serial thread.\n", + "- The Fork: When a parallel directive is reached, the master thread creates a team of worker threads.\n", + "- Parallel Execution: The work is distributed across these threads.\n", + "- The Join: Once the work is finished, the threads synchronize and terminate, leaving only the master thread to continue.\n", + "\n", + "2. Practical Implementation\n", + "- The most common use case is parallelising a for loop. By adding a single \"pragma\" line, you can distribute millions of calculations across your CPU cores.\n", + "\n", + "Here is an image of how the Fork-Join model works\n", + "\n", + "![fork-join](images/fork_join.png)" + ] + }, + { + "cell_type": "markdown", + "id": "0fb794c1-2999-43ae-9c9d-58060e4eb66f", + "metadata": {}, + "source": [ + "In this first example function we can see how to print Hello World normally. There is only standard c++ syntax." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b0c15570-ee24-42ed-b61f-11a3fc858b2d", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5001e441-1fa5-4bdc-9fa5-2ca103ae484f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello World!\n" + ] + } + ], + "source": [ + "void example1() {\n", + " std::cout << \"Hello World!\" << std::endl;\n", + "}\n", + "example1();" + ] + }, + { + "cell_type": "markdown", + "id": "f37a7cfd-f158-4ef9-a4cf-977296ccf5d4", + "metadata": {}, + "source": [ + "---\n", + "\n", + "Now, let's use OpenMP to run the same code in parallel. By adding a simple compiler directive, we tell the system to create a \"team\" of threads.\n", + "\n", + "```#pragma omp parallel``` This directive tells the compiler to execute the following block of code in parallel using multiple threads.\n", + "\n", + "The result is that instead of printing once, you will see \"Hello World!\" printed multiple times—once for every available logical core on your CPU." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "53fb7656-b72e-42bc-ade7-2ae2077142da", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello World!\n", + "Hello World!\n", + "Hello World!\n", + "Hello World!\n", + "Hello World!\n", + "Hello World!\n", + "Hello World!Hello World!\n", + "\n" + ] + } + ], + "source": [ + "void example2() {\n", + " #pragma omp parallel\n", + " {\n", + " std::cout << \"Hello World!\" << std::endl;\n", + " }\n", + "}\n", + "example2();" + ] + }, + { + "cell_type": "markdown", + "id": "f1568cd4-d407-49c7-98e4-26f971fdb63d", + "metadata": {}, + "source": [ + "---\n", + "\n", + "There is one thing we need to address before we continue\n", + "\n", + "When we use #pragma omp parallel, OpenMP spawns multiple threads that all execute the code inside the curly braces simultaneously. Every thread is trying to talk to our monitor at the exact same time.\n", + "\n", + "Thread A might finish sending \"Hello World!\" but, before it can send the newline, Thread B jumps in and prints its own \"Hello World!\". This results in two greetings on one line, followed by two newlines later.\n", + "\n", + "We fix this by using this by using ```#pragma omp critical```. It makes each thread wait for each other. This significantly slows down the program and loses the whole point of using multiple threads, but we are going to use it here in some examples to show you better how OpenMP works. Below we have a function that uses this fix." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "efcdfdb6-a60b-46af-8194-75ef9cc0e27f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello World! (0)\n", + "Hello World! (5)\n", + "Hello World! (6)\n", + "Hello World! (4)\n", + "Hello World! (1)\n", + "Hello World! (3)\n", + "Hello World! (7)\n", + "Hello World! 
(2)\n" + ] + } + ], + "source": [ + "void example3() {\n", + " #pragma omp parallel\n", + " {\n", + " #pragma omp critical\n", + " {\n", + " std::cout << \"Hello World! (\" << omp_get_thread_num() << \")\" << std::endl;\n", + " }\n", + " }\n", + "}\n", + "example3();" + ] + }, + { + "cell_type": "markdown", + "id": "a6123ee3-92b9-42fe-b1a0-57f9764569aa", + "metadata": {}, + "source": [ + "---\n", + "\n", + "Other things we can do in OMP, includes using a set number of threads that are going to be used: ```#pragma omp parallel num_threads(2)``` " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d86a9efa-ba28-4cb6-bbfc-abc00ee63506", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello World! (0)\n", + "Hello World! (6)\n", + "Hello World! (5)\n", + "Hello World! (3)\n", + "Hello World! (4)\n", + "Hello World! (1)\n", + "Hello World! (7)\n", + "Hello World! (2)\n", + "This is another message! (0)\n", + "Goodbye World! (Goodbye World! (01)\n", + ")\n" + ] + } + ], + "source": [ + "void example4() {\n", + " #pragma omp parallel\n", + " {\n", + " #pragma omp critical\n", + " {\n", + " std::cout << \"Hello World! (\" << omp_get_thread_num() << \")\" << std::endl;\n", + " }\n", + " }\n", + "\n", + " std::cout << \"This is another message! (\" << omp_get_thread_num() << \")\" << std::endl;\n", + "\n", + " #pragma omp parallel num_threads(2)\n", + " {\n", + " std::cout << \"Goodbye World! (\" << omp_get_thread_num() << \")\" << std::endl;\n", + " }\n", + "}\n", + "example4();" + ] + }, + { + "cell_type": "markdown", + "id": "25421d19-67e5-4bc9-9420-d96439cffba5", + "metadata": {}, + "source": [ + "---\n", + "\n", + "Now that you know basic syntax we can do a speed benchmark. We are going to time filling two arrays with 1 million elements each, adding them and calculating the average. In the average calculation there is a problem that can occur. 
In parallel programming, if multiple threads try to add to the same average variable at the same time, they will overwrite each other. When we use ```reduction(+:average)```, OpenMP does the following:\n", + "\n", + "- Each thread gets its own private mini-average variable initialized to 0.\n", + "- Each thread calculates the sum for its assigned chunk of the array (e.g., Thread 1 sums indices 0 to 1000, Thread 2 sums 1001 to 2000).\n", + "- Once all threads finish, OpenMP safely adds all those private mini-sums together into the final global average variable." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5557e01a-7c7d-4b54-8545-962ad11027df", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initialize a[] time: 0.657814\n", + "Initialize b[] time: 0.856986\n", + "Add arrays time: 0.581521\n", + "Average result time: 0.449791\n", + "Average: 5e+08\n", + "Total time: 2.54675\n" + ] + } + ], + "source": [ + "void example5() {\n", + " double start_time = omp_get_wtime();\n", + " double start_loop;\n", + " \n", + " const int N = 1000000000;\n", + " int* a = new int[N];\n", + " int* b = new int[N];\n", + " \n", + " start_loop = omp_get_wtime();\n", + " #pragma omp parallel for\n", + " for (int i=0; i\n", + "#include " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9078ac79-ca50-4fef-b785-37f35fec3cab", + "metadata": {}, + "outputs": [], + "source": [ + "static long num_steps = 100000000;\n", + "double step;" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3bee0992-3a6d-4da1-9020-3b6c153ecd0a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Num threads available: 8\n" + ] + } + ], + "source": [ + "int i, j, num_threads_allocated;\n", + "double pi, sum = 0.0;\n", + "double start_time, run_time;\n", + "\n", + "step = 1.0 / (double)num_steps;\n", + "printf(\"Num threads available: %d\\n\", 
omp_get_max_threads());" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9c3baec7-f369-4e28-bcdb-fd1fc7f985fc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Num threads allocated for this run: 1\n", + "pi is 3.141593 in 1.106737 seconds using 1 threads\n", + "\n", + "Num threads allocated for this run: 2\n", + "pi is 3.141593 in 0.551873 seconds using 2 threads\n", + "\n", + "Num threads allocated for this run: 3\n", + "pi is 3.141593 in 0.368655 seconds using 3 threads\n", + "\n", + "Num threads allocated for this run: 4\n", + "pi is 3.141593 in 0.289997 seconds using 4 threads\n", + "\n" + ] + } + ], + "source": [ + "for (i = 1; i <= 4; i++) {\n", + " sum = 0.0;\n", + " omp_set_num_threads(i);\n", + " start_time = omp_get_wtime();\n", + " #pragma omp parallel\n", + " {\n", + " double x;\n", + " num_threads_allocated = omp_get_num_threads();\n", + " \n", + " #pragma omp single\n", + " printf(\"Num threads allocated for this run: %d\\n\", num_threads_allocated);\n", + " \n", + " #pragma omp for reduction(+ : sum)\n", + " for (j = 1; j <= num_steps; j++) {\n", + " x = (j - 0.5) * step;\n", + " sum = sum + 4.0 / (1.0 + x * x);\n", + " }\n", + " }\n", + " pi = step * sum;\n", + " run_time = omp_get_wtime() - start_time;\n", + " printf(\"pi is %f in %f seconds using %d threads\\n\\n\", pi, run_time, num_threads_allocated);\n", + "}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++23 + OpenMP", + "language": "cpp", + "name": "xcpp23-omp" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "C++", + "nbconvert_exporter": "", + "pygments_lexer": "", + "version": "cxx23" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openmp/04_mandel.ipynb b/openmp/04_mandel.ipynb new file mode 100644 index 0000000..eb4aba8 --- /dev/null +++ b/openmp/04_mandel.ipynb @@ -0,0 +1,200 @@ +{ + "cells": [ 
+ { + "cell_type": "markdown", + "id": "7c545848-70df-4ec4-bea8-93969fc96aa9", + "metadata": {}, + "source": [ + "# Mandelbrot set\n", + "\n", + "This notebook estimates the area of the Mandelbrot set using parallel processing with OpenMP.\n", + "\n", + "### How the Mandelbrot Set Works\n", + "\n", + "1. **Grid Mapping**\n", + " - The grid indices are mapped to complex numbers within a specific range.\n", + " - For each point `c`, we check if iterating the function $z_{n+1} = z_n² + c$ remains bounded.\n", + "\n", + "2. **Iteration Process**\n", + " - Starting with `z = 0`, compute subsequent values using the formula.\n", + " - If at any iteration, the magnitude of `z` exceeds a threshold (here, 2), the point is marked as escaping and not part of the set.\n", + "\n", + "3. **Counting Escaping Points**\n", + " - `numoutside` keeps track of how many points escape within the given iterations.\n", + " - The area estimate is based on the proportion of these points within the grid.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5059dbdd-821d-498a-8716-eb0fcf8a8f5f", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "#include \n", + "#include " + ] + }, + { + "cell_type": "markdown", + "id": "f86e5fd6-f657-4848-8f80-780b5ca67678", + "metadata": {}, + "source": [ + "---\n", + "1. **Constants Definition**:\n", + " - `NPOINTS = 5000`: The grid size, creating a 5000x5000 grid.\n", + " - `MAXITER = 1000`: Maximum iterations per point to determine if it escapes." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8b66f96a-14ef-4f23-8024-bcfc42b31e4e", + "metadata": {}, + "outputs": [], + "source": [ + "#define NPOINTS 5000\n", + "#define MAXITER 1000\n", + "\n", + "int numoutside = 0;" + ] + }, + { + "cell_type": "markdown", + "id": "dfd58cbd-798b-472c-869e-5a9de42a502e", + "metadata": {}, + "source": [ + "---\n", + "2. 
Function testpoint:\n", + " - Takes complex number parameters `creal` and `cimag`.\n", + " - Initializes `zreal` and `zimag` to 0.\n", + " - Iterates up to `MAXITER`, checking if `(zreal^2 + zimag^2) > 4.0`. If so, the point escapes, incrementing `numoutside`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5c35c479-2f79-46b7-bc66-24be6b1694e0", + "metadata": {}, + "outputs": [], + "source": [ + "void testpoint(double creal, double cimag) {\n", + " double zreal = 0.0; // Start at 0\n", + " double zimag = 0.0; \n", + " double r2, i2;\n", + " int iter;\n", + "\n", + " for (iter = 0; iter < MAXITER; iter++) {\n", + " r2 = zreal * zreal;\n", + " i2 = zimag * zimag;\n", + " \n", + " if ((r2 + i2) > 4.0) {\n", + " #pragma omp atomic // Safety check if not using reduction correctly\n", + " numoutside++;\n", + " return;\n", + " }\n", + " \n", + " zimag = 2.0 * zreal * zimag + cimag;\n", + " zreal = r2 - i2 + creal;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "53fb8c2a-5b6e-41bd-a6eb-d31ab6c94ad0", + "metadata": {}, + "source": [ + "---\n", + "\n", + "3. **Main Function**\n", + " - Maps grid indices to complex numbers within a specific range using scaling factors:\n", + " ```\n", + " creal = (i * (-1.0 / ((double)(NPOINTS - 1)))) + (-1.0);\n", + " cimag = (j * (-1.0 / ((double)(NPOINTS - 1)))) + (-0.75);\n", + " ```\n", + " - Uses `#pragma omp parallel for` to distribute the computation across threads.\n", + " - `default(shared)` means variables not specified are shared among all threads.\n", + " - `private(i, j, creal, cimag)` ensures each thread has its own copy of these variables.\n", + " - `reduction(+:numoutside)` accumulates the total count of escaping points.\n", + "\n", + "---\n", + "3. **Area Calculation**:\n", + " - Computes area as `(2.0 * 2.5 * 1.125) * ((NPOINTS^2 - numoutside) / NPOINTS^2)`\n", + " - Error estimate is derived by dividing area by `NPOINTS`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ea116fef-7d05-4e29-97a1-55c85c7241d8", + "metadata": {}, + "outputs": [], + "source": [ + "int main() {\n", + " int i, j;\n", + " double area, error, eps = 1.0e-5;\n", + " double cimag, creal;\n", + " // Loop over grid of points in the complex plane which contains the Mandelbrot set,\n", + " // testing each point to see whether it is inside or outside the set.\n", + "\n", + "// i, j, creal, and cimag MUST be private so threads don't overwrite each other\n", + "#pragma omp parallel for default(shared) private(i, j, creal, cimag) reduction(+:numoutside)\n", + "for (i = 0; i < NPOINTS; i++) {\n", + " for (j = 0; j < NPOINTS; j++) {\n", + " creal = -2.0 + 2.5 * (double)(i) / (double)(NPOINTS) + eps;\n", + " cimag = 1.125 * (double)(j) / (double)(NPOINTS) + eps;\n", + " testpoint(creal, cimag);\n", + " }\n", + "}\n", + "\n", + " // Calculate area of set and error estimate and output the results\n", + " area = 2.0 * 2.5 * 1.125 * (double) (NPOINTS * NPOINTS - numoutside) / (double) (NPOINTS * NPOINTS);\n", + " error = area / (double) NPOINTS;\n", + "\n", + " printf(\"Area of Mandlebrot set = %12.8f +/- %12.8f\\n\", area, error);\n", + " printf(\"Correct answer should be around 1.510659\\n\");\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "39cf129c-8106-4e67-a2f1-1a7fff17cd38", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Area of Mandlebrot set = 1.51064145 +/- 0.00030213\n", + "Correct answer should be around 1.510659\n" + ] + } + ], + "source": [ + "main();" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++23 + OpenMP", + "language": "cpp", + "name": "xcpp23-omp" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "C++", + "nbconvert_exporter": "", + "pygments_lexer": "", + "version": "cxx23" + } + }, + "nbformat": 4, 
+ "nbformat_minor": 5 +} diff --git a/openmp/linked_list.ipynb b/openmp/05_linked_list.ipynb similarity index 60% rename from openmp/linked_list.ipynb rename to openmp/05_linked_list.ipynb index 005380b..fb4a337 100644 --- a/openmp/linked_list.ipynb +++ b/openmp/05_linked_list.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "4de82d38-88d2-4d56-8ed1-3a33cdd4dac1", + "metadata": {}, + "source": [ + "# Linked List\n", + "\n", + "The program builds a linked list of 6 nodes, each storing a Fibonacci input (38 through 43), then uses parallel programming to compute the Fibonacci value for every node in parallel. A single thread walks the list and spawns one task per node, each task calling the slow recursive `fib()` on its own copy of the pointer `firstprivate(p)`. Once all tasks finish, the results are printed and memory is freed.\n", + "\n", + "### What is a linked list?\n", + "\n", + "A linked list is a chain of nodes where each node holds some data and a pointer to the next node. In this code, each node stores a Fibonacci input (data), its result (fibdata), and a next pointer to the following node. The last node's next is NULL, signaling the end of the chain. Instead of storing elements in contiguous memory like an array, each node lives at a random location in the heap (via malloc) and they are connected only through those next pointers - so to reach node 4 you must walk through nodes 1, 2, and 3 in order." + ] + }, { "cell_type": "code", "execution_count": 1, @@ -29,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "22f97c49-78d1-496e-ac7c-978aed95331a", "metadata": {}, "outputs": [], @@ -41,9 +55,17 @@ "};" ] }, + { + "cell_type": "markdown", + "id": "afc7ee35-8fd1-45de-be44-f987d98b7808", + "metadata": {}, + "source": [ + "Here we can see the 3 main functions used for this program. 
" + ] + }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "b16b1e8a-8831-4b8d-9d57-09deeaaa88ee", "metadata": {}, "outputs": [], @@ -53,9 +75,19 @@ "int fib(int n);" ] }, + { + "cell_type": "markdown", + "id": "10231105-2ab9-44e9-9878-24d128a473c6", + "metadata": {}, + "source": [ + "### Fibonachi function\n", + "\n", + "`int fib(int n)`: recursively computes the N-th Fibonacci number by breaking it down into fib(n-1) + fib(n-2) until it hits the base cases 0 or 1, then returns the final integer result." + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "0ef8af6c-1d6f-4c68-84bc-3dd1d8092b06", "metadata": {}, "outputs": [], @@ -72,9 +104,19 @@ "}" ] }, + { + "cell_type": "markdown", + "id": "a13acd85-15f5-4ac7-9312-5288cd8ad598", + "metadata": {}, + "source": [ + "### Process work\n", + "\n", + "`void processwork(struct node *p)`: takes a single node, computes the Fibonacci number of its data field by calling fib(), and stores the result in its fibdata field. Returns nothing." + ] + }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "1fa0307d-fdc9-4503-95cb-1c6448791354", "metadata": {}, "outputs": [], @@ -88,9 +130,19 @@ "}" ] }, + { + "cell_type": "markdown", + "id": "a35c79a7-a72e-4fa8-81ba-6bdc0f18e0a9", + "metadata": {}, + "source": [ + "### initialize list\n", + "\n", + "`struct node *init_list(struct node *p)`: builds the linked list in memory using malloc, creates N+1 nodes with Fibonacci inputs starting from FS, links them together, and returns a pointer to the head (first node)." 
+ ] + }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "03acb599-9329-49ff-8aff-c0902adb6c3c", "metadata": {}, "outputs": [], @@ -117,9 +169,19 @@ "}" ] }, + { + "cell_type": "markdown", + "id": "111c0742-1c27-40ad-a261-7ed85e379d10", + "metadata": {}, + "source": [ + "## The main code\n", + "\n", + "`main` prints some info messages, sets OpenMP to use the maximum available threads, builds the linked list via `init_list`, then starts a timer. Inside the parallel region, one thread walks the list and creates an independent OpenMP task for each node, while all 8 threads grab and execute those tasks concurrently — each computing its Fibonacci number via `processwork`. Once all tasks finish, the timer stops, the results are printed node by node, memory is freed, and the total compute time is displayed." + ] + }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "f2dfb41b-e55f-43c0-b7f6-546a1697acb1", "metadata": {}, "outputs": [], @@ -177,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "353e5dfd-fcae-43e6-97e3-ec98070811a1", "metadata": {}, "outputs": [ @@ -195,7 +257,7 @@ "41 : 165580141\n", "42 : 267914296\n", "43 : 433494437\n", - "Compute Time: 2.617225 seconds\n" + "Compute Time: 4.668121 seconds\n" ] } ], @@ -206,7 +268,7 @@ ], "metadata": { "kernelspec": { - "display_name": "C++23 (xcpp+OpenMP)", + "display_name": "C++23 + OpenMP", "language": "cpp", "name": "xcpp23-omp" }, @@ -215,7 +277,9 @@ "file_extension": ".cpp", "mimetype": "text/x-c++src", "name": "C++", - "version": "23" + "nbconvert_exporter": "", + "pygments_lexer": "", + "version": "cxx23" } }, "nbformat": 4, diff --git a/openmp/06_race_conditions.ipynb b/openmp/06_race_conditions.ipynb new file mode 100644 index 0000000..9c898a4 --- /dev/null +++ b/openmp/06_race_conditions.ipynb @@ -0,0 +1,513 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fc192d05", + "metadata": {}, + "source": [ + 
"# Race Conditions\n", + "\n", + "A **race condition** occurs when two or more threads access shared memory simultaneously, and at least one of them is writing. The result depends on the unpredictable order of execution - making bugs hard to reproduce and debug.\n", + "\n", + "---\n", + "## 1. The Classic Race Condition - Parallel Counter\n", + "\n", + "We want to increment a shared counter 1 000 000 times using multiple threads.\n", + "Intuitively `counter++` looks like one action, but it compiles to **three** instructions:\n", + "\n", + "```\n", + "LOAD reg ← counter ; read current value\n", + "ADD reg, 1 ; increment in register\n", + "STORE counter ← reg ; write back\n", + "```\n", + "\n", + "If two threads execute these steps concurrently, one write will silently overwrite the other - an **increment is lost**." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3e2e951e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Run 1: counter = 458255 (expected 1000000, lost 541745)\n", + "Run 2: counter = 340448 (expected 1000000, lost 659552)\n", + "Run 3: counter = 367539 (expected 1000000, lost 632461)\n", + "Run 4: counter = 367803 (expected 1000000, lost 632197)\n", + "Run 5: counter = 405751 (expected 1000000, lost 594249)\n" + ] + } + ], + "source": [ + "#include \n", + "#include \n", + "#define N 1000000\n", + "\n", + "void example1()\n", + "{\n", + " int counter = 0;\n", + " \n", + " // Run the experiment 5 times to show non-determinism\n", + " for (int run = 0; run < 5; run++) {\n", + " counter = 0;\n", + " #pragma omp parallel for num_threads(4)\n", + " {\n", + " for (int i = 0; i < N; i++) {\n", + " counter++; // RACE CONDITION: no protection\n", + " }\n", + " }\n", + " printf(\"Run %d: counter = %d (expected %d, lost %d)\\n\",\n", + " run+1, counter, N, N - counter);\n", + " }\n", + "}\n", + "\n", + "example1();" + ] + }, + { + "cell_type": "markdown", + "id": "6ab1f67b", + "metadata": 
{}, + "source": [ + "### What you should observe\n", + "\n", + "- The counter is **almost never** 1 000 000.\n", + "- Each run gives a **different** wrong answer - the race is truly non-deterministic.\n", + "\n", + "> **Key insight**: You cannot rely on \"it worked in my tests\" for racy code. On a lightly loaded machine you might get lucky; under production load the bug emerges." + ] + }, + { + "cell_type": "markdown", + "id": "8d67c01b", + "metadata": {}, + "source": [ + "---\n", + "## 2. Fix #1 - `#pragma omp critical`\n", + "\n", + "`critical` creates a **named mutex**: only one thread at a time may execute the enclosed block. It is the most general but also the most expensive fix." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bb9dac12", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "critical: counter = 1000000, time = 0.1604 s\n" + ] + } + ], + "source": [ + "#include\n", + "\n", + "void example2() {\n", + " int counter = 0;\n", + " double t0 = omp_get_wtime();\n", + "\n", + " #pragma omp parallel for num_threads(4)\n", + " for (int i = 0; i < N; i++) {\n", + " #pragma omp critical\n", + " {\n", + " counter++;\n", + " }\n", + " }\n", + "\n", + " double elapsed = omp_get_wtime() - t0;\n", + " printf(\"critical: counter = %d, time = %.4f s\\n\", counter, elapsed);\n", + "}\n", + "\n", + "example2();" + ] + }, + { + "cell_type": "markdown", + "id": "ec744b0d", + "metadata": {}, + "source": [ + "The counter is now correct. But notice: with `critical`, threads spend most of their time **queuing at the mutex** rather than doing work. This makes parallel code *slower* than sequential for fine-grained operations.\n", + "\n", + "**When to use `critical`**: When the protected block is non-trivial (multiple statements, complex logic) and cannot be expressed as a single atomic operation." 
+ ] + }, + { + "cell_type": "markdown", + "id": "8762443d", + "metadata": {}, + "source": [ + "---\n", + "## 3. Fix #2 - `#pragma omp atomic`\n", + "\n", + "`atomic` maps directly to a hardware atomic instruction (e.g., `LOCK XADD` on x86). It is **much faster** than `critical` for simple read-modify-write operations." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8358fa20", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "atomic: counter = 1000000, time = 0.0168 s\n" + ] + } + ], + "source": [ + "void example3() {\n", + " int counter = 0;\n", + " double t0 = omp_get_wtime();\n", + "\n", + " #pragma omp parallel for num_threads(4)\n", + " for (int i = 0; i < N; i++) {\n", + " #pragma omp atomic\n", + " counter++; // maps to a single atomic hardware instruction\n", + " }\n", + "\n", + " double elapsed = omp_get_wtime() - t0;\n", + " printf(\"atomic: counter = %d, time = %.4f s\\n\", counter, elapsed);\n", + "}\n", + "\n", + "example3();" + ] + }, + { + "cell_type": "markdown", + "id": "6c34b191", + "metadata": {}, + "source": [ + "### `atomic` clauses (OpenMP 3.1+)\n", + "\n", + "| Clause | Effect |\n", + "|--------|--------|\n", + "| `atomic update` (default) | `x++`, `x--`, `x += expr`, etc. |\n", + "| `atomic read` | `v = x` atomically |\n", + "| `atomic write` | `x = expr` atomically |\n", + "| `atomic capture` | `v = x; x op= expr` - read + update together |\n", + "\n", + "**Limitation**: `atomic` only works for a *single* variable with a *single* supported operator. Use `critical` for anything more complex." + ] + }, + { + "cell_type": "markdown", + "id": "ce4969eb", + "metadata": {}, + "source": [ + "---\n", + "## 4. Fix #3 - `reduction` clause (the fastest approach)\n", + "\n", + "The `reduction` clause is the idiomatic OpenMP solution for accumulation. 
Each thread gets a **private copy** of the variable, works independently (no synchronisation!), and OpenMP merges the results at the end of the parallel region.\n", + "\n", + "This is a **divide-and-conquer** strategy: eliminate the shared state entirely during the parallel phase." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cf2a0336", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reduction: counter = 1000000, time = 0.0005 s\n" + ] + } + ], + "source": [ + "void example4() {\n", + " int counter = 0;\n", + " double t0 = omp_get_wtime();\n", + "\n", + " #pragma omp parallel for num_threads(4) reduction(+:counter)\n", + " for (int i = 0; i < N; i++) {\n", + " counter++; // each thread increments its own private copy\n", + " }\n", + "\n", + " double elapsed = omp_get_wtime() - t0;\n", + " printf(\"reduction: counter = %d, time = %.4f s\\n\", counter, elapsed);\n", + "}\n", + "\n", + "example4();" + ] + }, + { + "cell_type": "markdown", + "id": "e1b99804", + "metadata": {}, + "source": [ + "### Supported reduction operators\n", + "\n", + "| Operator | Initial value | Use case |\n", + "|----------|--------------|----------|\n", + "| `+` | 0 | Sum |\n", + "| `*` | 1 | Product |\n", + "| `-` | 0 | Subtraction accumulation |\n", + "| `min` | type max | Minimum value |\n", + "| `max` | type min | Maximum value |\n", + "| `&` | ~0 | Bitwise AND |\n", + "| `\\|` | 0 | Bitwise OR |\n", + "| `^` | 0 | Bitwise XOR |\n", + "| `&&` | 1 | Logical AND |\n", + "| `\\|\\|` | 0 | Logical OR |" + ] + }, + { + "cell_type": "markdown", + "id": "572cbe98", + "metadata": {}, + "source": [ + "---\n", + "## 5. 
Benchmark - Compare All Approaches" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f67d05a5", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sequential: 0.001751 s\n", + "critical: 0.165203 s\n", + "atomic: 0.019443 s\n", + "reduction: 0.000474 s\n" + ] + } + ], + "source": [ + "#define THREADS 4\n", + "#define RUNS 5\n", + "\n", + "double bench_sequential() {\n", + " long long c = 0;\n", + " double t0 = omp_get_wtime();\n", + " for (int i = 0; i < N; i++) c++;\n", + " return omp_get_wtime() - t0;\n", + "}\n", + "\n", + "double bench_critical() {\n", + " long long c = 0;\n", + " double t0 = omp_get_wtime();\n", + " #pragma omp parallel for num_threads(THREADS)\n", + " for (int i = 0; i < N; i++) {\n", + " #pragma omp critical\n", + " c++;\n", + " }\n", + " return omp_get_wtime() - t0;\n", + "}\n", + "\n", + "double bench_atomic() {\n", + " long long c = 0;\n", + " double t0 = omp_get_wtime();\n", + " #pragma omp parallel for num_threads(THREADS)\n", + " for (int i = 0; i < N; i++) {\n", + " #pragma omp atomic\n", + " c++;\n", + " }\n", + " return omp_get_wtime() - t0;\n", + "}\n", + "\n", + "double bench_reduction() {\n", + " long long c = 0;\n", + " double t0 = omp_get_wtime();\n", + " #pragma omp parallel for num_threads(THREADS) reduction(+:c)\n", + " for (int i = 0; i < N; i++) c++;\n", + " return omp_get_wtime() - t0;\n", + "}\n", + "\n", + "double avg(double (*fn)()) {\n", + " double s = 0;\n", + " for (int i = 0; i < RUNS; i++) s += fn();\n", + " return s / RUNS;\n", + "}\n", + "\n", + "printf(\"sequential: %.6f s\\n\", avg(bench_sequential));\n", + "printf(\"critical: %.6f s\\n\", avg(bench_critical));\n", + "printf(\"atomic: %.6f s\\n\", avg(bench_atomic));\n", + "printf(\"reduction: %.6f s\\n\", avg(bench_reduction));" + ] + }, + { + "cell_type": "markdown", + "id": "a26c94cb", + "metadata": {}, + "source": [ + "### Reading the results\n", + "\n", + "| Approach 
| Why it's slow/fast |\n", + "|----------|--------------------|\n", + "| **sequential** | Baseline: single-threaded, no sync overhead |\n", + "| **critical** | Threads serialise at the mutex - effectively sequential *plus* lock overhead |\n", + "| **atomic** | Hardware atomic instruction: much cheaper than a mutex, but still causes cache-line bouncing between cores |\n", + "| **reduction** | No synchronisation during the loop; one merge at the end - fastest of the parallel options |" + ] + }, + { + "cell_type": "markdown", + "id": "71d7fe12", + "metadata": {}, + "source": [ + "---\n", + "## 6. A More Realistic Example - Histogram\n", + "\n", + "Counters are a toy problem. A **histogram** is a real-world accumulation where multiple bins are updated, making `reduction` over a single variable insufficient. This shows `critical` vs `atomic` in a meaningful context." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "abcaac25", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "atomic (per element): 0.1241 s\n", + "private + merge: 0.0066 s (speedup vs atomic: 18.9x)\n", + "\n", + "Histogram (should sum to 10000000):\n", + " bin[ 0] = 623773\n", + " bin[ 1] = 624866\n", + " bin[ 2] = 624957\n", + " bin[ 3] = 624934\n", + " bin[ 4] = 625238\n", + " bin[ 5] = 626471\n", + " bin[ 6] = 625747\n", + " bin[ 7] = 624270\n", + " bin[ 8] = 624621\n", + " bin[ 9] = 624898\n", + " bin[10] = 624977\n", + " bin[11] = 625217\n", + " bin[12] = 624620\n", + " bin[13] = 625424\n", + " bin[14] = 625240\n", + " bin[15] = 624747\n", + "Total: 10000000\n" + ] + } + ], + "source": [ + "#include \n", + "#include \n", + "\n", + "#define N_SAMPLES 10000000\n", + "#define N_BINS 16\n", + "\n", + "// Approach A: atomic update per bin\n", + "double histogram_atomic(int *data, long long *hist) {\n", + " memset(hist, 0, N_BINS * sizeof(long long));\n", + " double t0 = omp_get_wtime();\n", + " #pragma omp parallel for 
num_threads(THREADS)\n", + " for (int i = 0; i < N_SAMPLES; i++) {\n", + " #pragma omp atomic\n", + " hist[data[i]]++;\n", + " }\n", + " return omp_get_wtime() - t0;\n", + "}\n", + "\n", + "// Approach B: private histogram per thread, merge at end\n", + "double histogram_private(int *data, long long *hist) {\n", + " memset(hist, 0, N_BINS * sizeof(long long));\n", + " double t0 = omp_get_wtime();\n", + " #pragma omp parallel num_threads(THREADS)\n", + " {\n", + " long long local[N_BINS] = {0}; // thread-private copy\n", + " #pragma omp for\n", + " for (int i = 0; i < N_SAMPLES; i++)\n", + " local[data[i]]++;\n", + " // Merge: each bin is independent, so atomic is fine here\n", + " for (int b = 0; b < N_BINS; b++) {\n", + " #pragma omp atomic\n", + " hist[b] += local[b];\n", + " }\n", + " }\n", + " return omp_get_wtime() - t0;\n", + "}\n", + "\n", + "// Generate random data\n", + "int *data = (int*)malloc(N_SAMPLES * sizeof(int));\n", + "srand(42);\n", + "for (int i = 0; i < N_SAMPLES; i++) data[i] = rand() % N_BINS;\n", + "\n", + "long long hist[N_BINS];\n", + "\n", + "double t_atomic = histogram_atomic(data, hist);\n", + "double t_private = histogram_private(data, hist);\n", + "\n", + "printf(\"atomic (per element): %.4f s\\n\", t_atomic);\n", + "printf(\"private + merge: %.4f s (speedup vs atomic: %.1fx)\\n\",\n", + " t_private, t_atomic / t_private);\n", + "\n", + "// Print histogram to verify correctness\n", + "long long total = 0;\n", + "printf(\"\\nHistogram (should sum to %d):\\n\", N_SAMPLES);\n", + "for (int b = 0; b < N_BINS; b++) { printf(\" bin[%2d] = %lld\\n\", b, hist[b]); total += hist[b]; }\n", + "printf(\"Total: %lld\\n\", total);\n", + "\n", + "free(data);" + ] + }, + { + "cell_type": "markdown", + "id": "6371ac17", + "metadata": {}, + "source": [ + "The **private histogram** pattern is the general-purpose technique when you can't express the accumulation as a single built-in `reduction`. 
It trades memory (one copy of the histogram per thread) for speed (zero contention during the loop).\n", + "\n", + "> **Rule of thumb**: If threads all write to the *same* bin at the same time, cache-line contention kills performance even with `atomic`. Private copies eliminate this entirely." + ] + }, + { + "cell_type": "markdown", + "id": "be248cdf", + "metadata": {}, + "source": [ + "---\n", + "## 7. Summary - When to Use What\n", + "\n", + "| Mechanism | Overhead | Flexibility | Best for |\n", + "|-----------|----------|-------------|----------|\n", + "| `reduction` | **Lowest** | Only built-in ops on loop variable | Sums, products, min/max in loops |\n", + "| `atomic` | Low | Single variable, supported operators | Simple shared counters/flags |\n", + "| `critical` | **Highest** | Arbitrary code | Complex shared-state updates |\n", + "| Private + merge | Low (memory) | Anything | Histograms, complex accumulators |" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++23 + OpenMP", + "language": "cpp", + "name": "xcpp23-omp" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "C++", + "nbconvert_exporter": "", + "pygments_lexer": "", + "version": "cxx23" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/openmp/07_false_sharing.ipynb b/openmp/07_false_sharing.ipynb new file mode 100644 index 0000000..4419172 --- /dev/null +++ b/openmp/07_false_sharing.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# False Sharing\n", + "\n", + "You fixed the race condition.\n", + "Your atomic code is correct.\n", + "But it's still slow... why? 
→ **False Sharing**\n", + "\n", + "---\n", + "\n", + "## What is a Cache Line?\n", + "\n", + "CPUs don't load individual bytes from RAM - they load chunks called **cache lines** (typically **64 bytes**).\n", + "\n", + "When Thread 0 writes to `hist[0]`, the CPU loads the **entire 64-byte line** into Thread 0's cache. \n", + "When Thread 1 writes to `hist[1]`, it **invalidates** that line on Thread 0's cache - even though they touched different variables!\n", + "\n", + "This is **false sharing** - the sharing is false because the threads aren't actually sharing data, but the CPU thinks they are." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Threads: 4\n", + "Iterations: 100000000\n", + "Cache line: 64 bytes\n", + "sizeof(int) = 4 bytes\n", + "4 ints fit in 16 bytes (one cache line = 64 bytes)\n" + ] + } + ], + "source": [ + "#include \n", + "#include \n", + "#include \n", + "#include \n", + "\n", + "#define THREADS 4\n", + "#define ITERATIONS 100000000 // 100 million\n", + "#define CACHE_LINE 64 // bytes per cache line\n", + "\n", + "printf(\"Threads: %d\\n\", THREADS);\n", + "printf(\"Iterations: %d\\n\", ITERATIONS);\n", + "printf(\"Cache line: %d bytes\\n\", CACHE_LINE);\n", + "printf(\"sizeof(int) = %zu bytes\\n\", sizeof(int));\n", + "printf(\"4 ints fit in %zu bytes (one cache line = %d bytes)\\n\",\n", + " 4 * sizeof(int), CACHE_LINE);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Experiment 1 - False Sharing (Bad)\n", + "\n", + "Each thread writes to its own `counter[thread_id]` - **different variables**, so no race condition. \n", + "But all 4 counters fit in **one cache line** → every write invalidates the line for all other threads." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "False sharing: 0.2637 s (total = 100000000)\n" + ] + } + ], + "source": [ + "void false_sharing_demo() {\n", + " // All 4 counters sit next to each other in memory = one cache line\n", + " int counters[THREADS];\n", + " memset(counters, 0, sizeof(counters));\n", + "\n", + " double t0 = omp_get_wtime();\n", + "\n", + " #pragma omp parallel num_threads(THREADS)\n", + " {\n", + " int tid = omp_get_thread_num();\n", + " for (int i = 0; i < ITERATIONS / THREADS; i++) {\n", + " counters[tid]++; // No race condition, but FALSE SHARING!\n", + " }\n", + " }\n", + "\n", + " double elapsed = omp_get_wtime() - t0;\n", + "\n", + " long long total = 0;\n", + " for (int t = 0; t < THREADS; t++) total += counters[t];\n", + " printf(\"False sharing: %.4f s (total = %lld)\\n\", elapsed, total);\n", + "}\n", + "\n", + "false_sharing_demo();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Experiment 2 - No False Sharing (Padded)\n", + "\n", + "We pad each counter so it occupies a **full 64-byte cache line** by itself. 
\n", + "Now each thread's counter is on its own cache line → writes don't interfere.\n", + "\n", + "```\n", + "Cache line 1 (64 bytes) Cache line 2 (64 bytes)\n", + "┌────────┬──────────────────┐ ┌────────┬──────────────────┐\n", + "│counter0│ padding(60B) │ │counter1│ padding(60B) │\n", + "└────────┴──────────────────┘ └────────┴──────────────────┘\n", + " Thread 0 only Thread 1 only\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Padded (no FS): 0.0347 s (total = 100000000)\n" + ] + } + ], + "source": [ + "void no_false_sharing_demo() {\n", + " // Pad each counter to 64 bytes so each lives on its own cache line\n", + " struct {\n", + " int value;\n", + " char padding[60]; // 4 + 60 = 64 bytes = one full cache line\n", + " } counters[THREADS];\n", + "\n", + " for (int t = 0; t < THREADS; t++) counters[t].value = 0;\n", + "\n", + " double t0 = omp_get_wtime();\n", + "\n", + " #pragma omp parallel num_threads(THREADS)\n", + " {\n", + " int tid = omp_get_thread_num();\n", + " for (int i = 0; i < ITERATIONS / THREADS; i++) {\n", + " counters[tid].value++; // Each thread has its own cache line\n", + " }\n", + " }\n", + "\n", + " double elapsed = omp_get_wtime() - t0;\n", + "\n", + " long long total = 0;\n", + " for (int t = 0; t < THREADS; t++) total += counters[t].value;\n", + " printf(\"Padded (no FS): %.4f s (total = %lld)\\n\", elapsed, total);\n", + "}\n", + "\n", + "no_false_sharing_demo();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Experiment 3 - Private Variables (Best)\n", + "\n", + "Even better than padding: use **thread-private local variables** on the stack. \n", + "Stack variables are guaranteed to be in the thread's own memory - no sharing at all." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Private vars: 0.0433 s (total = 100000000)\n" + ] + } + ], + "source": [ + "void private_vars_demo() {\n", + " long long total = 0;\n", + "\n", + " double t0 = omp_get_wtime();\n", + "\n", + " #pragma omp parallel num_threads(THREADS) reduction(+:total)\n", + " {\n", + " long long local = 0; // lives on THIS thread's stack - completely private\n", + " for (int i = 0; i < ITERATIONS / THREADS; i++) {\n", + " local++; // no sharing of any kind\n", + " }\n", + " total += local; // reduction merges at the end\n", + " }\n", + "\n", + " double elapsed = omp_get_wtime() - t0;\n", + " printf(\"Private vars: %.4f s (total = %lld)\\n\", elapsed, total);\n", + "}\n", + "\n", + "private_vars_demo();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Experiment 4 - All Three Side by Side with Speedup" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Method Time Speedup vs FS\n", + "--------------------------------------------\n", + " False sharing: 0.2729 s 1.0x\n", + " Padded: 0.0315 s 8.7x\n", + " Private vars: 0.0425 s 6.4x\n" + ] + } + ], + "source": [ + "void run_all() {\n", + " double t0, t_fs, t_pad, t_priv;\n", + " long long total;\n", + "\n", + " //False Sharing\n", + " int counters_bad[THREADS];\n", + " memset(counters_bad, 0, sizeof(counters_bad));\n", + " t0 = omp_get_wtime();\n", + " #pragma omp parallel num_threads(THREADS)\n", + " {\n", + " int tid = omp_get_thread_num();\n", + " for (int i = 0; i < ITERATIONS / THREADS; i++)\n", + " counters_bad[tid]++;\n", + " }\n", + " t_fs = omp_get_wtime() - t0;\n", + "\n", + " //Padded\n", + " struct { int value; char pad[60]; } counters_good[THREADS];\n", + " for (int t = 0; t < THREADS; t++) counters_good[t].value = 
0;\n", + " t0 = omp_get_wtime();\n", + " #pragma omp parallel num_threads(THREADS)\n", + " {\n", + " int tid = omp_get_thread_num();\n", + " for (int i = 0; i < ITERATIONS / THREADS; i++)\n", + " counters_good[tid].value++;\n", + " }\n", + " t_pad = omp_get_wtime() - t0;\n", + "\n", + " //Private\n", + " total = 0;\n", + " t0 = omp_get_wtime();\n", + " #pragma omp parallel num_threads(THREADS) reduction(+:total)\n", + " {\n", + " long long local = 0;\n", + " for (int i = 0; i < ITERATIONS / THREADS; i++)\n", + " local++;\n", + " total += local;\n", + " }\n", + " t_priv = omp_get_wtime() - t0;\n", + "\n", + " //Results\n", + " printf(\" Method Time Speedup vs FS\\n\");\n", + " printf(\"--------------------------------------------\\n\");\n", + " printf(\" False sharing: %.4f s 1.0x\\n\", t_fs);\n", + " printf(\" Padded: %.4f s %.1fx\\n\", t_pad, t_fs / t_pad);\n", + " printf(\" Private vars: %.4f s %.1fx\\n\", t_priv, t_fs / t_priv);\n", + "}\n", + "\n", + "run_all();" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++23 + OpenMP", + "language": "cpp", + "name": "xcpp23-omp" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "C++", + "nbconvert_exporter": "", + "pygments_lexer": "", + "version": "cxx23" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/openmp/08_game_of_life.ipynb b/openmp/08_game_of_life.ipynb new file mode 100644 index 0000000..09f39dc --- /dev/null +++ b/openmp/08_game_of_life.ipynb @@ -0,0 +1,623 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conway's Game of Life - Parallel with OpenMP\n", + "\n", + "## The Rules (just 4)\n", + "\n", + "Each cell is either **alive** (1) or **dead** (0). 
Every generation, each cell looks at its **8 neighbours**:\n", + "\n", + "```\n", + "┌───┬───┬───┐\n", + "│ N │ N │ N │\n", + "├───┼───┼───┤\n", + "│ N │ X │ N │ X = current cell, N = neighbours\n", + "├───┼───┼───┤\n", + "│ N │ N │ N │\n", + "└───┴───┴───┘\n", + "```\n", + "\n", + "| Cell state | Neighbours | Next state | Rule name |\n", + "|---|---|---|---|\n", + "| Alive | < 2 | Dies | Underpopulation |\n", + "| Alive | 2 or 3 | Survives | Survival |\n", + "| Alive | > 3 | Dies | Overpopulation |\n", + "| Dead | exactly 3 | Born | Reproduction |\n", + "\n", + "Simple rules → incredibly complex emergent behaviour.\n", + "\n", + "---\n", + "\n", + "## The Parallelism Problem\n", + "\n", + "Every cell needs to **read** its neighbours and **write** its new state simultaneously.\n", + "If we read and write the same grid, a cell updated early would corrupt the neighbours of cells updated later.\n", + "\n", + "The solution: **double buffering** - two grids, always read from one, write to the other, then swap.\n", + "\n", + "---\n", + "## 1. 
Setup & Grid Utilities" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Grid size: 8 x 16 = 128 cells\n", + "Threads: 4\n", + "Each thread handles ~2 rows\n" + ] + } + ], + "source": [ + "#include \n", + "#include \n", + "#include \n", + "#include \n", + "\n", + "#define ROWS 8\n", + "#define COLS 16\n", + "#define THREADS 4\n", + "\n", + "// Two grids - the core of double buffering\n", + "// We ALWAYS read from grid[0], write to grid[1], then swap pointers\n", + "int grid_A[ROWS][COLS];\n", + "int grid_B[ROWS][COLS];\n", + "int (*current)[COLS] = grid_A; // pointer to the grid we READ from\n", + "int (*next)[COLS] = grid_B; // pointer to the grid we WRITE to\n", + "\n", + "// Print the grid nicely\n", + "void print_grid(int g[ROWS][COLS], int gen) {\n", + " printf(\"Generation %d:\\n\", gen);\n", + " for (int r = 0; r < ROWS; r++) {\n", + " printf(\"|\");\n", + " for (int c = 0; c < COLS; c++)\n", + " printf(\"%c\", g[r][c] ? '#' : '.');\n", + " printf(\"|\\n\");\n", + " }\n", + "}\n", + "\n", + "// Count living cells\n", + "int count_alive(int g[ROWS][COLS]) {\n", + " int count = 0;\n", + " for (int r = 0; r < ROWS; r++)\n", + " for (int c = 0; c < COLS; c++)\n", + " count += g[r][c];\n", + " return count;\n", + "}\n", + "\n", + "printf(\"Grid size: %d x %d = %d cells\\n\", ROWS, COLS, ROWS * COLS);\n", + "printf(\"Threads: %d\\n\", THREADS);\n", + "printf(\"Each thread handles ~%d rows\\n\", ROWS / THREADS);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 2. 
Why Double Buffering?\n", + "\n", + "Before we parallelize, let's understand the core problem.\n", + "\n", + "```\n", + "WRONG - single grid (read and write same grid):\n", + "\n", + " Step 1: update cell (2,3) → writes new value into grid[2][3]\n", + " Step 2: update cell (2,4) → reads grid[2][3] as a neighbour\n", + " BUT grid[2][3] is already the NEW value!\n", + " Cell (2,4) is computing based on corrupted data.\n", + "\n", + "CORRECT - double buffer:\n", + " \n", + " current[][] = old state (READ ONLY)\n", + " next[][] = new state (WRITE ONLY)\n", + "\n", + " All cells read from current[][] simultaneously - no corruption possible\n", + " All cells write to next[][] simultaneously - no conflicts\n", + " After all cells done: swap pointers (next becomes current)\n", + "```\n", + "\n", + "This is exactly what makes it safe to parallelize - threads can compute any cell in any order because they all read from the same frozen snapshot.\n", + "\n", + "---\n", + "## 3. The Step Function (Core Logic)\n", + "\n", + "`collapse(2)` flattens both loops so OpenMP splits all **128 cells** (8×16) across 4 threads — 32 cells per thread." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "step() and count_neighbours() defined.\n", + "collapse(2) flattens the 2D loop into 1D so OpenMP can split 8*16=128 cells across 4 threads\n" + ] + } + ], + "source": [ + "// Count live neighbours of cell (r, c)\n", + "// Uses wrap-around edges (toroidal grid — top connects to bottom, left to right)\n", + "int count_neighbours(int g[ROWS][COLS], int r, int c) {\n", + " int count = 0;\n", + " for (int dr = -1; dr <= 1; dr++) {\n", + " for (int dc = -1; dc <= 1; dc++) {\n", + " if (dr == 0 && dc == 0) continue; // skip self\n", + " int nr = (r + dr + ROWS) % ROWS; // wrap around vertically\n", + " int nc = (c + dc + COLS) % COLS; // wrap around horizontally\n", + " count += g[nr][nc];\n", + " }\n", + " }\n", + " return count;\n", + "}\n", + "\n", + "// One generation step - parallel with OpenMP\n", + "// READ from: src (frozen snapshot)\n", + "// WRITE to: dst (fresh empty grid)\n", + "void step(int src[ROWS][COLS], int dst[ROWS][COLS]) {\n", + " #pragma omp parallel for num_threads(THREADS) collapse(2)\n", + " for (int r = 0; r < ROWS; r++) {\n", + " for (int c = 0; c < COLS; c++) {\n", + " int neighbours = count_neighbours(src, r, c);\n", + " int alive = src[r][c];\n", + "\n", + " // Apply the 4 rules\n", + " if (alive && neighbours < 2) dst[r][c] = 0; // underpopulation\n", + " else if (alive && neighbours <= 3) dst[r][c] = 1; // survival\n", + " else if (alive && neighbours > 3) dst[r][c] = 0; // overpopulation\n", + " else if (!alive && neighbours == 3) dst[r][c] = 1; // reproduction\n", + " else dst[r][c] = 0; // stays dead\n", + " }\n", + " }\n", + "}\n", + "\n", + "printf(\"step() and count_neighbours() defined.\\n\");\n", + "printf(\"collapse(2) flattens the 2D loop into 1D so OpenMP can split %d*%d=%d cells across %d threads\\n\",\n", + " ROWS, COLS, ROWS*COLS, THREADS);" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "---\n", + "## 4. `collapse(2)` Explained\n", + "\n", + "Without `collapse(2)`, OpenMP only splits the **outer loop** (rows):\n", + "```\n", + "Thread 0 → rows 0-1 (each doing all 16 columns)\n", + "Thread 1 → rows 2-3\n", + "Thread 2 → rows 4-5\n", + "Thread 3 → rows 6-7\n", + "```\n", + "With `collapse(2)`, OpenMP merges both loops and splits **all 128 cells**:\n", + "```\n", + "Thread 0 → cells 0-31\n", + "Thread 1 → cells 32-63\n", + "Thread 2 → cells 64-95\n", + "Thread 3 → cells 96-127\n", + "```\n", + "On a small 8×16 grid this matters more — without collapse, each thread only gets 2 rows which is very coarse granularity." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 5. Pattern 1: Blinker (Period-2 Oscillator)\n", + "\n", + "The simplest oscillator - 3 cells in a line that flip between horizontal and vertical every generation. Placed in the middle of the 8×16 grid at row 4, col 8." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Blinker (should alternate horizontal/vertical) ===\n", + "\n", + "Generation 0:\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|.......###......|\n", + "|................|\n", + "|................|\n", + "|................|\n", + "Generation 1:\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|........#.......|\n", + "|........#.......|\n", + "|........#.......|\n", + "|................|\n", + "|................|\n", + "Generation 2:\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|.......###......|\n", + "|................|\n", + "|................|\n", + "|................|\n", + "Generation 3:\n", + "|................|\n", + "|................|\n", + 
"|................|\n", + "|........#.......|\n", + "|........#.......|\n", + "|........#.......|\n", + "|................|\n", + "|................|\n", + "Generation 4:\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|.......###......|\n", + "|................|\n", + "|................|\n", + "|................|\n" + ] + } + ], + "source": [ + "void run_blinker() {\n", + " // Reset both grids\n", + " memset(grid_A, 0, sizeof(grid_A));\n", + " memset(grid_B, 0, sizeof(grid_B));\n", + " current = grid_A;\n", + " next = grid_B;\n", + "\n", + " // Place a horizontal blinker in the middle\n", + " int r = ROWS / 2;\n", + " int c = COLS / 2;\n", + " current[r][c-1] = 1;\n", + " current[r][c] = 1;\n", + " current[r][c+1] = 1;\n", + "\n", + " printf(\"=== Blinker (should alternate horizontal/vertical) ===\\n\\n\");\n", + " print_grid(current, 0);\n", + "\n", + " for (int gen = 1; gen <= 4; gen++) {\n", + " step(current, next);\n", + "\n", + " // Swap buffers — next becomes current, current becomes next\n", + " int (*tmp)[COLS] = current;\n", + " current = next;\n", + " next = tmp;\n", + "\n", + " print_grid(current, gen);\n", + " }\n", + "}\n", + "\n", + "run_blinker();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 6. Pattern 2: Glider\n", + "\n", + "The glider moves diagonally across the grid forever - it takes 4 generations to return to its original shape but shifted by 1 cell diagonally." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Glider (watch it move diagonally) ===\n", + "\n", + "Generation 0:\n", + "|................|\n", + "|..#.............|\n", + "|...#............|\n", + "|.###............|\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|................|\n", + "Generation 1:\n", + "|................|\n", + "|................|\n", + "|.#.#............|\n", + "|..##............|\n", + "|..#.............|\n", + "|................|\n", + "|................|\n", + "|................|\n", + "Generation 2:\n", + "|................|\n", + "|................|\n", + "|...#............|\n", + "|.#.#............|\n", + "|..##............|\n", + "|................|\n", + "|................|\n", + "|................|\n", + "Generation 3:\n", + "|................|\n", + "|................|\n", + "|..#.............|\n", + "|...##...........|\n", + "|..##............|\n", + "|................|\n", + "|................|\n", + "|................|\n", + "Generation 4:\n", + "|................|\n", + "|................|\n", + "|...#............|\n", + "|....#...........|\n", + "|..###...........|\n", + "|................|\n", + "|................|\n", + "|................|\n", + "Generation 5:\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|..#.#...........|\n", + "|...##...........|\n", + "|...#............|\n", + "|................|\n", + "|................|\n", + "Generation 6:\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|....#...........|\n", + "|..#.#...........|\n", + "|...##...........|\n", + "|................|\n", + "|................|\n", + "Generation 7:\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|...#............|\n", + "|....##..........|\n", + "|...##...........|\n", + 
"|................|\n", + "|................|\n", + "Generation 8:\n", + "|................|\n", + "|................|\n", + "|................|\n", + "|....#...........|\n", + "|.....#..........|\n", + "|...###..........|\n", + "|................|\n", + "|................|\n" + ] + } + ], + "source": [ + "void run_glider() {\n", + " memset(grid_A, 0, sizeof(grid_A));\n", + " memset(grid_B, 0, sizeof(grid_B));\n", + " current = grid_A;\n", + " next = grid_B;\n", + "\n", + " // Classic glider pattern:\n", + " // .X.\n", + " // ..X\n", + " // XXX\n", + " current[1][2] = 1;\n", + " current[2][3] = 1;\n", + " current[3][1] = 1;\n", + " current[3][2] = 1;\n", + " current[3][3] = 1;\n", + "\n", + " printf(\"=== Glider (watch it move diagonally) ===\\n\\n\");\n", + " print_grid(current, 0);\n", + "\n", + " for (int gen = 1; gen <= 8; gen++) {\n", + " step(current, next);\n", + " int (*tmp)[COLS] = current;\n", + " current = next;\n", + " next = tmp;\n", + "\n", + " //if (gen % 2 == 0) // print every 2 generations to keep output short\n", + " print_grid(current, gen);\n", + " }\n", + "}\n", + "\n", + "run_glider();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 7. 
Performance: Sequential vs Parallel" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Grid: 512x512 = 262144 cells, 100 generations\n", + "=============================================\n", + " Sequential: 1.3265 s\n", + " Parallel: 0.3410 s (3.9x speedup)\n", + "=============================================\n" + ] + } + ], + "source": [ + "#define BENCH_ROWS 512\n", + "#define BENCH_COLS 512\n", + "#define GENERATIONS 100\n", + "\n", + "// Allocate large grids on the heap for benchmarking\n", + "int *bench_A = (int*)calloc(BENCH_ROWS * BENCH_COLS, sizeof(int));\n", + "int *bench_B = (int*)calloc(BENCH_ROWS * BENCH_COLS, sizeof(int));\n", + "\n", + "#define BENCH_IDX(r, c) ((r) * BENCH_COLS + (c))\n", + "\n", + "// Seed with random pattern\n", + "srand(42);\n", + "for (int i = 0; i < BENCH_ROWS * BENCH_COLS; i++)\n", + " bench_A[i] = rand() % 2;\n", + "\n", + "// Sequential step\n", + "void step_seq(int *src, int *dst) {\n", + " for (int r = 0; r < BENCH_ROWS; r++) {\n", + " for (int c = 0; c < BENCH_COLS; c++) {\n", + " int n = 0;\n", + " for (int dr = -1; dr <= 1; dr++)\n", + " for (int dc = -1; dc <= 1; dc++) {\n", + " if (dr == 0 && dc == 0) continue;\n", + " int nr = (r + dr + BENCH_ROWS) % BENCH_ROWS;\n", + " int nc = (c + dc + BENCH_COLS) % BENCH_COLS;\n", + " n += src[BENCH_IDX(nr, nc)];\n", + " }\n", + " int alive = src[BENCH_IDX(r, c)];\n", + " dst[BENCH_IDX(r, c)] = (alive && n == 2) || n == 3 ? 
1 : 0;\n", + " }\n", + " }\n", + "}\n", + "\n", + "// Parallel step\n", + "void step_par(int *src, int *dst) {\n", + " #pragma omp parallel for num_threads(THREADS) collapse(2)\n", + " for (int r = 0; r < BENCH_ROWS; r++) {\n", + " for (int c = 0; c < BENCH_COLS; c++) {\n", + " int n = 0;\n", + " for (int dr = -1; dr <= 1; dr++)\n", + " for (int dc = -1; dc <= 1; dc++) {\n", + " if (dr == 0 && dc == 0) continue;\n", + " int nr = (r + dr + BENCH_ROWS) % BENCH_ROWS;\n", + " int nc = (c + dc + BENCH_COLS) % BENCH_COLS;\n", + " n += src[BENCH_IDX(nr, nc)];\n", + " }\n", + " int alive = src[BENCH_IDX(r, c)];\n", + " dst[BENCH_IDX(r, c)] = (alive && n == 2) || n == 3 ? 1 : 0;\n", + " }\n", + " }\n", + "}\n", + "\n", + "// Benchmark sequential\n", + "int *src = bench_A, *dst = bench_B;\n", + "double t0 = omp_get_wtime();\n", + "for (int g = 0; g < GENERATIONS; g++) {\n", + " step_seq(src, dst);\n", + " int *tmp = src; src = dst; dst = tmp;\n", + "}\n", + "double t_seq = omp_get_wtime() - t0;\n", + "\n", + "// Reset and benchmark parallel\n", + "srand(42);\n", + "for (int i = 0; i < BENCH_ROWS * BENCH_COLS; i++) bench_A[i] = rand() % 2;\n", + "src = bench_A; dst = bench_B;\n", + "t0 = omp_get_wtime();\n", + "for (int g = 0; g < GENERATIONS; g++) {\n", + " step_par(src, dst);\n", + " int *tmp = src; src = dst; dst = tmp;\n", + "}\n", + "double t_par = omp_get_wtime() - t0;\n", + "\n", + "printf(\"Grid: %dx%d = %d cells, %d generations\\n\",\n", + " BENCH_ROWS, BENCH_COLS, BENCH_ROWS * BENCH_COLS, GENERATIONS);\n", + "printf(\"=============================================\\n\");\n", + "printf(\" Sequential: %.4f s\\n\", t_seq);\n", + "printf(\" Parallel: %.4f s (%.1fx speedup)\\n\", t_par, t_seq / t_par);\n", + "printf(\"=============================================\\n\");\n", + "\n", + "free(bench_A);\n", + "free(bench_B);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## 8. 
Why Double Buffering Has Zero Race Conditions\n", + "\n", + "Let's prove to ourselves it's safe by thinking through what each thread touches:\n", + "\n", + "```\n", + "Generation N (frozen - nobody writes here):\n", + "┌─────────────────────────────────────┐\n", + "│ current[][] READ ONLY │\n", + "│ Thread 0 reads rows 0-1 │\n", + "│ Thread 1 reads rows 2-3 │\n", + "│ Thread 2 reads rows 4-5 │\n", + "│ Thread 3 reads rows 6-7 │\n", + "│ (multiple readers = always safe) │\n", + "└─────────────────────────────────────┘\n", + "\n", + "Generation N+1 (being built - nobody reads here yet):\n", + "┌─────────────────────────────────────┐\n", + "│ next[][] WRITE ONLY │\n", + "│ Thread 0 writes rows 0-1 │\n", + "│ Thread 1 writes rows 2-3 │\n", + "│ Thread 2 writes rows 4-5 │\n", + "│ Thread 3 writes rows 6-7 │\n", + "│ (each thread owns different rows) │\n", + "└─────────────────────────────────────┘\n", + "\n", + "After all threads finish:\n", + " swap(current, next) ← just swapping two pointers, O(1)\n", + "```\n", + "\n", + "No atomics needed. No critical sections. No locks. \n", + "The double buffer design **eliminates** the race condition structurally." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++23 + OpenMP", + "language": "cpp", + "name": "xcpp23-omp" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "C++", + "nbconvert_exporter": "", + "pygments_lexer": "", + "version": "cxx23" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/openmp/images/fork_join.png b/openmp/images/fork_join.png new file mode 100644 index 0000000..b67ec9a Binary files /dev/null and b/openmp/images/fork_join.png differ diff --git a/openmp/mandel.ipynb b/openmp/mandel.ipynb deleted file mode 100644 index 08a6b64..0000000 --- a/openmp/mandel.ipynb +++ /dev/null @@ -1,133 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "5059dbdd-821d-498a-8716-eb0fcf8a8f5f", - "metadata": {}, - "outputs": [], - "source": [ - "#include \n", - "#include \n", - "#include \n", - "#include " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "8b66f96a-14ef-4f23-8024-bcfc42b31e4e", - "metadata": {}, - "outputs": [], - "source": [ - "#define NPOINTS 1000\n", - "#define MAXITER 1000" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "d89dd57c-fe19-4233-a33a-df9b24fae98a", - "metadata": {}, - "outputs": [], - "source": [ - "int numoutside = 0;" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "5c35c479-2f79-46b7-bc66-24be6b1694e0", - "metadata": {}, - "outputs": [], - "source": [ - "void testpoint(double creal, double cimag) {\n", - " // iterate z=z*z+c, until |z| > 2 when point is known to be outside set\n", - " // If loop count reaches MAXITER, point is considered to be inside the set\n", - "\n", - " double zreal, zimag, temp;\n", - " int iter;\n", - " zreal = creal;\n", - " zimag = cimag;\n", - "\n", - " for (iter = 0; iter < MAXITER; iter++) {\n", - " temp = (zreal * zreal) - (zimag * zimag) + creal;\n", - " zimag = zreal * zimag * 2 + cimag;\n", - " zreal = 
temp;\n", - " if ((zreal * zreal + zimag * zimag) > 4.0) {\n", - " numoutside++;\n", - " break;\n", - " }\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "ea116fef-7d05-4e29-97a1-55c85c7241d8", - "metadata": {}, - "outputs": [], - "source": [ - "int main() {\n", - " int i, j;\n", - " double area, error, eps = 1.0e-5;\n", - " double cimag, creal;\n", - " // Loop over grid of points in the complex plane which contains the Mandelbrot set,\n", - " // testing each point to see whether it is inside or outside the set.\n", - "\n", - "#pragma omp parallel for private(eps)\n", - " for (i = 0; i < NPOINTS; i++) {\n", - " for (j = 0; j < NPOINTS; j++) {\n", - " creal = -2.0 + 2.5 * (double) (i) / (double) (NPOINTS) + eps;\n", - " cimag = 1.125 * (double) (j) / (double) (NPOINTS) + eps;\n", - " testpoint(creal, cimag);\n", - " }\n", - " }\n", - "\n", - " // Calculate area of set and error estimate and output the results\n", - " area = 2.0 * 2.5 * 1.125 * (double) (NPOINTS * NPOINTS - numoutside) / (double) (NPOINTS * NPOINTS);\n", - " error = area / (double) NPOINTS;\n", - "\n", - " printf(\"Area of Mandlebrot set = %12.8f +/- %12.8f\\n\", area, error);\n", - " printf(\"Correct answer should be around 1.510659\\n\");\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "39cf129c-8106-4e67-a2f1-1a7fff17cd38", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Area of Mandlebrot set = 3.80247750 +/- 0.00380248\n", - "Correct answer should be around 1.510659\n" - ] - } - ], - "source": [ - "main();" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "C++23 (xcpp+OpenMP)", - "language": "cpp", - "name": "xcpp23-omp" - }, - "language_info": { - "codemirror_mode": "text/x-c++src", - "file_extension": ".cpp", - "mimetype": "text/x-c++src", - "name": "C++", - "version": "23" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git 
a/openmp/openmp-demo.ipynb b/openmp/openmp-demo.ipynb deleted file mode 100644 index 101ade0..0000000 --- a/openmp/openmp-demo.ipynb +++ /dev/null @@ -1,220 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "b0c15570-ee24-42ed-b61f-11a3fc858b2d", - "metadata": {}, - "outputs": [], - "source": [ - "#include \n", - "#include " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5001e441-1fa5-4bdc-9fa5-2ca103ae484f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hello World!\n" - ] - } - ], - "source": [ - "void example1() {\n", - " std::cout << \"Hello World!\" << std::endl;\n", - "}\n", - "example1();" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "53fb7656-b72e-42bc-ade7-2ae2077142da", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hello World!Hello World!\n", - "Hello World!Hello World!\n", - "Hello World!\n", - "\n", - "Hello World!\n", - "\n", - "Hello World!\n", - "Hello World!\n" - ] - } - ], - "source": [ - "void example2() {\n", - " #pragma omp parallel\n", - " {\n", - " std::cout << \"Hello World!\" << std::endl;\n", - " }\n", - "}\n", - "example2();" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "efcdfdb6-a60b-46af-8194-75ef9cc0e27f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hello World! (Hello World! (Hello World! (Hello World! (34)\n", - "Hello World! (7)\n", - "0)\n", - "2Hello World! (6))\n", - "\n", - ")\n", - "Hello World! (5)\n", - "Hello World! (1)\n" - ] - } - ], - "source": [ - "void example3() {\n", - " #pragma omp parallel\n", - " {\n", - " std::cout << \"Hello World! 
(\" << omp_get_thread_num() << \")\" << std::endl;\n", - " }\n", - "}\n", - "example3();" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d86a9efa-ba28-4cb6-bbfc-abc00ee63506", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hello World! (Hello World! (34))\n", - "Hello World! (0)\n", - "Hello World! (2)\n", - "Hello World! (Hello World! (1)\n", - "\n", - "7)Hello World! (\n", - "6)\n", - "Hello World! (5)\n", - "This is another message! (0)\n", - "Goodbye World! (0)\n", - "Goodbye World! (1)\n" - ] - } - ], - "source": [ - "void example4() {\n", - " #pragma omp parallel\n", - " {\n", - " std::cout << \"Hello World! (\" << omp_get_thread_num() << \")\" << std::endl;\n", - " }\n", - "\n", - " std::cout << \"This is another message! (\" << omp_get_thread_num() << \")\" << std::endl;\n", - "\n", - " #pragma omp parallel num_threads(2)\n", - " {\n", - " std::cout << \"Goodbye World! (\" << omp_get_thread_num() << \")\" << std::endl;\n", - " }\n", - "}\n", - "example4();" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "5557e01a-7c7d-4b54-8545-962ad11027df", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Initialize a[] time: 0.588681\n", - "Initialize b[] time: 0.513927\n", - "Add arrays time: 1.58928\n", - "Average result time: 0.637053\n", - "Average: 5e+08\n", - "Total time: 3.33191\n" - ] - } - ], - "source": [ - "void example5() {\n", - " double start_time = omp_get_wtime();\n", - " double start_loop;\n", - " \n", - " const int N = 1000000000;\n", - " int* a = new int[N];\n", - " int* b = new int[N];\n", - " \n", - " start_loop = omp_get_wtime();\n", - " #pragma omp parallel for\n", - " for (int i=0; i\n", - "#include " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "9078ac79-ca50-4fef-b785-37f35fec3cab", - "metadata": {}, - "outputs": [], - "source": [ - "static long num_steps = 100000000;\n", - 
"double step;" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f3c10995-6f29-4d71-9e61-1993ca9d1cc9", - "metadata": {}, - "outputs": [], - "source": [ - "int main() {\n", - " int i, j, num_threads_allocated;\n", - " double x, pi, sum = 0.0;\n", - " double start_time, run_time;\n", - "\n", - " step = 1.0 / (double)num_steps;\n", - " printf(\"Num threads available: %d\\n\", omp_get_max_threads());\n", - " for (i = 1; i <= 4; i++) {\n", - " sum = 0.0;\n", - " omp_set_num_threads(i);\n", - " start_time = omp_get_wtime();\n", - "#pragma omp parallel\n", - " {\n", - " num_threads_allocated = omp_get_num_threads();\n", - "#pragma omp single\n", - " printf(\"Num threads allocated for this run: %d\\n\", num_threads_allocated);\n", - "\n", - "#pragma omp for reduction(+ : sum)\n", - " for (j = 1; j <= num_steps; j++) {\n", - " x = (j - 0.5) * step;\n", - " sum = sum + 4.0 / (1.0 + x * x);\n", - " }\n", - " }\n", - "\n", - " pi = step * sum;\n", - " run_time = omp_get_wtime() - start_time;\n", - " printf(\"pi is %f in %f seconds using %d threads\\n\\n\", pi, run_time, num_threads_allocated);\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "0f84442a-d947-4860-bd3c-aeeea963b419", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Num threads available: 8\n", - "Num threads allocated for this run: 1\n", - "pi is 3.141593 in 0.179501 seconds using 1 threads\n", - "\n", - "Num threads allocated for this run: 2\n", - "pi is 3.141592 in 0.184605 seconds using 2 threads\n", - "\n", - "Num threads allocated for this run: 3\n", - "pi is 3.141593 in 0.097145 seconds using 3 threads\n", - "\n", - "Num threads allocated for this run: 4\n", - "pi is 3.141593 in 0.071473 seconds using 4 threads\n", - "\n" - ] - } - ], - "source": [ - "main();" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "C++23 (xcpp+OpenMP)", - "language": "cpp", - "name": "xcpp23-omp" - }, - 
"language_info": { - "codemirror_mode": "text/x-c++src", - "file_extension": ".cpp", - "mimetype": "text/x-c++src", - "name": "C++", - "version": "23" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}