diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 000000000..2349ab411
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,16 @@
+version: 2
+jobs:
+  build:
+    machine: true
+    working_directory: /home/circleci/project
+
+    steps:
+      - checkout
+
+      - run:
+          name: Build the docker image
+          command: docker build -t xmr-stak:$CIRCLE_BRANCH /home/circleci/project
+
+      - run:
+          name: Run a benchmark with Monero V8
+          command: docker run --rm -t xmr-stak:$CIRCLE_BRANCH /usr/local/bin/xmr-benchmark.sh
\ No newline at end of file
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..25ba43d61
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,14 @@
+IndentWidth: 4
+TabWidth: 4
+ColumnLimit: 0
+BreakBeforeBraces: Allman
+AllowShortIfStatementsOnASingleLine: false
+IndentCaseLabels: false
+SpaceBeforeParens: Never
+UseTab: Always
+AlignAfterOpenBracket: DontAlign
+PointerBindsToType: true
+BreakConstructorInitializers: AfterColon
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
deleted file mode 100644
index 8451f3289..000000000
--- a/.github/ISSUE_TEMPLATE.md
+++ /dev/null
@@ -1,30 +0,0 @@
-Please provide as much as possible information to reproduce the issue.
-
-# Basic information
-  - Type of the CPU.
-  - Type of the GPU (if you try to miner with the GPU).
-
-# Compile issues
-  - Which OS do you use?
-  ```
-  add **all** commands you used and the **full** compile output here
-  ```
-  ```
-  run `cmake -LA .` in the build folder and add the output here
-  ```
-
-# Issue with the execution
-  - Do you compiled the miner by our own?
-  ```
-  run `./xmr-stak --version-long` and add the output here
-  ```
-
-# AMD OpenCl issue
-
-  ```
-  run `clinfo` and add the output here
-  ```
-
-# Stability issue
-  - Is the CPU or GPU overclocked?
-  - Is the Main memory of the CPU or GPU undervolted?
diff --git a/.github/ISSUE_TEMPLATE/compile_bug_report.md b/.github/ISSUE_TEMPLATE/compile_bug_report.md
new file mode 100644
index 000000000..899ad941f
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/compile_bug_report.md
@@ -0,0 +1,35 @@
+---
+name: Compile bug report
+about: You have an issue compiling xmr-stak.
+
+---
+
+`...` marks a placeholder for your answer. Please answer each question!
+
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**Which operating system do you use?**
+
+```
+...
+```
+
+**To Reproduce**
+```
+# Please post all commands and the output.
+...
+```
+
+**Additional information.**
+
+```
+# run `cmake -LA .` in the build folder and add the output here
+...
+```
+
+**Feel free to add more information.**
+```
+...
+```
diff --git a/.github/ISSUE_TEMPLATE/execution_bug_report.md b/.github/ISSUE_TEMPLATE/execution_bug_report.md
new file mode 100644
index 000000000..44ac89bf1
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/execution_bug_report.md
@@ -0,0 +1,7 @@
+---
+name: Execution bug report
+about: You have an issue executing xmr-stak.
+
+---
+
+**Most execution issues are caused by driver problems. Please use the [xmr-stak sub-reddit](https://www.reddit.com/r/XmrStak/) to ask for help instead of opening an issue here.**
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 000000000..90f5e4f3d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,7 @@
+---
+name: Feature request
+about: Suggest an idea for xmr-stak.
+
+---
+
+**Please explain the feature as well as possible.**
diff --git a/.github/ISSUE_TEMPLATE/tuning_help.md b/.github/ISSUE_TEMPLATE/tuning_help.md
new file mode 100644
index 000000000..40dedef05
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/tuning_help.md
@@ -0,0 +1,7 @@
+---
+name: Need help for optimization.
+about: You need help to optimize your setup.
+
+---
+
+**Please use the [xmr-stak sub-reddit](https://www.reddit.com/r/XmrStak/) to discuss optimizations.**
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 41e993eee..795829e66 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,13 +44,13 @@ endif()
 set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "${BUILD_TYPE}")
 
 set(XMR-STAK_COMPILE "native" CACHE STRING "select CPU compute architecture")
-set_property(CACHE XMR-STAK_COMPILE PROPERTY STRINGS "native;generic")
+set_property(CACHE XMR-STAK_COMPILE PROPERTY STRINGS "native;generic;dev_release")
 if(XMR-STAK_COMPILE STREQUAL "native")
     if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
         set(CMAKE_CXX_FLAGS "-march=native -mtune=native ${CMAKE_CXX_FLAGS}")
         set(CMAKE_C_FLAGS "-march=native -mtune=native ${CMAKE_C_FLAGS}")
     endif()
-elseif(XMR-STAK_COMPILE STREQUAL "generic")
+elseif(XMR-STAK_COMPILE STREQUAL "generic" OR XMR-STAK_COMPILE STREQUAL "dev_release")
     add_definitions("-DCONF_ENFORCE_OpenCL_1_2=1")
 else()
     message(FATAL_ERROR "XMR-STAK_COMPILE is set to an unknown value '${XMR-STAK_COMPILE}'")
@@ -496,6 +496,10 @@ if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
     set(CMAKE_C_FLAGS "-Wl,-z,noexecstack ${CMAKE_C_FLAGS}")
 endif()
 
+if(XMR-STAK_COMPILE STREQUAL "dev_release")
+    add_definitions(-DXMRSTAK_DEV_RELEASE)
+endif()
+
 # activate static libgcc and libstdc++ linking
 if(CMAKE_LINK_STATIC)
     set(BUILD_SHARED_LIBRARIES OFF)
@@ -586,7 +590,16 @@ if(CUDA_FOUND)
         )
     endif()
 
-    set(CUDA_LIBRARIES ${CUDA_LIB} ${CUDA_NVRTC_LIB} ${CUDA_LIBRARIES})
+    set(CUDA_LIBRARIES ${CUDA_LIB} ${CUDA_LIBRARIES})
+    if(XMR-STAK_COMPILE STREQUAL "dev_release")
+        # do not link nvrtc for linux binaries, cn-r will be disabled
+        if(WIN32)
+            set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB})
+        endif()
+    else()
+        set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB})
+    endif()
+    
     target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES})
     target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm)
 endif()
diff --git a/README.md b/README.md
index c890da1a5..2e2eb61fa 100644
--- a/README.md
+++ b/README.md
@@ -1,102 +1,10 @@
-###### fireice-uk's and psychocrypt's
-# XMR-Stak - Cryptonight All-in-One Mining Software
-
-XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins.
-
-## HTML reports
-<img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-hashrate.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-results.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-connection.png" width="260">
-
-## Video setup guide on Windows
-
-[<img src="https://gist.githubusercontent.com/fireice-uk/3621b179d56f57a8ead6303d8e415cf6/raw/f572faba67cc9418116f3c1dfd7783baf52182ce/vidguidetmb.jpg">](https://youtu.be/YNMa8NplWus)
-###### Video by Crypto Sewer
-
-## Overview
-* [Features](#features)
-* [Supported altcoins](#supported-altcoins)
-* [Download](#download)
-* [Usage](doc/usage.md)
-* [HowTo Compile](doc/compile.md)
-* [FAQ](doc/FAQ.md)
-* [Developer Donation](#default-developer-donation)
-* [Developer PGP Key's](doc/pgp_keys.md)
-
-## Features
-
-- support all common backends (CPU/x86, AMD-GPU and NVIDIA-GPU)
-- support all common OS (Linux, Windows and macOS)
-- supports algorithm cryptonight for Monero (XMR) and cryptonight-light (AEON)
-- easy to use
-  - guided start (no need to edit a config file for the first start)
-  - auto-configuration for each backend
-- open source software (GPLv3)
-- TLS support
-- [HTML statistics](doc/usage.md#html-and-json-api-report-configuraton)
-- [JSON API for monitoring](doc/usage.md#html-and-json-api-report-configuraton)
-
-## Supported altcoins
-
-Besides [Monero](https://getmonero.org), following coins can be mined using this miner:
-
-- [Aeon](http://www.aeon.cash)
-- [BBSCoin](https://www.bbscoin.xyz)
-- [BitTube](https://coin.bit.tube/)
-- [Conceal](https://conceal.network)
-- [Graft](https://www.graft.network)
-- [Haven](https://havenprotocol.com)
-- [Lethean](https://lethean.io)
-- [Masari](https://getmasari.org)
-- [Plenteum](https://www.plenteum.com/)
-- [QRL](https://theqrl.org)
-- **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo**
-- [Stellite](https://stellite.cash/)
-- [TurtleCoin](https://turtlecoin.lol)
-- [Zelerius](https://zelerius.org/)
-- [X-CASH](https://x-network.io/)
-
-Ryo currency is a way for us to implement the ideas that we were unable to in
-Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
-
-If your prefered coin is not listed, you can choose one of the following algorithms:
-- 256Kib scratchpad memory
-    - cryptonight_turtle
-- 1MiB scratchpad memory
-    - cryptonight_lite
-    - cryptonight_lite_v7
-    - cryptonight_lite_v7_xor (algorithm used by ipbc)
-- 2MiB scratchpad memory
-    - cryptonight
-    - cryptonight_gpu (for Ryo's 14th of Feb fork)
-    - cryptonight_masari (used in 2018)
-    - cryptonight_v7
-    - cryptonight_v7_stellite
-    - cryptonight_v8
-    - cryptonight_v8_double (used by X-CASH)
-    - cryptonight_v8_half (used by masari and stellite)
-    - cryptonight_v8_reversewaltz (used by graft)
-    - cryptonight_v8_zelerius
-- 4MiB scratchpad memory
-    - cryptonight_haven
-    - cryptonight_heavy
-
-Please note, this list is not complete and is not an endorsement.
-
-## Download
-
-You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/fireice-uk/xmr-stak/releases).
-
-## Default Developer Donation
-
-By default, the miner will donate 2% of the hashpower (2 minutes in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries.
-
-If you want to donate directly to support further development, here is my wallet
-
-fireice-uk:
-```
-4581HhZkQHgZrZjKeCfCJxZff9E3xCgHGF25zABZz7oR71TnbbgiS7sK9jveE6Dx6uMs2LwszDuvQJgRZQotdpHt1fTdDhk
-```
-
-psychocrypt:
-```
-45tcqnJMgd3VqeTznNotiNj4G9PQoK67TGRiHyj6EYSZ31NUbAfs9XdiU5squmZb717iHJLxZv3KfEw8jCYGL5wa19yrVCn
-```
+<a href="doc/README.md" target="_blank"><img src="doc/_img/gpu.png"></a>
+<a href="#select_coin" target="_blank"><img src="doc/_img/cpu.png"></a>
+<table>
+    <p id="select_coin">
+    <tr>
+        <td align="center"><a href=https://github.com/xmrig/xmrig><img src="doc/_img/xmrig.png"></a></td>
+        <td align="center"><a href=https://ragerx.lol><img src="doc/_img/ragerx.png"></a></td>
+        <td align="center"><a href=https://github.com/fireice-uk/xmr-stak/tree/xmr-stak-rx/doc/README.md><img src="doc/_img/rx.png"></a></td>
+    </tr>
+</table>
\ No newline at end of file
diff --git a/doc/FAQ.md b/doc/FAQ.md
index f744e3d24..b78ac15cb 100644
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -1,104 +1,27 @@
 # FAQ
+To improve our support we created the [Xmr-Stak forum](https://www.reddit.com/r/XmrStak). Check it out if you have a problem, or if you are looking for the most up-to-date config for your card and [guides](https://www.reddit.com/r/XmrStak/wiki/index).
+
 
 ## Content Overview
-* ["Obtaining SeLockMemoryPrivilege failed."](#obtaining-selockmemoryprivilege-failed)
-* [VirtualAlloc failed](#virtualalloc-failed)
-* [Error msvcp140.dll and vcruntime140.dll not available](#error-msvcp140dll-and-vcruntime140dll-not-available)
-* [Error: MEMORY ALLOC FAILED: mmap failed](#error-memory-alloc-failed-mmap-failed)
-* [Illegal instruction (core dumped)](#illegal-instruction)
 * [Virus Protection Alert](#virus-protection-alert)
 * [Change Currency to Mine](#change-currency-to-mine)
 * [How can I mine Monero](#how-can-i-mine-monero)
 * [Which currency must be chosen if my fork coin is not listed](#which-currency-must-be-chosen-if-my-fork-coin-is-not-listed)
-* [Internal compiler error: Killed (program cc1plus)](#internal-compiler-error)
-
-## "Obtaining SeLockMemoryPrivilege failed."
-
-For professional versions of Windows see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx).
-Make sure to reboot afterwards!
-
-For Windows 7/10 Home:
-
-1) Download and install [Windows Server 2003 Resource Kit Tools](https://www.microsoft.com/en-us/download/details.aspx?id=17657). Ignore any incompatibility warning during installation.
-
-2) Open cmd or PowerShell as an administrator.
-
-3) Use `ntrights -u %USERNAME% +r SeLockMemoryPrivilege` where %USERNAME% is the user that will be running the program.
-
-4) Reboot.
-
-Reference: http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid259791
-
-*Warning: Do not download ntrights.exe from any other site other than the offical Microsoft download page.*
-
-## VirtualAlloc failed
-
-If you set up the user rights properly ([see above](https://github.com/fireice-uk/xmr-stak/blob/master/doc/FAQ.md#selockmemoryprivilege-failed)), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation.
-
-If that happens, disable all auto-starting applications and run the miner after a reboot.
-
-## Error msvcp140.dll and vcruntime140.dll not available
-
-Download and install this [runtime package](https://go.microsoft.com/fwlink/?LinkId=746572) from Microsoft.  *Warning: Do NOT use "missing dll" sites - dll's are exe files with another name, and it is a fairly safe bet that any dll on a shady site like that will be trojaned.  Please download offical runtimes from Microsoft above.*
-
-
-## Error: MEMORY ALLOC FAILED: mmap failed
-
-On Linux you will need to configure large page support and increase your memlock limit (`ulimit -l`).
-
-Never put settings directly into `/etc/sysctl.conf` or `/etc/security/limits.conf` as those are system defaults and can be replaced in upgrades, and custom settings in that file are deprecated in all distros since at least wheezy/trusty (has been illegal in RedHat based distros for longer than that), and will be even more deprecated with systemd (it no longer even reads sysctl.conf, ONLY sysctl.d files, for example - there is a link to the old `/etc/sysctl.conf` for backward compatibility but that can go away at any time).  Also adding to `/etc/rc.local` is extra incorrect, systemd does not even use that file anymore (once the sysvinit compatibility layer is gone, rc.local will no longer work).
-
-To check current settings, run `/sbin/sysctl vm.nr_hugepages ; ulimit -l` as whatever user you will run `xmr-stak` as (example shows bad/low sample defaults):
-
-    $ /sbin/sysctl vm.nr_hugepages ; ulimit -l
-    vm.nr_hugepages = 0
-    16
-
-To set large page support, add the following lines to `/etc/sysctl.d/60-hugepages.conf`:
-
-    vm.nr_hugepages=128
-
-You WILL need to run `sudo sysctl --system` for these settings to take effect on your system (or reboot).  In some cases (many threads, very large CPU, etc) you may need more than 128 (try 256 if there are still complaints from thread inits)
-
-To increase the memlock (ulimit -l), add following lines to `/etc/security/limits.d/60-memlock.conf`:
-
-    *    - memlock 262144
-    root - memlock 262144
-
-You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session).
-Recheck after completing these steps to validate:
-
-    $ /sbin/sysctl vm.nr_hugepages ; ulimit -l
-    vm.nr_hugepages = 128
-    262144
-
-You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons.  Also running as root does not properly get around the `ulimit -l` being large enough (and limits `*` does not apply to `root` either, it must be specified explicitly).
-
-## Illegal Instruction
-
-This typically means you are trying to run it on a CPU that does not have [AES](https://en.wikipedia.org/wiki/AES_instruction_set).  This only happens on older version of miner, new version gives better error message (but still wont' work since your CPU doesn't support the required instructions).
-
-## Virus Protection Alert
 
+### Virus Protection Alert
 Some virus protection software flags the miner binary as *malware*. This is a false positive — the software does not contain any malware (and since it is open source, you can verify that yourself!)
 If your antivirus software flags **xmr-stak**, it will likely move it to its quarantine area. You may have to whitelist **xmr-stak** in your antivirus.
 
-## Change Currency to Mine
-
+### Change Currency to Mine
 If the miner is compiled for Monero and Aeon than you can change
  - the value `currency` in the config *or*
  - start the miner with the [command line option](usage.md) `--currency monero` or `--currency aeon7`
  - run `xmr-stak --help` to see all supported currencies and algorithms
 
-## How can I mine Monero
-
+### How can I mine Monero
 Set the value `currency` in `pools.txt` to `monero`.
 
-## Which currency must be chosen if my fork coin is not listed
-
+### Which currency must be chosen if my fork coin is not listed
 If your coin you want to mine is not listed please check the documentation of the coin and try to find out if `cryptonight` or `cryptonight-lite` is the used algorithm.
 Select one of these generic coin algorithms.
 
-## Internal compiler error
-
-Seeing `g++: internal compiler error: Killed (program cc1plus)` is probably related to not enough RAM to compile. 1 Gb RAM should be enough (it is on clean Ubuntu 16.04).
diff --git a/doc/README.md b/doc/README.md
new file mode 100644
index 000000000..7a1f13288
--- /dev/null
+++ b/doc/README.md
@@ -0,0 +1,132 @@
+
+<table>
+    <tr>
+        <td align="center"><a href=https://github.com/fireice-uk/xmr-stak/tree/xmr-stak-rx/doc/README.md><img src="_img/xmr-stak-rx-btn-inactive.png"></a></td>
+        <td align="center"><a href=#><img src="_img/xmr-stak-btn-active.png"></a></td>
+        <td align="center"><a href=https://ragerx.lol><img src="_img/ragerx-btn.png"></a></td>
+    </tr>
+</table>
+
+<table>
+    <tr>
+        <td align="center"><a href=#features-overview><img src="_img/menu-features-green.png"></a></td>
+        <td align="center"><a href=#supported-coins-and-algorithms><img src="_img/menu-supported-coins-green.png"></a></td>
+        <td align="center"><a href=#get-miner><img src="_img/menu-get-miner-green.png"></a></td>
+        <td align="center"><a href=#additional-guides-and-feedback><img src="_img/menu-support-green.png"></a></td>
+        <td align="center"><a href=#default-developer-donation><img src="_img/menu-donations-green.png"></a></td>
+    </tr>
+</table>
+
+ <table>
+     <tr>
+         <td align="center"><a href=usage.md><img src="_img/usage-green.png"></a></td>
+         <td align="center"><a href=compile/compile.md><img src="_img/how-to-compile-green.png"></a></td>
+         <td align="center"><a href=tuning.md><img src="_img/fine-tuning-green.png"></a></td>
+         <td align="center"><a href=troubleshooting.md><img src="_img/troubleshooting-green.png"></a></td>
+         <td align="center"><a href=FAQ.md><img src="_img/faq-green.png"></a></td>
+     </tr>
+ </table>
+
+## Introduction
+XMR-Stak is a universal open source stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used for mining various crypto currencies: Ryo, Graft, Bittube, Conceal, Haven and many more Cryptonight coins.
+
+## Features overview
+[<img src="_img/features-xmr-stak.png">](#)
+
+## Supported coins and algorithms
+Xmr-Stak supports various variants of the Cryptonight algorithm. Use one of the following options (enter the coin alias as the `"currency"` parameter, either in the `pools.txt` config file or during the startup configuration, and the miner will pick its variant of the Cryptonight algorithm for mining):
+
+|  |  |  |
+| ---  | ---  | --- |
+| [BitTube](https://coin.bit.tube/) | [Plenteum](https://www.plenteum.com/) |  |
+| [Conceal](https://conceal.network) | [QRL](https://theqrl.org) |  |
+| [Graft](https://www.graft.network) | [Ryo](https://ryo-currency.com)  | **Atom Wallet Solo mining mode is sponsored by [RYO](https://ryo-currency.com/)** |
+| [Haven](https://havenprotocol.com) | [X-CASH](https://x-network.io/) |  |
+| [Lethean](https://lethean.io) | [Zelerius](https://zelerius.org/) |  |
+| [Masari](https://getmasari.org) |  |  |
+
+
+**[Ryo Currency](https://ryo-currency.com)** - is a way for us to implement the ideas that we were unable to in
+Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
+
+If your preferred coin is not listed, you can choose one of the following mining algorithms:
+
+| 256 KiB scratchpad memory | 1 MiB scratchpad memory | 2 MiB scratchpad memory | 4 MiB scratchpad memory |
+| --- | --- | --- | --- | 
+| cryptonight_turtle  | cryptonight_lite  | cryptonight  | cryptonight_bittube2  | 
+| ---  | cryptonight_lite_v7  | cryptonight_gpu  | cryptonight_haven  | 
+| ---  | ---  | cryptonight_conceal  | cryptonight_heavy  | 
+| ---  | ---  | cryptonight_r  | ---  | 
+| ---  | ---  | cryptonight_masari (used in 2018)  | ---  | 
+| ---  | ---  | cryptonight_v8_reversewaltz  | ---  | 
+| ---  | ---  | cryptonight_v7  | ---  | 
+| ---  | ---  | cryptonight_v8  | ---  | 
+| ---  | ---  | cryptonight_v8_half (used by masari)  | ---  | 
+| ---  | ---  | cryptonight_v8_double (used by X-CASH)  | ---  | 
+| ---  | ---  | cryptonight_v8_zelerius  | ---  | 
+
+Please note, this list is not complete and is not an endorsement.
+
+
+## Get Miner
+Please note that code is developed on the [dev branch](https://github.com/fireice-uk/xmr-stak/commits/dev), if you want to check out the latest updates, before they are merged on main branch, please refer there. Master branch will always point to a version that we consider stable, so you can download the code by simply typing `git clone https://github.com/fireice-uk/xmr-stak.git`  
+
+Also you can find the latest releases and precompiled binaries on GitHub under [releases](https://github.com/fireice-uk/xmr-stak/releases/latest) section.
+
+If you want to compile the miner from source files, navigate to ["how to compile"](compile/compile.md) section of docs or [xmr-stak forum](https://www.reddit.com/r/XmrStak/wiki/guides/startup) where you will find the latest step-by-step instructions.
+
+
+## Start Mining
+The miner offers two ways of initial configuration: simple and advanced. The simple method prompts the user for a minimum of information. Answer yes/no questions with `y` (or `yes`) or `n` (or `no`):
+
+#### Simple setup:
+* `Use simple setup method?` y    
+* `Please enter the currency that you want to mine:` Enter currency or mining algorithm  
+* `Enter pool address (pool address:port):` Enter pool connection address:port  
+* `Username (wallet address or pool login):` Enter wallet address
+* `Password (mostly empty or x):` press Enter  
+* `Does this pool port support TLS/SSL? Use no if unknown. (y/N):` press y or n  
+
+#### Advanced setup:
+* `Use simple setup method?` n  
+* `Do you want to use the HTTP interface? Unlike the screen display, browser interface is not affected by the GPU lag. If you don't want to use it, please enter 0, otherwise enter port number that the miner should listen on` 5656
+* `Please enter the currency that you want to mine:` Enter currency or mining algorithm
+* `Enter pool address (pool address:port):` Enter pool connection address:port 
+* `Username (wallet address or pool login):` Enter wallet address
+* `Password (mostly empty or x):` press Enter
+* `Rig identifier for pool-side statistics (needs pool support). Can be empty:` Enter rig name or press Enter
+* `Does this pool port support TLS/SSL? Use no if unknown. (y/N)` Enter y or n
+* `Do you want to use nicehash on this pool? (y/N)` n
+* `Do you want to use multiple pools? (y/N)` Enter y if you want to set up a backup pool, otherwise n
+
+
+## Additional Guides and Feedback
+[<img src="_img/stak-yt-cover.jpg">](https://www.youtube.com/c/xmrstak)
+###### Video by Crypto Sewer
+
+To improve our support we created the [Xmr-Stak forum](https://www.reddit.com/r/XmrStak). Check it out if you have a problem, or if you are looking for the most up-to-date config for your card and [guides](https://www.reddit.com/r/XmrStak/wiki/index).
+
+ <table>
+     <tr>
+         <td align="center"><a href=usage.md><img src="_img/usage-green.png"></a></td>
+         <td align="center"><a href=compile/compile.md><img src="_img/how-to-compile-green.png"></a></td>
+         <td align="center"><a href=tuning.md><img src="_img/fine-tuning-green.png"></a></td>
+         <td align="center"><a href=troubleshooting.md><img src="_img/troubleshooting-green.png"></a></td>
+         <td align="center"><a href=FAQ.md><img src="_img/faq-green.png"></a></td>
+     </tr>
+ </table>
+
+## Default Developer Donation
+By default, the miner will donate 2% of the hashpower (2 minutes in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries.
+
+If you want to donate directly to support further development, here is my wallet
+
+fireice-uk:
+```
+4581HhZkQHgZrZjKeCfCJxZff9E3xCgHGF25zABZz7oR71TnbbgiS7sK9jveE6Dx6uMs2LwszDuvQJgRZQotdpHt1fTdDhk
+```
+
+psychocrypt:
+```
+45tcqnJMgd3VqeTznNotiNj4G9PQoK67TGRiHyj6EYSZ31NUbAfs9XdiU5squmZb717iHJLxZv3KfEw8jCYGL5wa19yrVCn
+```
\ No newline at end of file
diff --git a/doc/_img/2ragerx-btn.png b/doc/_img/2ragerx-btn.png
new file mode 100644
index 000000000..1c0edd98c
Binary files /dev/null and b/doc/_img/2ragerx-btn.png differ
diff --git a/doc/_img/2xmr-stak-btn.png b/doc/_img/2xmr-stak-btn.png
new file mode 100644
index 000000000..7626e27c1
Binary files /dev/null and b/doc/_img/2xmr-stak-btn.png differ
diff --git a/doc/_img/YT.png b/doc/_img/YT.png
new file mode 100644
index 000000000..cf7a869a2
Binary files /dev/null and b/doc/_img/YT.png differ
diff --git a/doc/_img/cpu.png b/doc/_img/cpu.png
new file mode 100644
index 000000000..6a370fbc9
Binary files /dev/null and b/doc/_img/cpu.png differ
diff --git a/doc/_img/faq-green.png b/doc/_img/faq-green.png
new file mode 100644
index 000000000..440a855b2
Binary files /dev/null and b/doc/_img/faq-green.png differ
diff --git a/doc/_img/faq.png b/doc/_img/faq.png
new file mode 100644
index 000000000..83167e3c7
Binary files /dev/null and b/doc/_img/faq.png differ
diff --git a/doc/_img/features-xmr-stak.png b/doc/_img/features-xmr-stak.png
new file mode 100644
index 000000000..ef75a3b14
Binary files /dev/null and b/doc/_img/features-xmr-stak.png differ
diff --git a/doc/_img/features.png b/doc/_img/features.png
new file mode 100644
index 000000000..37c877291
Binary files /dev/null and b/doc/_img/features.png differ
diff --git a/doc/_img/fee.png b/doc/_img/fee.png
new file mode 100644
index 000000000..cd3cdaf00
Binary files /dev/null and b/doc/_img/fee.png differ
diff --git a/doc/_img/fine-tuning-green.png b/doc/_img/fine-tuning-green.png
new file mode 100644
index 000000000..b58184bfa
Binary files /dev/null and b/doc/_img/fine-tuning-green.png differ
diff --git a/doc/_img/fine-tuning.png b/doc/_img/fine-tuning.png
new file mode 100644
index 000000000..6b817cffe
Binary files /dev/null and b/doc/_img/fine-tuning.png differ
diff --git a/doc/_img/gpu.png b/doc/_img/gpu.png
new file mode 100644
index 000000000..4d5578007
Binary files /dev/null and b/doc/_img/gpu.png differ
diff --git a/doc/_img/header.png b/doc/_img/header.png
new file mode 100644
index 000000000..8c9eeefad
Binary files /dev/null and b/doc/_img/header.png differ
diff --git a/doc/_img/how-to-compile-green.png b/doc/_img/how-to-compile-green.png
new file mode 100644
index 000000000..e82c8b693
Binary files /dev/null and b/doc/_img/how-to-compile-green.png differ
diff --git a/doc/_img/how-to-compile.png b/doc/_img/how-to-compile.png
new file mode 100644
index 000000000..a54603484
Binary files /dev/null and b/doc/_img/how-to-compile.png differ
diff --git a/doc/_img/html_reports.png b/doc/_img/html_reports.png
new file mode 100644
index 000000000..2d17bc1bf
Binary files /dev/null and b/doc/_img/html_reports.png differ
diff --git a/doc/img/interleave.png b/doc/_img/interleave.png
similarity index 100%
rename from doc/img/interleave.png
rename to doc/_img/interleave.png
diff --git a/doc/_img/menu-donations-green.png b/doc/_img/menu-donations-green.png
new file mode 100644
index 000000000..a299980d3
Binary files /dev/null and b/doc/_img/menu-donations-green.png differ
diff --git a/doc/_img/menu-donations.png b/doc/_img/menu-donations.png
new file mode 100644
index 000000000..f73facf6f
Binary files /dev/null and b/doc/_img/menu-donations.png differ
diff --git a/doc/_img/menu-features-green.png b/doc/_img/menu-features-green.png
new file mode 100644
index 000000000..527d68d4c
Binary files /dev/null and b/doc/_img/menu-features-green.png differ
diff --git a/doc/_img/menu-features.png b/doc/_img/menu-features.png
new file mode 100644
index 000000000..bcf71064d
Binary files /dev/null and b/doc/_img/menu-features.png differ
diff --git a/doc/_img/menu-get-miner-green.png b/doc/_img/menu-get-miner-green.png
new file mode 100644
index 000000000..9e3bd5753
Binary files /dev/null and b/doc/_img/menu-get-miner-green.png differ
diff --git a/doc/_img/menu-get-miner.png b/doc/_img/menu-get-miner.png
new file mode 100644
index 000000000..891a35f16
Binary files /dev/null and b/doc/_img/menu-get-miner.png differ
diff --git a/doc/_img/menu-support-green.png b/doc/_img/menu-support-green.png
new file mode 100644
index 000000000..3db8e76ef
Binary files /dev/null and b/doc/_img/menu-support-green.png differ
diff --git a/doc/_img/menu-support.png b/doc/_img/menu-support.png
new file mode 100644
index 000000000..5cd80e42f
Binary files /dev/null and b/doc/_img/menu-support.png differ
diff --git a/doc/_img/menu-supported-coins-green.png b/doc/_img/menu-supported-coins-green.png
new file mode 100644
index 000000000..8678ea444
Binary files /dev/null and b/doc/_img/menu-supported-coins-green.png differ
diff --git a/doc/_img/menu-supported-coins.png b/doc/_img/menu-supported-coins.png
new file mode 100644
index 000000000..aabc37283
Binary files /dev/null and b/doc/_img/menu-supported-coins.png differ
diff --git a/doc/_img/ragerx-btn.png b/doc/_img/ragerx-btn.png
new file mode 100644
index 000000000..d08e245fc
Binary files /dev/null and b/doc/_img/ragerx-btn.png differ
diff --git a/doc/_img/ragerx.png b/doc/_img/ragerx.png
new file mode 100644
index 000000000..bc2453d2a
Binary files /dev/null and b/doc/_img/ragerx.png differ
diff --git a/doc/_img/rx.png b/doc/_img/rx.png
new file mode 100644
index 000000000..d9c4c3dfa
Binary files /dev/null and b/doc/_img/rx.png differ
diff --git a/doc/_img/split.png b/doc/_img/split.png
new file mode 100644
index 000000000..11a8635b9
Binary files /dev/null and b/doc/_img/split.png differ
diff --git a/doc/_img/stak-yt-cover.jpg b/doc/_img/stak-yt-cover.jpg
new file mode 100644
index 000000000..ff21acebf
Binary files /dev/null and b/doc/_img/stak-yt-cover.jpg differ
diff --git a/doc/_img/troubleshooting-green.png b/doc/_img/troubleshooting-green.png
new file mode 100644
index 000000000..d36cec8b8
Binary files /dev/null and b/doc/_img/troubleshooting-green.png differ
diff --git a/doc/_img/troubleshooting.png b/doc/_img/troubleshooting.png
new file mode 100644
index 000000000..e57eda740
Binary files /dev/null and b/doc/_img/troubleshooting.png differ
diff --git a/doc/_img/usage-green.png b/doc/_img/usage-green.png
new file mode 100644
index 000000000..c60b9a432
Binary files /dev/null and b/doc/_img/usage-green.png differ
diff --git a/doc/_img/usage.png b/doc/_img/usage.png
new file mode 100644
index 000000000..d9421ba66
Binary files /dev/null and b/doc/_img/usage.png differ
diff --git a/doc/_img/xmr-stak-btn-active.png b/doc/_img/xmr-stak-btn-active.png
new file mode 100644
index 000000000..68520be91
Binary files /dev/null and b/doc/_img/xmr-stak-btn-active.png differ
diff --git a/doc/_img/xmr-stak-btn.png b/doc/_img/xmr-stak-btn.png
new file mode 100644
index 000000000..0356f41aa
Binary files /dev/null and b/doc/_img/xmr-stak-btn.png differ
diff --git a/doc/_img/xmr-stak-cpu-connection.png b/doc/_img/xmr-stak-cpu-connection.png
new file mode 100644
index 000000000..d07a8d0a9
Binary files /dev/null and b/doc/_img/xmr-stak-cpu-connection.png differ
diff --git a/doc/_img/xmr-stak-cpu-hashrate.png b/doc/_img/xmr-stak-cpu-hashrate.png
new file mode 100644
index 000000000..488a34825
Binary files /dev/null and b/doc/_img/xmr-stak-cpu-hashrate.png differ
diff --git a/doc/_img/xmr-stak-cpu-results.png b/doc/_img/xmr-stak-cpu-results.png
new file mode 100644
index 000000000..7244f9579
Binary files /dev/null and b/doc/_img/xmr-stak-cpu-results.png differ
diff --git a/doc/_img/xmr-stak-rx-btn-inactive.png b/doc/_img/xmr-stak-rx-btn-inactive.png
new file mode 100644
index 000000000..1644a9505
Binary files /dev/null and b/doc/_img/xmr-stak-rx-btn-inactive.png differ
diff --git a/doc/_img/xmr-stak-rx-btn.png b/doc/_img/xmr-stak-rx-btn.png
new file mode 100644
index 000000000..39f0c87f7
Binary files /dev/null and b/doc/_img/xmr-stak-rx-btn.png differ
diff --git a/doc/_img/xmrig.png b/doc/_img/xmrig.png
new file mode 100644
index 000000000..cdeaa4501
Binary files /dev/null and b/doc/_img/xmrig.png differ
diff --git a/doc/compile.md b/doc/compile/compile.md
similarity index 100%
rename from doc/compile.md
rename to doc/compile/compile.md
diff --git a/doc/compile_FreeBSD.md b/doc/compile/compile_FreeBSD.md
similarity index 100%
rename from doc/compile_FreeBSD.md
rename to doc/compile/compile_FreeBSD.md
diff --git a/doc/compile_Linux.md b/doc/compile/compile_Linux.md
similarity index 100%
rename from doc/compile_Linux.md
rename to doc/compile/compile_Linux.md
diff --git a/doc/compile_Windows.md b/doc/compile/compile_Windows.md
similarity index 92%
rename from doc/compile_Windows.md
rename to doc/compile/compile_Windows.md
index 64d68bab1..37925576a 100644
--- a/doc/compile_Windows.md
+++ b/doc/compile/compile_Windows.md
@@ -111,6 +111,15 @@ Do not follow old information that you need the AMD APP SDK. AMD has removed the
 
   cd bin\Release
 
+  copy C:\xmr-stak-dep\openssl\bin\* .
+  ```
+- To exclude some of the dependencies, use the commands below, which set the corresponding `*_ENABLE` options to `OFF`:
+  ```
+  cmake -G "Visual Studio 15 2017 Win64" -T v141,host=x64 -DCMAKE_BUILD_TYPE=Release -DMICROHTTPD_ENABLE=OFF -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF ..
+  cmake --build . --config Release --target clean
+  cmake --build . --config Release --target install
+  cd bin\Release
+
   copy C:\xmr-stak-dep\openssl\bin\* .
   ```
 - Miner is by default compiled for NVIDIA GPUs (if CUDA is installed), AMD GPUs (if the AMD OCL-SDK_light is installed) and CPUs.
diff --git a/doc/compile_macOS.md b/doc/compile/compile_macOS.md
similarity index 100%
rename from doc/compile_macOS.md
rename to doc/compile/compile_macOS.md
diff --git a/doc/troubleshooting.md b/doc/troubleshooting.md
new file mode 100644
index 000000000..fb0dc88ce
--- /dev/null
+++ b/doc/troubleshooting.md
@@ -0,0 +1,119 @@
+# Troubleshooting
+To improve our support we created [Xmr-Stak forum](https://www.reddit.com/r/XmrStak). Check it out if you have a problem, or you are looking for most up to date config for your card and [guides](https://www.reddit.com/r/XmrStak/wiki/index).
+
+
+### 1. CL_MEM_OBJECT_ALLOCATION_FAILURE when calling clEnqueue
+This error means that the GPU can't allocate the amount of memory requested by your config. There are three known solutions to this problem:
+
+* Check whether you are accidentally using too many threads per GPU (check the *index* values in amd.txt)
+* Your `intensity` value in amd.txt is too high - try reducing it to a lower value (a multiple of `worksize`)
+* If you are using Windows - you may have not enough virtual memory in system. Add virtual memory (don't be afraid if it goes up to 60gb per 6 GPU rig)
+
+ 
+
+### 2. GPU is not detected
+Check if you have antivirus software turned on. If yes - it could delete some .dll files (for example  xmrstak\_cuda\_backend\_cuda10\_0.dll)
+
+ 
+
+### 3. Illegal Instruction
+This typically means you are trying to run it on a CPU that does not have [AES](https://en.wikipedia.org/wiki/AES_instruction_set). This only happens on older versions of the miner; newer versions give a better error message (but still won't work, since your CPU doesn't support the required instructions).
+
+ 
+
+### 4.  Internal compiler error
+Seeing `g++: internal compiler error: Killed (program cc1plus)` usually means there is not enough RAM to compile. 1 GB of RAM should be enough (on a clean Ubuntu 16.04).
+
+ 
+
+### 5. Invalid Result GPU ID
+This error can be caused by several reasons, here is most common, known successful practices how to fix it:
+
+* **Hardware problem: overclock/overvoltage/undervoltage** \- try to use stock clocks and voltages.
+* **Software problem: drivers** \- try to change driver versions (for AMD gpu most commonly stable versions are: blockchain drivers or 18.6.1)
+* **Miner misconfiguration** \- try to reduce `intensity` (if AMD) or `threads` or `bfactor` (if NVIDIA) in config file.
+
+If you still receive these errors, [report please the issue](https://github.com/fireice-uk/xmr-stak/issues).
+
+ 
+### 6. IP is banned
+Pool has banned your IP, This can be caused by several reasons:
+
+* You selected wrong pool port or the static diff is too low. (Learn more about [pool ports and diff](https://www.reddit.com/r/XmrStak/wiki/guides/other-questions#wiki_1._pool_ports_and_difficulty))
+* You had too many [invalid shares \[8\]](https://www.reddit.com/r/XmrStak/wiki/troubleshooting#wiki_8._invalid_result_gpu_id)
+
+ 
+
+### 7. MEMORY ALLOC FAILED: mmap failed
+On Linux you will need to configure large page support and increase your memlock limit (`ulimit -l`).
+
+Never put settings directly into `/etc/sysctl.conf` or `/etc/security/limits.conf`  as those are system defaults and can be replaced in upgrades, and custom settings in that file are deprecated in all distros since at least wheezy/trusty (has been illegal in RedHat based distros for longer than that), and will be even more deprecated with systemd (it no longer even reads sysctl.conf, ONLY sysctl.d files, for example - there is a link to the old `/etc/sysctl.conf` for backward compatibility but that can go away at any time). Also adding to `/etc/rc.local` is extra incorrect, systemd does not even use that file anymore (once the sysvinit compatibility layer is gone, rc.local will no longer work). To check current settings, run `/sbin/sysctl vm.nr_hugepages ; ulimit -l` as whatever user you will run xmr-stak  as (example shows bad/low sample defaults):
+
+    $ /sbin/sysctl vm.nr_hugepages ; ulimit -l vm.nr_hugepages = 0 16 
+
+To set large page support, add the following lines to `/etc/sysctl.d/60-hugepages.conf`:
+
+    vm.nr_hugepages=128 
+
+You WILL need to run `sudo sysctl --system` for these settings to take effect on your system (or reboot). In some cases (many threads, very large CPU, etc) you may need more than 128 (try 256 if there are still complaints from thread inits)
+
+To increase the memlock (`ulimit -l`), add following lines to `/etc/security/limits.d/60-memlock.conf`:
+
+    *    - memlock 262144 root - memlock 262144 
+
+You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session). Recheck after completing these steps to validate:
+
+    $ /sbin/sysctl vm.nr_hugepages ; ulimit -l vm.nr_hugepages = 128 262144 
+
+You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons. Also running as root does not properly get around the `ulimit -l` being large enough (and limits `*` does not apply to `root` either, it must be specified explicitly).
+
+
+### 8. msvcp140.dll and vcruntime140.dll are not available
+Download and install this [runtime package](https://go.microsoft.com/fwlink/?LinkId=746572) from Microsoft.
+
+>***Warning:*** *Do NOT use "missing dll" sites - dlls are exe files with another name, and it is a fairly safe bet that any dll on a shady site like that will be trojaned. Please download the official runtimes from Microsoft above.*
+
+ 
+
+### 9. Obtaining SeLockMemoryPrivilege failed.
+For professional versions of Windows see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx). Make sure to reboot afterwards!
+
+**For Windows 7/10 Home:**
+
+1. Download and install [Windows Server 2003 Resource Kit Tools](https://www.microsoft.com/en-us/download/details.aspx?id=17657). Ignore any incompatibility warning during installation.
+2. Open cmd or PowerShell as an administrator.
+3. Use `ntrights -u %USERNAME% +r SeLockMemoryPrivilege` where `%USERNAME%` is the user that will be running the program.
+4. Reboot.
+
+Reference: [http://rybkaforum.net/cgi-bin/rybkaforum/topic\_show.pl?pid=259791#pid259791](http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid259791)
+
+*Warning: Do not download ntrights.exe from any site other than the official Microsoft download page.*
+
+
+### 10. Share rejected - Low diff share
+Check if a coin that you are mining has changed algorithm in one of its forks and you use right hashing algorithm in pools.txt (parameter: `currency`).
+
+ 
+
+### 11. VirtualAlloc failed
+If you set up the user rights properly ([see issue #7](https://www.reddit.com/r/XmrStak/wiki/troubleshooting#wiki_7._memory_alloc_failed.3A_mmap_failed)), and your system has 4-8GB of RAM (and 50%+ is in use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation.
+
+If that happens, disable all auto-starting applications and run the miner after a reboot.
+
+ 
+### 12. (Ubuntu compiling) - Nvidia insufficient driver
+If you have this error after compiling xmr-stak in Ubuntu - make sure you have the latest drivers and not X.org.X Nouveau or v390. Install them manually or with [cuda package](https://www.reddit.com/r/XmrStak/wiki/guides/startup#wiki_2._ubuntu_18.10_setup_.2B_nvidia_.28compiling_from_source.29)
+
+ 
+
+### 13. (Ubuntu compiling) - Could NOT find OpenCL (missing: OpenCL_LIBRARY OpenCL_INCLUDE_DIR) Cmake error at CmakeLists.txt
+When [compiling in Ubuntu with Nvidia](https://www.reddit.com/r/XmrStak/wiki/guides/startup#wiki_2._ubuntu_18.10_setup_.2B_nvidia_.28compiling_from_source.29) devices, and running `cmake ..` command add additional param that disables OpenCL:  `cmake .. -DOpenCL_ENABLE=OFF` 
+
+ 
+
+### 14. (Ubuntu compiling) - gcc v8 is not supported
+Cuda 10 ships with gcc and g++ ver.8 which is not supported. Make sure you [set gcc and g++ to v6](https://www.reddit.com/r/XmrStak/wiki/guides/startup#wiki_2.2_compiling) before compiling. (step 2.2.6)
+
+
+
+
diff --git a/doc/tuning.md b/doc/tuning.md
index 6d07d4ddc..a504b85ef 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -3,41 +3,59 @@
 ## Content Overview
 * [Benchmark](#benchmark)
 * [Windows](#windows)
+* [Managing GPUs](#managing-gpus)
 * [NVIDIA Backend](#nvidia-backend)
   * [Choose Value for `threads` and `blocks`](#choose-value-for-threads-and-blocks)
   * [Add more GPUs](#add-more-gpus)
 * [AMD Backend](#amd-backend)
   * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize)
-  * [Add more GPUs](#add-more-gpus)
-  * [Two Threads per GPU](two-threads-per-gpu)
-  * [Interleave Tuning](interleave-tuning )
+  * [Two Threads per GPU](#two-threads-per-gpu)
+  * [Interleave Tuning](#interleave-tuning)
   * [disable comp_mode](#disable-comp_mode)
-  * [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern)
+  * [Auto-tune](#auto-tune)
+  * [Change the scratchpad memory pattern](#change-the-scratchpad-memory-pattern)
   * [Increase Memory Pool](#increase-memory-pool)
   * [Scratchpad Indexing](#scratchpad-indexing)
 * [CPU Backend](#cpu-backend)
   * [Choose Value for `low_power_mode`](#choose-value-for-low_power_mode)
 
 ## Benchmark
-To benchmark the miner speed there are two ways.
-  - Mine against a pool end press the key `h` after 30 sec to see the hash report.
-  - Start the miner with the cli option `--benchmark BLOCKVERSION`. The miner will not connect to any pool and performs a 60sec performance benchmark with all enabled back-ends.
+You can benchmark the miner in two ways:
+  - Edit `config.txt` and set `verbose_level` to 4 and `h_print_time` to 30 and start the miner. You will see hash report each 30 seconds.
+  - Start the miner with the cli option `--benchmark BLOCKVERSION`. The miner will not connect to any pool and performs a 60sec performance benchmark with all enabled backends.
 
 ## Windows
 "Run As Administrator" prompt (UAC) confirmation is needed to use large pages on Windows 7.
 On Windows 10 it is only needed once to set up the account to use them.
 Disable the dialog with the command line option `--noUAC`
 
+### Managing GPUs
+
+To turn a GPU on or off you need to add or remove its config set in `gpu_threads_conf`.
+`index` is the number of the GPU; the index order does not necessarily follow the order from `nvidia-smi` or the order shown in Windows.
+
+```
+"gpu_threads_conf" :
+[
+    { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
+      "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
+    },
+    { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
+      "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
+    },
+],
+```
+
 ## NVIDIA Backend
 
 By default the NVIDIA backend can be tuned in the config file `nvidia.txt`
 
 ### Choose Value for `threads` and `blocks`
 
-The optimal parameter for the `threads` and `blocks` option in `config.txt` depend on your GPU.
-For all GPU's with a compute capability `>=2.0` and `<6.0` there is a restriction of the amount of RAM that can be used for the mining algorithm.
-The maximum RAM that can be used must be less than 2GB (e.g. GTX TITAN) or 1GB (e.g. GTX 750-TI).
-The amount of RAM used for mining can be changed with `"threads" : T, "blocks : B"`.
+The optimal values for the `threads` and `blocks` parameters in `nvidia.txt` depend on your GPU model and selected mining algorithm.
+For all GPUs with a compute capability `>=2.0` and `<6.0` there is a restriction on the amount of vRAM that can be used for the mining algorithm.
+The maximum vRAM that can be used must be less than 2GB (e.g. GTX TITAN) or 1GB (e.g. GTX 750-TI).
+The amount of vRAM used for mining can be changed with `"threads" : T, "blocks" : B`.
   - `T` = threads used per block
   - `B` = CUDA blocks started (should be a multiple of the multiprocessors `M` on the GPU)
 
@@ -48,23 +66,6 @@ and full fill all restrictions `16 * 48 * 2 = 1536` and `48 mod 24 = 0`.
 
 The memory limit for NVIDIA Pascal GPUs is `16` GiB if the newest CUDA driver is used.
 
-### Add More GPUs
-
-To add a new GPU you need to add a new config set to `gpu_threads_conf`.
-`index` is the number of the gpu, the index order not follow the order from `nvidia-smi` or the order shown in windows.
-
-```
-"gpu_threads_conf" :
-[
-    { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
-      "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
-    },
-    { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
-      "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
-    },
-],
-```
-
 ## AMD Backend
 
 By default the AMD backend can be tuned in the config file `amd.txt`
@@ -75,38 +76,16 @@ Intensity means the number of threads used to mine. The maximum intensity is GPU
 `worksize` is the number of threads working together to increase the miner performance.
 In the most cases a `worksize` of `16` or `8` is optimal.
 
-### Add More GPUs
-
-To add a new GPU you need to add a new config set to `gpu_threads_conf`. `index` is the OpenCL index of the gpu.
-`platform_index`is the index of the OpenCL platform (Intel / AMD / Nvidia).
-If you are unsure of either GPU or platform index value, you can use `clinfo` tool that comes with AMD APP SDK to dump the values.
-
-```
-"gpu_threads_conf" :
-[
-    { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
-      "interleave" : 40
-    },
-    { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
-      "interleave" : 40
-    },
-],
-
-"platform_index" : 0,
-```
 
 ### Two Threads per GPU
 
-Some GPUs like AMD Vega can mine faster if two threads are using the same GPU.
-Use the auto generated config as base and repeat the config entry for a GPU.
-If the attribute `index` is used twice than two threads will use one GPU.
-Take care that the required memory usage on the GPU will also double.
-Therefore adjust your intensity by hand.
+Some AMD GPUs can mine faster on some mining algorithms if two threads are using the same GPU.
+If you have `amd.txt` config with one `index` entry per GPU - duplicate these entries to run 2 threads per GPU.
+*Notice*:  Keep in mind that the memory usage on the GPU will also double - therefore adjust your `intensity` by hand.
 
+Example of 2-threaded config:
 ```
-"gpu_threads_conf" :
+"gpu_threads_conf" :
 [
     { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
       "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
@@ -123,43 +102,88 @@ Therefore adjust your intensity by hand.
 
 ### Interleave Tuning
 
-Interleave controls when a worker thread is starting to calculate a bunch of hashes 
-if two worker threads are used to utilize one GPU.
-This option has no effect if only one worker thread is used per GPU.
+**Note 1:** Interleaving is available for AMD GPUs only.
 
-![Interleave](img/interleave.png) 
+**Note 2:** Interleaving has an effect only when 2+ threads are used per GPU.
 
-Interleave defines how long a thread needs to wait to start the next hash calculation relative to the last started worker thread.
-To choose a interleave value larger than 50% makes no sense because than the gpu will not be utilized well enough.
-In the most cases the default 40 is a good value but on some systems e.g. Linux Rocm 1.9.1 driver with RX5XX you need to adjust the value.
-If you get many interleave message in a row (over 1 minute) you should adjust the value.
+Interleave controls when a worker thread starts to calculate hashes if two worker threads are used to utilize one GPU. This parameter is designed to reduce the total idle time of the GPU while mining.
 
-```
-OpenCL Interleave 0|1: 642/2400.50 ms - 30.1
-OpenCL Interleave 0|0: 355/2265.05 ms - 30.2
-OpenCL Interleave 0|1: 221/2215.65 ms - 30.2
-```
+![Interleave](_img/interleave.png) 
 
-description:
-```
-<gpu id>|<thread id on the gpu>: <last delay>/<average calculation per hash bunch> ms - <interleave value>
+**1. Reading and understanding the log:**
 
-```
-`last delay` should gou slowly to 0.
-If it goes down and than jumps to a very large value multiple times within a minute you should reduce the intensity by 5.
-The `intensity value` will automatically go up and down within the range of +-5% to adjust kernel run-time fluctuations.
-Automatic adjustment is disabled as long as `auto-tuning` is active and will be started after it is finished. 
-If `last delay` goes down to 10ms and the messages stops and repeated from time to time with delays up to 15ms you will have already a good value.
+`OpenCL Interleave 0|0: 265/1372.30 ms - 40.1`  
+`OpenCL Interleave 0|1: 125/1330.10 ms - 40.2`  
+`OpenCL Interleave 0|0: 74/1323.67 ms - 40.2`  
+`OpenCL Interleave 0|1: 43/1312.01 ms - 40.2`  
+`OpenCL Interleave 0|1: 16/1283.20 ms - 40.2`  
+
+Reads as:
+`OpenCL Interleave GPU ID|Thread ID: last delay/average calculation time per hash bunch - interleave value`
+
+
+**2. Do I need to adjust it?**  
+In general, interleaving can be used as an indication of how well 2-threading works with your GPU at the current set of settings (including GPU power profile, miner settings, drivers). The default value `"interleave" : 40` in `amd.txt` works well in most cases.  
+
+2.1 Optimal setup: After you started mining you have `last delay` value reduced over time to minimum possible value and stays at it. The best scenario is when `last delay` value settled around 10-15 and interleave messages appear rarely. The reported hashrate will be close to max. of GPU capabilities.  
+
+2.2 Not optimal setup: After you started mining you have `last delay` value reducing over time and jumping back to high values, or rising after the start of mining. The reported hashrate will be lower compared to max. possible.  
+
+**3. Adjusting Interleaving and optimizing hashrate**  
+**Note:** setting the `interleave` value in amd.txt higher than 50 makes no practical sense.
+
+If you faced situation described in 2.2 then you need to keep in mind that this can be caused by several possible reasons, so treat them accordingly and start miner after each attempt and check logs and hashrate:
+
+- Miner misconfiguration 1: Adjust "interleave" in amd.txt by couple points +/-  
+- Miner misconfiguration 2: Adjust "intensity" in amd.txt by setting lower value (multiple to "worksize" value)  
+- GPU overclock: Reduce overclock/overvoltage values of GPU memory and GPU core  
+- Drivers issue: Try [reinstalling your drivers](https://www.amd.com/en/support) (there are 3 possible options to try: blockchain drivers, v18.6.1, or newest version)
+
 
 ### disable comp_mode
 
 `comp_mode` means compatibility mode and removes some checks in compute kernel those takes care that the miner can be used on a wide range of AMD/OpenCL GPU devices.
 To avoid miner crashes the `intensity` should be a multiple of `worksize` if `comp_mode` is `false`.
 
-### change the scratchpad memory pattern
+### Auto tune
+
+**Note:** This feature is available for AMD GPUs only.
+
+The auto-tuning feature may help you speed up the process of finding the optimal intensity for your GPU (vs a manual check, in case you want to compare the autogenerated intensity with the best-performing value).
+
+When set, miner will perform several (defined by user) rounds per each intensity check of given range. When setting number of rounds - keep in mind that you want to have a balance of speed and reliability of the checking.
+
+After setting number of checks per intensity value, you will need to set ceiling value after which the miner will stop checking intensity values.
+
+**1.Enabling and configuring auto-tune**  
+Navigate to amd.txt config file in miner's folder, find (in the bottom part) parameter "auto_tune" : 0, and set it to "auto_tune" : 6, (6-10 rounds per intensity value suits most cases.)
+Set autogenerated value of "intensity" : X, for each thread in amd.txt to slightly higher level (e.g. from 890 to 1000)
+Start xmr-stak.exe
+
+**2. Reading and understanding the log**  
+Here is an example of log for 1 GPU with 2 threads (your values will vary):
+`OpenCL 0|0: auto-tune validate intensity 848|840`  
+`OpenCL 0|1: auto-tune validate intensity 848|840`  
+`OpenCL 0|0: auto-tune validate intensity 856|848`  
+`OpenCL 0|1: auto-tune validate intensity 856|848`  
+Reads as: `OpenCL GPU ID|Thread ID auto-tune validate intensity Currently checked value|last successfully checked value`
+
+After the checking, you will see
+
+`OpenCL 0|0: lock intensity at 896`  
+`OpenCL 0|1: lock intensity at 896`  
+Write down these locked intensity values and stop miner.
+
+**3. Finalizing setup**  
+Set "auto_tune" value (step 1.1) in `amd.txt` back to "auto_tune" : 0,  
+Enter locked intensity values from step 2.  
+Start miner.  
+
+### Change the scratchpad memory pattern
 
 By changing `strided_index` to `2` the number of contiguous elements (a 16 byte) for one miner thread can be fine tuned with the option `mem_chunk`.
 
+
 ### Increase Memory Pool
 
 By setting the following environment variables before the miner is started OpenCl allows the miner to more threads.
diff --git a/doc/usage.md b/doc/usage.md
index 82d26dcc5..800ff6949 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -1,9 +1,9 @@
-# HowTo Use xmr-stak
+# HowTo Use Xmr-Stak
 
 ## Content Overview
-* [Configuration](#configuration)
+* [Configurations](#configurations)
 * [Usage on Windows](#usage-on-windows)
-* [Usage on Linux](#usage-on-linux)
+* [Usage on Linux & macOS](#usage-on-linux--macos)
 * [Command Line Options](#command-line-options)
 * [Use different backends](#use-different-backends)
 * [HTML and JSON API report configuraton](#html-and-json-api-report-configuraton)
@@ -77,6 +77,6 @@ Debug the docker image by getting inside:
 docker run --entrypoint=/bin/bash --rm -it -u $(id -u):$(id -g) --name fireice-uk/xmr-stak -v "$PWD":/mnt xmr-stak
 ```
 
-## HTML and JSON API report configuraton
+## HTML and JSON API report configuration
 
 To configure the reports shown on the [README](../README.md) side you need to edit the httpd_port variable. Then enable wifi on your phone and navigate to [miner ip address]:[httpd_port] in your phone browser. If you want to use the data in scripts, you can get the JSON version of the data at url [miner ip address]:[httpd_port]/api.json
diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.cpp b/xmrstak/backend/amd/OclCryptonightR_gen.cpp
index ccb836e41..2a60c46d9 100644
--- a/xmrstak/backend/amd/OclCryptonightR_gen.cpp
+++ b/xmrstak/backend/amd/OclCryptonightR_gen.cpp
@@ -1,19 +1,18 @@
-#include <string>
-#include <sstream>
-#include <mutex>
 #include <cstring>
+#include <mutex>
+#include <sstream>
+#include <string>
 #include <thread>
 
-
 #include "xmrstak/backend/amd/OclCryptonightR_gen.hpp"
 #include "xmrstak/backend/cpu/crypto/variant4_random_math.h"
-#include "xmrstak/misc/console.hpp"
 #include "xmrstak/cpputil/read_write_lock.h"
+#include "xmrstak/misc/console.hpp"
 
 #include <chrono>
-#include <thread>
 #include <iostream>
-
+#include <regex>
+#include <thread>
 
 namespace xmrstak
 {
@@ -22,16 +21,16 @@ namespace amd
 
 static std::string get_code(const V4_Instruction* code, int code_size)
 {
-    std::stringstream s;
+	std::stringstream s;
 
-	for (int i = 0; i < code_size; ++i)
+	for(int i = 0; i < code_size; ++i)
 	{
 		const V4_Instruction inst = code[i];
 
 		const uint32_t a = inst.dst_index;
 		const uint32_t b = inst.src_index;
 
-		switch (inst.opcode)
+		switch(inst.opcode)
 		{
 		case MUL:
 			s << 'r' << a << "*=r" << b << ';';
@@ -58,37 +57,39 @@ static std::string get_code(const V4_Instruction* code, int code_size)
 		s << '\n';
 	}
 
-    return s.str();
+	return s.str();
 }
 
 struct CacheEntry
 {
-    CacheEntry(xmrstak_algo algo, uint64_t height, size_t deviceIdx, cl_program program) :
-        algo(algo),
-        height(height),
-        deviceIdx(deviceIdx),
-        program(program)
-    {}
-
-    xmrstak_algo algo;
-    uint64_t height;
-    size_t deviceIdx;
-    cl_program program;
+	CacheEntry(xmrstak_algo algo, uint64_t height_offset, size_t deviceIdx, cl_program program) :
+		algo(algo),
+		height_offset(height_offset),
+		deviceIdx(deviceIdx),
+		program(program)
+	{
+	}
+
+	xmrstak_algo algo;
+	uint64_t height_offset;
+	size_t deviceIdx;
+	cl_program program;
 };
 
 struct BackgroundTaskBase
 {
-    virtual ~BackgroundTaskBase() {}
-    virtual void exec() = 0;
+	virtual ~BackgroundTaskBase() {}
+	virtual void exec() = 0;
 };
 
-template<typename T>
+template <typename T>
 struct BackgroundTask : public BackgroundTaskBase
 {
-    BackgroundTask(T&& func) : m_func(std::move(func)) {}
-    void exec() override { m_func(); }
+	BackgroundTask(T&& func) :
+		m_func(std::move(func)) {}
+	void exec() override { m_func(); }
 
-    T m_func;
+	T m_func;
 };
 
 static ::cpputil::RWLock CryptonightR_cache_mutex;
@@ -99,94 +100,113 @@ static std::mutex background_tasks_mutex;
 static std::vector<BackgroundTaskBase*> background_tasks;
 static std::thread* background_thread = nullptr;
 
+static cl_program search_program(
+	const GpuContext* ctx,
+	xmrstak_algo algo,
+	uint64_t height_offset,
+	bool lock_cache = true)
+{
+	if(lock_cache)
+		CryptonightR_cache_mutex.ReadLock();
+
+	// Check if the cache has this program
+	for(const CacheEntry& entry : CryptonightR_cache)
+	{
+		if((entry.algo == algo) && (entry.height_offset == height_offset) && (entry.deviceIdx == ctx->deviceIdx))
+		{
+			printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu found in cache", height_offset);
+			auto result = entry.program;
+			if(lock_cache)
+				CryptonightR_cache_mutex.UnLock();
+			return result;
+		}
+	}
+	if(lock_cache)
+		CryptonightR_cache_mutex.UnLock();
+
+	return nullptr;
+}
+
 static void background_thread_proc()
 {
-    std::vector<BackgroundTaskBase*> tasks;
-    for (;;) {
-        tasks.clear();
-        {
-            std::lock_guard<std::mutex> g(background_tasks_mutex);
-            background_tasks.swap(tasks);
-        }
-
-        for (BackgroundTaskBase* task : tasks) {
-            task->exec();
-            delete task;
-        }
+	std::vector<BackgroundTaskBase*> tasks;
+	for(;;)
+	{
+		tasks.clear();
+		{
+			std::lock_guard<std::mutex> g(background_tasks_mutex);
+			background_tasks.swap(tasks);
+		}
+
+		for(BackgroundTaskBase* task : tasks)
+		{
+			task->exec();
+			delete task;
+		}
 
 		std::this_thread::sleep_for(std::chrono::milliseconds(500));
-    }
+	}
 }
 
-template<typename T>
+template <typename T>
 static void background_exec(T&& func)
 {
-    BackgroundTaskBase* task = new BackgroundTask<T>(std::move(func));
+	BackgroundTaskBase* task = new BackgroundTask<T>(std::move(func));
 
-    std::lock_guard<std::mutex> g(background_tasks_mutex);
-    background_tasks.push_back(task);
-    if (!background_thread) {
-        background_thread = new std::thread(background_thread_proc);
-    }
+	std::lock_guard<std::mutex> g(background_tasks_mutex);
+	background_tasks.push_back(task);
+	if(!background_thread)
+	{
+		background_thread = new std::thread(background_thread_proc);
+	}
 }
 
 static cl_program CryptonightR_build_program(
-    const GpuContext* ctx,
-    xmrstak_algo algo,
-    uint64_t height,
-    uint32_t precompile_count,
-    std::string source_code,
-    std::string options)
+	const GpuContext* ctx,
+	xmrstak_algo algo,
+	uint64_t height_offset,
+	uint64_t height_chunk_size,
+	uint32_t precompile_count,
+	std::string source_code,
+	std::string options)
 {
-    std::vector<cl_program> old_programs;
-    old_programs.reserve(32);
-    {
+	std::vector<cl_program> old_programs;
+	old_programs.reserve(32);
+	{
 		CryptonightR_cache_mutex.WriteLock();
 
-        // Remove old programs from cache
-        for(size_t i = 0; i < CryptonightR_cache.size();)
-        {
-            const CacheEntry& entry = CryptonightR_cache[i];
-            if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
-            {
-                printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
-                old_programs.push_back(entry.program);
-                CryptonightR_cache[i] = std::move(CryptonightR_cache.back());
-                CryptonightR_cache.pop_back();
-            }
-            else
-            {
-                ++i;
-            }
-        }
+		// Remove old programs from cache
+		for(size_t i = 0; i < CryptonightR_cache.size();)
+		{
+			const CacheEntry& entry = CryptonightR_cache[i];
+			if((entry.algo == algo) && (entry.height_offset + (2 + precompile_count) * height_chunk_size < height_offset))
+			{
+				printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu released (old program)", entry.height_offset);
+				old_programs.push_back(entry.program);
+				CryptonightR_cache[i] = std::move(CryptonightR_cache.back());
+				CryptonightR_cache.pop_back();
+			}
+			else
+			{
+				++i;
+			}
+		}
 		CryptonightR_cache_mutex.UnLock();
-    }
-
-    for(cl_program p : old_programs) {
-        clReleaseProgram(p);
-    }
+	}
 
-    std::lock_guard<std::mutex> g1(CryptonightR_build_mutex);
+	for(cl_program p : old_programs)
+	{
+		clReleaseProgram(p);
+	}
 
-    cl_program program = nullptr;
-    {
-		CryptonightR_cache_mutex.ReadLock();
+	std::lock_guard<std::mutex> g1(CryptonightR_build_mutex);
 
-        // Check if the cache already has this program (some other thread might have added it first)
-        for (const CacheEntry& entry : CryptonightR_cache)
-        {
-            if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx))
-            {
-                program = entry.program;
-                break;
-            }
-        }
-		CryptonightR_cache_mutex.UnLock();
-    }
+	cl_program program = search_program(ctx, algo, height_offset);
 
-    if (program) {
-        return program;
-    }
+	if(program)
+	{
+		return program;
+	}
 
 	cl_int ret;
 	const char* source = source_code.c_str();
@@ -194,7 +214,7 @@ static cl_program CryptonightR_build_program(
 	program = clCreateProgramWithSource(ctx->opencl_ctx, 1, (const char**)&source, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L0,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
+		printer::inst()->print_msg(L0, "Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
 		return program;
 	}
 
@@ -202,11 +222,11 @@ static cl_program CryptonightR_build_program(
 	if(ret != CL_SUCCESS)
 	{
 		size_t len;
-		printer::inst()->print_msg(L0,"Error %s when calling clBuildProgram.", err_to_str(ret));
+		printer::inst()->print_msg(L0, "Error %s when calling clBuildProgram.", err_to_str(ret));
 
 		if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
+			printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
 			return program;
 		}
 
@@ -216,12 +236,12 @@ static cl_program CryptonightR_build_program(
 		if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
 		{
 			free(BuildLog);
-			printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
+			printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
 			return program;
 		}
 
 		printer::inst()->print_str("Build log:\n");
-		std::cerr<<BuildLog<<std::endl;
+		std::cerr << BuildLog << std::endl;
 
 		free(BuildLog);
 		return program;
@@ -232,61 +252,89 @@ static cl_program CryptonightR_build_program(
 	{
 		if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
+			printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
 			return program;
 		}
 		std::this_thread::sleep_for(std::chrono::milliseconds(1000));
-	}
-	while(status == CL_BUILD_IN_PROGRESS);
+	} while(status == CL_BUILD_IN_PROGRESS);
 
+	CryptonightR_cache_mutex.WriteLock();
+	auto cached_program = search_program(ctx, algo, height_offset, false);
 
-    printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height);
+	if(cached_program)
+	{
+		printer::inst()->print_msg(LDEBUG, "CryptonightR: release already existing program %llu", height_offset);
+		clReleaseProgram(program);
+		program = cached_program;
+	}
+	else
+	{
+		CryptonightR_cache.emplace_back(algo, height_offset, ctx->deviceIdx, program);
+		printer::inst()->print_msg(LDEBUG, "CryptonightR: cache compiled program for height_offset %llu", height_offset);
+	}
 
-	CryptonightR_cache_mutex.WriteLock();
-	CryptonightR_cache.emplace_back(algo, height, ctx->deviceIdx, program);
 	CryptonightR_cache_mutex.UnLock();
-    return program;
+	return program;
 }
 
-cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, uint32_t precompile_count, bool background)
+cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background)
 {
-	printer::inst()->print_msg(LDEBUG, "CryptonightR: start %llu released",height);
-
-    if (background) {
-        background_exec([=](){ CryptonightR_get_program(ctx, algo, height, precompile_count, false); });
-        return nullptr;
-    }
-
-    const char* source_code_template =
-        #include "amd_gpu/opencl/wolf-aes.cl"
-        #include "amd_gpu/opencl/cryptonight_r.cl"
-    ;
-    const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH";
-    const char* offset = strstr(source_code_template, include_name);
-    if (!offset)
-    {
-        printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo);
-        return nullptr;
-    }
-
-    V4_Instruction code[256];
-    int code_size;
-    switch (algo.Id())
-    {
-    case cryptonight_r_wow:
-        code_size = v4_random_math_init<cryptonight_r_wow>(code, height);
-        break;
-    case cryptonight_r:
-        code_size = v4_random_math_init<cryptonight_r>(code, height);
-        break;
-    default:
-        printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo);
-        return nullptr;
-    }
-
-    std::string source_code(source_code_template, offset);
-    source_code.append(get_code(code, code_size));
-    source_code.append(offset + sizeof(include_name) - 1);
+	if(background)
+	{
+		background_exec([=]() { CryptonightR_get_program(ctx, algo, height_offset, height_chunk_size, precompile_count, false); });
+		return nullptr;
+	}
+
+	auto program = search_program(ctx, algo, height_offset);
+
+	if(program != nullptr)
+		return program;
+
+	printer::inst()->print_msg(LDEBUG, "CryptonightR: create code for block %llu to %llu", height_offset, height_offset + height_chunk_size);
+
+	const char* source_code_definitions =
+#include "amd_gpu/opencl/cryptonight_r_def.rtcl"
+#include "amd_gpu/opencl/wolf-aes.cl"
+		;
+
+	const char* source_code_template =
+#include "amd_gpu/opencl/cryptonight_r.rtcl"
+		;
+	const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; // placeholder in the template that the generated code replaces
+	const char* offset = strstr(source_code_template, include_name);
+	if(!offset) // template without the placeholder: no program can be generated
+	{
+		printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.rtcl");
+		return nullptr;
+	}
+
+	std::string source_code(source_code_definitions);
+
+	for(uint64_t c = 0; c < height_chunk_size; ++c) // one kernel per height in [height_offset, height_offset + height_chunk_size)
+	{
+		V4_Instruction code[256];
+		int code_size;
+		switch(algo.Id())
+		{
+		case cryptonight_r_wow:
+			code_size = v4_random_math_init<cryptonight_r_wow>(code, height_offset + c);
+			break;
+		case cryptonight_r:
+			code_size = v4_random_math_init<cryptonight_r>(code, height_offset + c);
+			break;
+		default: // log the numeric id; the xmrstak_algo object itself is not a valid "%d" argument
+			printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", static_cast<int>(algo.Id()));
+			return nullptr;
+		}
+
+		std::string kernel_code(source_code_template, offset); // template text before the placeholder
+		kernel_code.append(get_code(code, code_size));
+		kernel_code.append(offset + sizeof(include_name) - 1); // template text after the placeholder
+
+		std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height_offset + c);
+
+		source_code += std::regex_replace(kernel_code, std::regex("cn1_cryptonight_r"), kernel_name); // give each height its own kernel name
+	}
 
 	// scratchpad size for the selected mining algorithm
 	size_t hashMemSize = algo.Mem();
@@ -324,28 +372,12 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t
 	if(algo == cryptonight_gpu)
 		options += " -cl-fp32-correctly-rounded-divide-sqrt";
 
+	program = search_program(ctx, algo, height_offset);
 
-    const char* source = source_code.c_str();
-
-    {
-		CryptonightR_cache_mutex.ReadLock();
-
-        // Check if the cache has this program
-        for (const CacheEntry& entry : CryptonightR_cache)
-        {
-            if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx))
-            {
-                printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height);
-				auto result = entry.program;
-				CryptonightR_cache_mutex.UnLock();
-                return result;
-            }
-        }
-		CryptonightR_cache_mutex.UnLock();
-
-    }
+	if(program != nullptr)
+		return program;
 
-    return CryptonightR_build_program(ctx, algo, height, precompile_count, source, options);
+	return CryptonightR_build_program(ctx, algo, height_offset, precompile_count, height_chunk_size, source_code, options);
 }
 
 } // namespace amd
diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.hpp b/xmrstak/backend/amd/OclCryptonightR_gen.hpp
index 7dce77b85..f8772b1f5 100644
--- a/xmrstak/backend/amd/OclCryptonightR_gen.hpp
+++ b/xmrstak/backend/amd/OclCryptonightR_gen.hpp
@@ -3,8 +3,8 @@
 #include "xmrstak/backend/cryptonight.hpp"
 
 #include <stdint.h>
-#include <vector>
 #include <string>
+#include <vector>
 
 #if defined(__APPLE__)
 #include <OpenCL/cl.h>
@@ -20,7 +20,7 @@ namespace amd
 {
 
 cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo,
-	uint64_t height, uint32_t precompile_count, bool background = false);
+	uint64_t height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background = false);
 
 } // namespace amd
 } // namespace xmrstak
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 9f3f75469..3c4384722 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -13,45 +13,43 @@
   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
   */
 
+#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp"
 #include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
-#include "xmrstak/picosha2/picosha2.hpp"
+#include "xmrstak/net/msgstruct.hpp"
 #include "xmrstak/params.hpp"
+#include "xmrstak/picosha2/picosha2.hpp"
 #include "xmrstak/version.hpp"
-#include "xmrstak/net/msgstruct.hpp"
-#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp"
 
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <math.h>
+#include <regex>
 #include <stdio.h>
 #include <string.h>
-#include <math.h>
-#include <iostream>
 #include <vector>
-#include <algorithm>
-#include <regex>
-#include <cassert>
-#include <algorithm>
 
 #include <fstream>
+#include <iostream>
 #include <sstream>
-#include <vector>
 #include <string>
-#include <iostream>
 #include <thread>
+#include <vector>
 
 #if defined _MSC_VER
 #include <direct.h>
 #elif defined __GNUC__
-#include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/types.h>
 #endif
 
-
 #ifdef _WIN32
 #include <windows.h>
 
 static inline void create_directory(std::string dirname)
 {
-    _mkdir(dirname.data());
+	_mkdir(dirname.data());
 }
 
 static inline void port_sleep(size_t sec)
@@ -59,8 +57,8 @@ static inline void port_sleep(size_t sec)
 	Sleep(sec * 1000);
 }
 #else
-#include <unistd.h>
 #include <pwd.h>
+#include <unistd.h>
 
 static inline void create_directory(std::string dirname)
 {
@@ -100,7 +98,7 @@ char* LoadTextFile(const char* filename)
 	flen = ftell(kernel);
 	fseek(kernel, 0, SEEK_SET);
 
-	out = (char*)malloc(flen+1);
+	out = (char*)malloc(flen + 1);
 	size_t r = fread(out, flen, 1, kernel);
 	fclose(kernel);
 
@@ -121,7 +119,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 
 	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &MaximumWorkSize, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when querying a device's max worksize using clGetDeviceInfo.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when querying a device's max worksize using clGetDeviceInfo.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -140,16 +138,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		 */
 		MaximumWorkSize /= 8;
 	}
-	printer::inst()->print_msg(L1,"Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize);
+	printer::inst()->print_msg(L1, "Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize);
 
 	if(ctx->workSize > MaximumWorkSize)
 	{
 		ctx->workSize = MaximumWorkSize;
-		printer::inst()->print_msg(L1,"Device %lu work size to large, reduce to %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize);
+		printer::inst()->print_msg(L1, "Device %lu work size to large, reduce to %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize);
 	}
 
 	const std::string backendName = xmrstak::params::inst().openCLVendor;
-	if( (ctx->stridedIndex == 2 || ctx->stridedIndex == 3) && (ctx->rawIntensity % ctx->workSize) != 0)
+	if((ctx->stridedIndex == 2 || ctx->stridedIndex == 3) && (ctx->rawIntensity % ctx->workSize) != 0)
 	{
 		size_t reduced_intensity = (ctx->rawIntensity / ctx->workSize) * ctx->workSize;
 		ctx->rawIntensity = reduced_intensity;
@@ -157,29 +155,29 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	}
 
 #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2)
-	const cl_queue_properties CommandQueueProperties[] = { 0, 0, 0 };
+	const cl_queue_properties CommandQueueProperties[] = {0, 0, 0};
 	ctx->CommandQueues = clCreateCommandQueueWithProperties(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret);
 #else
-	const cl_command_queue_properties CommandQueueProperties = { 0 };
+	const cl_command_queue_properties CommandQueueProperties = {0};
 	ctx->CommandQueues = clCreateCommandQueue(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret);
 #endif
 
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateCommandQueueWithProperties.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateCommandQueueWithProperties.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx->computeUnits), NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx);
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx);
 		return ERR_OCL_API;
 	}
 
 	ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 128, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -193,14 +191,14 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	ctx->ExtraBuffers[1] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -208,7 +206,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->ExtraBuffers[2] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -216,7 +214,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->ExtraBuffers[3] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -224,7 +222,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->ExtraBuffers[4] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -232,7 +230,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->ExtraBuffers[5] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -240,21 +238,21 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->OutputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * 0x100, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create output buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create output buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	std::vector<char> devNameVec(1024);
 	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(ret),ctx->deviceIdx );
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(ret), ctx->deviceIdx);
 		return ERR_OCL_API;
 	}
 
 	std::vector<char> openCLDriverVer(1024);
 	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx );
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret), ctx->deviceIdx);
 		return ERR_OCL_API;
 	}
 
@@ -342,11 +340,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		if(xmrstak::params::inst().AMDCache == false || !clBinFile.good())
 		{
 			if(xmrstak::params::inst().AMDCache)
-				printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str());
+				printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code %s not found. Compiling ...", ctx->deviceIdx, cache_file.c_str());
 			ctx->Program[miner_algo] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
 			if(ret != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
+				printer::inst()->print_msg(L1, "Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
 				return ERR_OCL_API;
 			}
 
@@ -354,11 +352,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			if(ret != CL_SUCCESS)
 			{
 				size_t len;
-				printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret));
+				printer::inst()->print_msg(L1, "Error %s when calling clBuildProgram.", err_to_str(ret));
 
 				if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
 				{
-					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
+					printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
 					return ERR_OCL_API;
 				}
 
@@ -368,28 +366,27 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 				if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
 				{
 					free(BuildLog);
-					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
+					printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
 					return ERR_OCL_API;
 				}
 
 				printer::inst()->print_str("Build log:\n");
-				std::cerr<<BuildLog<<std::endl;
+				std::cerr << BuildLog << std::endl;
 
 				free(BuildLog);
 				return ERR_OCL_API;
 			}
 
 			cl_uint num_devices;
-			clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL);
-
+			clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
 
 			std::vector<cl_device_id> devices_ids(num_devices);
-			clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL);
+			clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id) * devices_ids.size(), devices_ids.data(), NULL);
 			int dev_id = 0;
 			/* Search for the gpu within the program context.
 			 * The id can be different to  ctx->DeviceID.
 			 */
-			for(auto & ocl_device : devices_ids)
+			for(auto& ocl_device : devices_ids)
 			{
 				if(ocl_device == ctx->DeviceID)
 					break;
@@ -401,17 +398,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			{
 				if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
 				{
-					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
+					printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
 					return ERR_OCL_API;
 				}
 				port_sleep(1);
-			}
-			while(status == CL_BUILD_IN_PROGRESS);
+			} while(status == CL_BUILD_IN_PROGRESS);
 
 			if(xmrstak::params::inst().AMDCache)
 			{
 				std::vector<size_t> binary_sizes(num_devices);
-				clGetProgramInfo (ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
+				clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
 
 				std::vector<char*> all_programs(num_devices);
 				std::vector<std::vector<char>> program_storage;
@@ -419,7 +415,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 				int p_id = 0;
 				size_t mem_size = 0;
 				// create memory  structure to query all OpenCL program binaries
-				for(auto & p : all_programs)
+				for(auto& p : all_programs)
 				{
 					program_storage.emplace_back(std::vector<char>(binary_sizes[p_id]));
 					all_programs[p_id] = program_storage[p_id].data();
@@ -427,9 +423,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 					p_id++;
 				}
 
-				if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS)
+				if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(), NULL)) != CL_SUCCESS)
 				{
-					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret));
+					printer::inst()->print_msg(L1, "Error %s when calling clGetProgramInfo.", err_to_str(ret));
 					return ERR_OCL_API;
 				}
 
@@ -437,12 +433,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 				file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary);
 				file_stream.write(all_programs[dev_id], binary_sizes[dev_id]);
 				file_stream.close();
-				printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str());
+				printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s", ctx->deviceIdx, cache_file.c_str());
 			}
 		}
 		else
 		{
-			printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str());
+			printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s", ctx->deviceIdx, cache_file.c_str());
 			std::ostringstream ss;
 			ss << clBinFile.rdbuf();
 			std::string s = ss.str();
@@ -453,22 +449,21 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			cl_int clStatus;
 			ctx->Program[miner_algo] = clCreateProgramWithBinary(
 				opencl_ctx, 1, &ctx->DeviceID, &bin_size,
-				(const unsigned char **)&data_ptr, &clStatus, &ret
-			);
+				(const unsigned char**)&data_ptr, &clStatus, &ret);
 			if(ret != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str());
+				printer::inst()->print_msg(L1, "Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str());
 				return ERR_OCL_API;
 			}
 			ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, NULL, NULL, NULL);
 			if(ret != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str());
+				printer::inst()->print_msg(L1, "Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str());
 				return ERR_OCL_API;
 			}
 		}
 
-		std::vector<std::string> KernelNames = { "cn2", "Blake", "Groestl", "JH", "Skein" };
+		std::vector<std::string> KernelNames = {"cn2", "Blake", "Groestl", "JH", "Skein"};
 		if(miner_algo == cryptonight_gpu)
 		{
 			KernelNames.insert(KernelNames.begin(), "cn1_cn_gpu");
@@ -494,7 +489,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret);
 			if(ret != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str());
+				printer::inst()->print_msg(L1, "Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str());
 				return ERR_OCL_API;
 			}
 		}
@@ -508,30 +503,28 @@ const cl_platform_info attributeTypes[5] = {
 	CL_PLATFORM_VENDOR,
 	CL_PLATFORM_VERSION,
 	CL_PLATFORM_PROFILE,
-	CL_PLATFORM_EXTENSIONS
-};
+	CL_PLATFORM_EXTENSIONS};
 
 const char* const attributeNames[] = {
 	"CL_PLATFORM_NAME",
 	"CL_PLATFORM_VENDOR",
 	"CL_PLATFORM_VERSION",
 	"CL_PLATFORM_PROFILE",
-	"CL_PLATFORM_EXTENSIONS"
-};
+	"CL_PLATFORM_EXTENSIONS"};
 
-#define NELEMS(x)  (sizeof(x) / sizeof((x)[0]))
+#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))
 
 uint32_t getNumPlatforms()
 {
 	cl_uint num_platforms = 0;
-	cl_platform_id * platforms = NULL;
+	cl_platform_id* platforms = NULL;
 	cl_int clStatus;
 
 	// Get platform and device information
 	clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
 	if(clStatus != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for number of platforms.", err_to_str(clStatus));
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for number of platforms.", err_to_str(clStatus));
 		return 0u;
 	}
 
@@ -554,29 +547,29 @@ std::vector<GpuContext> getAMDDevices(int index)
 	platforms.resize(numPlatforms);
 	if((clStatus = clGetPlatformIDs(numPlatforms, platforms.data(), NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
 		return ctxVec;
 	}
 
-	if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS)
+	if((clStatus = clGetDeviceIDs(platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
 		return ctxVec;
 	}
 
 	device_list.resize(num_devices);
-	if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS)
+	if((clStatus = clGetDeviceIDs(platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
 		return ctxVec;
 	}
 
-	for (size_t k = 0; k < num_devices; k++)
+	for(size_t k = 0; k < num_devices; k++)
 	{
 		std::vector<char> devVendorVec(1024);
 		if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k);
+			printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k);
 			continue;
 		}
 
@@ -596,19 +589,19 @@ std::vector<GpuContext> getAMDDevices(int index)
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k);
+				printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(ctx.maxMemPerAlloc), NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
+				printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k);
+				printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
 
@@ -618,14 +611,14 @@ std::vector<GpuContext> getAMDDevices(int index)
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k);
+				printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
 
 			std::vector<char> openCLDriverVer(1024);
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k);
+				printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
 
@@ -636,7 +629,7 @@ std::vector<GpuContext> getAMDDevices(int index)
 			ctx.name = std::string(devNameVec.data());
 			ctx.DeviceID = device_list[k];
 			ctx.interleave = 40;
-			printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
+			printer::inst()->print_msg(L0, "Found OpenCL GPU %s.", ctx.name.c_str());
 			ctxVec.push_back(ctx);
 		}
 	}
@@ -651,13 +644,13 @@ int getAMDPlatformIdx()
 
 	if(numPlatforms == 0)
 	{
-		printer::inst()->print_msg(L0,"WARNING: No OpenCL platform found.");
+		printer::inst()->print_msg(L0, "WARNING: No OpenCL platform found.");
 		return -1;
 	}
-	cl_platform_id * platforms = NULL;
+	cl_platform_id* platforms = NULL;
 	cl_int clStatus;
 
-	platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms);
+	platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * numPlatforms);
 	clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL);
 
 	int platformIndex = -1;
@@ -666,7 +659,8 @@ int getAMDPlatformIdx()
 
 	if(clStatus == CL_SUCCESS)
 	{
-		for (int i = 0; i < numPlatforms; i++) {
+		for(int i = 0; i < numPlatforms; i++)
+		{
 			size_t infoSize;
 			clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 0, NULL, &infoSize);
 			std::vector<char> platformNameVec(infoSize);
@@ -675,13 +669,13 @@ int getAMDPlatformIdx()
 			std::string platformName(platformNameVec.data());
 
 			bool isAMDOpenCL = platformName.find("Advanced Micro Devices") != std::string::npos ||
-				platformName.find("Apple") != std::string::npos ||
-				platformName.find("Mesa") != std::string::npos;
+							   platformName.find("Apple") != std::string::npos ||
+							   platformName.find("Mesa") != std::string::npos;
 			bool isNVIDIADevice = platformName.find("NVIDIA Corporation") != std::string::npos || platformName.find("NVIDIA") != std::string::npos;
 			std::string selectedOpenCLVendor = xmrstak::params::inst().openCLVendor;
 			if((isAMDOpenCL && selectedOpenCLVendor == "AMD") || (isNVIDIADevice && selectedOpenCLVendor == "NVIDIA"))
 			{
-				printer::inst()->print_msg(L0,"Found %s platform index id = %i, name = %s", selectedOpenCLVendor.c_str(), i , platformName.c_str());
+				printer::inst()->print_msg(L0, "Found %s platform index id = %i, name = %s", selectedOpenCLVendor.c_str(), i, platformName.c_str());
 				if(platformName.find("Mesa") != std::string::npos)
 					mesaPlatform = i;
 				else
@@ -695,12 +689,12 @@ int getAMDPlatformIdx()
 		// fall back to Mesa OpenCL
 		if(platformIndex == -1 && mesaPlatform != -1)
 		{
-			printer::inst()->print_msg(L0,"No AMD platform found select Mesa as OpenCL platform");
+			printer::inst()->print_msg(L0, "No AMD platform found select Mesa as OpenCL platform");
 			platformIndex = mesaPlatform;
 		}
 	}
 	else
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
 
 	free(platforms);
 	return platformIndex;
@@ -716,15 +710,14 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	if((ret = clGetPlatformIDs(0, NULL, &entries)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for number of platforms.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clGetPlatformIDs for number of platforms.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-
 	// The number of platforms naturally is the index of the last platform plus one.
 	if(entries <= platform_idx)
 	{
-		printer::inst()->print_msg(L1,"Selected OpenCL platform index %d doesn't exist.", platform_idx);
+		printer::inst()->print_msg(L1, "Selected OpenCL platform index %zu doesn't exist.", platform_idx);
 		return ERR_STUPID_PARAMS;
 	}
 
@@ -736,7 +729,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 #endif
 	if((ret = clGetPlatformIDs(entries, PlatformIDList, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for platform ID information.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clGetPlatformIDs for platform ID information.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -747,12 +740,12 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	std::string platformName(platformNameVec.data());
 	if(xmrstak::params::inst().openCLVendor == "AMD" && platformName.find("Advanced Micro Devices") == std::string::npos)
 	{
-		printer::inst()->print_msg(L1,"WARNING: using non AMD device: %s", platformName.c_str());
+		printer::inst()->print_msg(L1, "WARNING: using non AMD device: %s", platformName.c_str());
 	}
 
 	if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &entries)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for number of devices.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clGetDeviceIDs for number of devices.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -761,7 +754,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	{
 		if(ctx[i].deviceIdx >= entries)
 		{
-			printer::inst()->print_msg(L1,"Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx);
+			printer::inst()->print_msg(L1, "Selected OpenCL device index %zu doesn't exist.\n", ctx[i].deviceIdx);
 			return ERR_STUPID_PARAMS;
 		}
 	}
@@ -773,7 +766,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 #endif
 	if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, entries, DeviceIDList, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for device ID information.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clGetDeviceIDs for device ID information.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -790,41 +783,41 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	cl_context opencl_ctx = clCreateContext(NULL, num_gpus, TempDeviceList.data(), NULL, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateContext.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	const char *fastIntMathV2CL =
-			#include "./opencl/fast_int_math_v2.cl"
-	;
-    const char *fastDivHeavyCL =
-        #include "./opencl/fast_div_heavy.cl"
-    ;
-	const char *cryptonightCL =
-			#include "./opencl/cryptonight.cl"
-	;
-	const char *blake256CL =
-			#include "./opencl/blake256.cl"
-	;
-	const char *groestl256CL =
-			#include "./opencl/groestl256.cl"
-	;
-	const char *jhCL =
-			#include "./opencl/jh.cl"
-	;
-	const char *wolfAesCL =
-			#include "./opencl/wolf-aes.cl"
-	;
-	const char *wolfSkeinCL =
-			#include "./opencl/wolf-skein.cl"
-	;
-	const char *cryptonight_gpu =
-			#include "./opencl/cryptonight_gpu.cl"
-	;
+	const char* fastIntMathV2CL =
+#include "./opencl/fast_int_math_v2.cl"
+		;
+	const char* fastDivHeavyCL =
+#include "./opencl/fast_div_heavy.cl"
+		;
+	const char* cryptonightCL =
+#include "./opencl/cryptonight.cl"
+		;
+	const char* blake256CL =
+#include "./opencl/blake256.cl"
+		;
+	const char* groestl256CL =
+#include "./opencl/groestl256.cl"
+		;
+	const char* jhCL =
+#include "./opencl/jh.cl"
+		;
+	const char* wolfAesCL =
+#include "./opencl/wolf-aes.cl"
+		;
+	const char* wolfSkeinCL =
+#include "./opencl/wolf-skein.cl"
+		;
+	const char* cryptonight_gpu =
+#include "./opencl/cryptonight_gpu.cl"
+		;
 
 	std::string source_code(cryptonightCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
-    source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
+	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
@@ -840,7 +833,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	for(int i = 0; i < num_gpus; ++i)
 	{
-		printer::inst()->print_msg(LDEBUG,"OpenCL Init device %d", ctx[i].deviceIdx);
+		printer::inst()->print_msg(LDEBUG, "OpenCL Init device %zu", ctx[i].deviceIdx);
 		const size_t devIdx = ctx[i].deviceIdx;
 		if(interleaveData.size() <= devIdx)
 		{
@@ -850,12 +843,11 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 		{
 			interleaveData[devIdx].reset(new InterleaveData{});
 			interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms();
-
 		}
-		ctx[i].idWorkerOnDevice=interleaveData[devIdx]->numThreadsOnGPU;
+		ctx[i].idWorkerOnDevice = interleaveData[devIdx]->numThreadsOnGPU;
 		++interleaveData[devIdx]->numThreadsOnGPU;
 		ctx[i].interleaveData = interleaveData[devIdx];
-		ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave)/100.0;
+		ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave) / 100.0;
 		ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold;
 		ctx[i].opencl_ctx = opencl_ctx;
 
@@ -871,7 +863,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, const xmrstak_algo& miner_algo, uint64_t height)
 {
 
-	auto & Kernels = ctx->Kernels[miner_algo.Id()];
+	auto& Kernels = ctx->Kernels[miner_algo.Id()];
 
 	cl_int ret;
 
@@ -885,35 +877,35 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 
 	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 128, input, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	if((ret = clSetKernelArg(Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Scratchpads
 	if((ret = clSetKernelArg(Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
 	if((ret = clSetKernelArg(Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
 	if((ret = clSetKernelArg(Kernels[0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
-		return(ERR_OCL_API);
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
+		return (ERR_OCL_API);
 	}
 
 	if(miner_algo == cryptonight_gpu)
@@ -922,80 +914,88 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 		// Scratchpads
 		if((ret = clSetKernelArg(Kernels[7], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 7, argument 0.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
 		// States
 		if((ret = clSetKernelArg(Kernels[7], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 7, argument 1.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 	}
 
-    // CN1 Kernel
+	// CN1 Kernel
 
-    if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) {
+	if((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow))
+	{
 
-		uint32_t PRECOMPILATION_DEPTH = 4;
+		uint32_t PRECOMPILATION_DEPTH = 1;
+		constexpr uint64_t height_chunk_size = 25;
+		uint64_t height_offset = (height / height_chunk_size) * height_chunk_size;
 
-        // Get new kernel
-        cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height, PRECOMPILATION_DEPTH);
+		// Get new kernel
+		cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset, height_chunk_size, PRECOMPILATION_DEPTH);
 
-        if (program != ctx->ProgramCryptonightR) {
-            cl_int ret;
-            cl_kernel kernel = clCreateKernel(program, "cn1_cryptonight_r", &ret);
+		if(program != ctx->ProgramCryptonightR || ctx->last_block_height != height)
+		{
+			cl_int ret;
+			std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height);
+			cl_kernel kernel = clCreateKernel(program, kernel_name.c_str(), &ret);
 
-            if (ret != CL_SUCCESS) {
-                printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret));
-            }
-            else
+			if(ret != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret));
+			}
+			else
 			{
-                cl_kernel old_kernel = Kernels[1];
+				cl_kernel old_kernel = Kernels[1];
 				if(old_kernel)
 					clReleaseKernel(old_kernel);
-                Kernels[1] = kernel;
-            }
-            ctx->ProgramCryptonightR = program;
+				Kernels[1] = kernel;
+			}
+			ctx->ProgramCryptonightR = program;
+			ctx->last_block_height = height;
+			printer::inst()->print_msg(LDEBUG, "Set height %llu", static_cast<unsigned long long>(height));
 
-            // Precompile next program in background
-            for (int i = 1; i <= PRECOMPILATION_DEPTH; ++i)
-                xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, PRECOMPILATION_DEPTH, true);
+			// Precompile next program in background
+			for(int i = 1; i <= PRECOMPILATION_DEPTH; ++i)
+				xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset + i * height_chunk_size, height_chunk_size, PRECOMPILATION_DEPTH, true);
 
-            printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx);
-        }
+			printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx);
+		}
 		else
 		{
 			printer::inst()->print_msg(LDEBUG, "Thread #%zu found CryptonightR", ctx->deviceIdx);
 		}
-    }
+	}
 
 	// Scratchpads
 	if((ret = clSetKernelArg(Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
 	if((ret = clSetKernelArg(Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
 	if((ret = clSetKernelArg(Kernels[1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
-		return(ERR_OCL_API);
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
+		return (ERR_OCL_API);
 	}
 
 	if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon || miner_algo == cryptonight_ipbc || miner_algo == cryptonight_stellite || miner_algo == cryptonight_masari || miner_algo == cryptonight_bittube2)
 	{
 		// Input
-		if ((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 4(input buffer).", err_to_str(ret));
 			return ERR_OCL_API;
@@ -1006,14 +1006,14 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	// Scratchpads
 	if((ret = clSetKernelArg(Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
 	if((ret = clSetKernelArg(Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -1022,59 +1022,59 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 		// Output
 		if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2);
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2);
 			return ERR_OCL_API;
 		}
 
 		// Target
 		if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3);
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3);
 			return ERR_OCL_API;
 		}
 
 		// Threads
 		if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
-			return(ERR_OCL_API);
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
+			return (ERR_OCL_API);
 		}
 	}
 	else
-		{
+	{
 		// Branch 0
 		if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
 		// Branch 1
 		if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
 		// Branch 2
 		if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
 		// Branch 3
 		if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
 		// Threads
 		if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
-			return(ERR_OCL_API);
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
+			return (ERR_OCL_API);
 		}
 
 		for(int i = 0; i < 4; ++i)
@@ -1082,35 +1082,35 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 			// States
 			if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0);
+				printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0);
 				return ERR_OCL_API;
 			}
 
 			// Nonce buffer
 			if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1);
+				printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1);
 				return ERR_OCL_API;
 			}
 
 			// Output
 			if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2);
+				printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2);
 				return ERR_OCL_API;
 			}
 
 			// Target
 			if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
+				printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
 				return ERR_OCL_API;
 			}
 
 			if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
-				return(ERR_OCL_API);
+				printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
+				return (ERR_OCL_API);
 			}
 		}
 	}
@@ -1134,7 +1134,7 @@ uint64_t updateTimings(GpuContext* ctx, const uint64_t t)
 		if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0)
 			ctx->interleaveData->avgKernelRuntime = runtime;
 		else
-			ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime) * averagingBias;
+			ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime)*averagingBias;
 	}
 	return runtime;
 }
@@ -1163,7 +1163,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
 
 		if((dt > 0) && (dt < optimalTimeOffset))
 		{
-			delay = static_cast<int64_t>((optimalTimeOffset  - dt));
+			delay = static_cast<int64_t>((optimalTimeOffset - dt));
 
 			if(enableAutoAdjustment)
 			{
@@ -1182,8 +1182,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
 			// avoid that the auto adjustment is disable interleaving
 			ctx->interleaveData->adjustThreshold = std::max(
 				ctx->interleaveData->adjustThreshold,
-				0.001
-			);
+				0.001);
 		}
 		delay = std::max(int64_t(0), delay);
 
@@ -1194,13 +1193,12 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
 		{
 			// do not notify the user anymore if we reach a good delay
 			if(delay > maxDelay)
-				printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf",
+				printer::inst()->print_msg(L1, "OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf",
 					ctx->deviceIdx,
 					ctx->idWorkerOnDevice,
 					static_cast<uint32_t>(delay),
 					avgRuntime,
-					ctx->interleaveData->adjustThreshold * 100.
-				);
+					ctx->interleaveData->adjustThreshold * 100.);
 
 			std::this_thread::sleep_for(std::chrono::milliseconds(delay));
 		}
@@ -1211,12 +1209,12 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
 
 size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner_algo)
 {
-	const auto & Kernels = ctx->Kernels[miner_algo.Id()];
+	const auto& Kernels = ctx->Kernels[miner_algo.Id()];
 
 	cl_int ret;
 	cl_uint zero = 0;
 	size_t BranchNonces[4];
-	memset(BranchNonces,0,sizeof(size_t)*4);
+	memset(BranchNonces, 0, sizeof(size_t) * 4);
 
 	size_t g_intensity = ctx->rawIntensity;
 	size_t w_size = ctx->workSize;
@@ -1227,28 +1225,28 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner
 		// round up to next multiple of w_size
 		g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
 		// number of global threads must be a multiple of the work group size (w_size)
-		assert(g_thd%w_size == 0);
+		assert(g_thd % w_size == 0);
 	}
 
 	for(int i = 2; i < 6; ++i)
 	{
 		if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->ExtraBuffers[i], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to zero branch buffer counter %d.", err_to_str(ret), i - 2);
+			printer::inst()->print_msg(L1, "Error %s when calling clEnqueueWriteBuffer to zero branch buffer counter %d.", err_to_str(ret), i - 2);
 			return ERR_OCL_API;
 		}
 	}
 
 	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, sizeof(cl_uint) * 0xFF, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fetch results.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clEnqueueWriteBuffer to fetch results.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 };
+	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = {g_thd, 8}, lthreads[2] = {8, 8};
 	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0);
+		printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0);
 		return ERR_OCL_API;
 	}
 
@@ -1260,7 +1258,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner
 		size_t intens = g_intensity * thd;
 		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[7], 1, 0, &intens, &thd, 0, NULL, NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7);
+			printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7);
 			return ERR_OCL_API;
 		}
 
@@ -1269,7 +1267,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner
 
 		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, 0, &g_thd_cn_gpu, &w_size_cn_gpu, 0, NULL, NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
+			printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
 			return ERR_OCL_API;
 		}
 	}
@@ -1277,25 +1275,25 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner
 	{
 		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
+			printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
 			return ERR_OCL_API;
 		}
 	}
 
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
+	size_t NonceT[2] = {0, ctx->Nonce}, gthreadsT[2] = {8, g_thd}, lthreadsT[2] = {8, w_size};
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, NonceT, gthreadsT, lthreadsT, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2);
+		printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2);
 		return ERR_OCL_API;
 	}
 
 	if(miner_algo != cryptonight_gpu)
 	{
 		for(int i = 0; i < 4; ++i)
 		{
-			size_t tmpNonce = ctx->Nonce;
 			if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
+				printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
 				return ERR_OCL_API;
 			}
 		}
@@ -1304,11 +1302,11 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner
 	// this call is blocking therefore the access to the results without cl_finish is fine
 	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	auto & numHashValues = HashOutput[0xFF];
+	auto& numHashValues = HashOutput[0xFF];
 	// avoid out of memory read, we have only storage for 0xFF results
 	if(numHashValues > 0xFF)
 		numHashValues = 0xFF;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index ae2b506db..1ba300c7a 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "xmrstak/misc/console.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/console.hpp"
 
 #if defined(__APPLE__)
 #include <OpenCL/cl.h>
@@ -9,13 +9,13 @@
 #include <CL/cl.h>
 #endif
 
+#include <array>
+#include <map>
+#include <memory>
+#include <mutex>
 #include <stdint.h>
 #include <string>
 #include <vector>
-#include <mutex>
-#include <memory>
-#include <map>
-#include <array>
 
 #define ERR_SUCCESS (0)
 #define ERR_OCL_API (2)
@@ -23,13 +23,13 @@
 
 struct InterleaveData
 {
-    std::mutex mutex;
+	std::mutex mutex;
 
-    double adjustThreshold = 0.4;
-    double startAdjustThreshold = 0.4;
-    double avgKernelRuntime = 0.0;
-    uint64_t lastRunTimeStamp = 0;
-    uint32_t numThreadsOnGPU = 0;
+	double adjustThreshold = 0.4;
+	double startAdjustThreshold = 0.4;
+	double avgKernelRuntime = 0.0;
+	uint64_t lastRunTimeStamp = 0;
+	uint32_t numThreadsOnGPU = 0;
 };
 
 struct GpuContext
@@ -54,8 +54,9 @@ struct GpuContext
 	cl_mem ExtraBuffers[6];
 	cl_context opencl_ctx = nullptr;
 	std::map<xmrstak_algo_id, cl_program> Program;
-	std::map<xmrstak_algo_id, std::array<cl_kernel,8>> Kernels;
+	std::map<xmrstak_algo_id, std::array<cl_kernel, 8>> Kernels;
 	cl_program ProgramCryptonightR = nullptr;
+	uint64_t last_block_height = 0u;
 	size_t freeMem;
 	size_t maxMemPerAlloc;
 	int computeUnits;
@@ -66,148 +67,147 @@ struct GpuContext
 	uint64_t lastDelay = 0;
 
 	uint32_t Nonce;
-
 };
 
 namespace
 {
-	const char* err_to_str(cl_int ret)
+const char* err_to_str(cl_int ret)
+{
+	switch(ret)
 	{
-		switch(ret)
-		{
-		case CL_SUCCESS:
-			return "CL_SUCCESS";
-		case CL_DEVICE_NOT_FOUND:
-			return "CL_DEVICE_NOT_FOUND";
-		case CL_DEVICE_NOT_AVAILABLE:
-			return "CL_DEVICE_NOT_AVAILABLE";
-		case CL_COMPILER_NOT_AVAILABLE:
-			return "CL_COMPILER_NOT_AVAILABLE";
-		case CL_MEM_OBJECT_ALLOCATION_FAILURE:
-			return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
-		case CL_OUT_OF_RESOURCES:
-			return "CL_OUT_OF_RESOURCES";
-		case CL_OUT_OF_HOST_MEMORY:
-			return "CL_OUT_OF_HOST_MEMORY";
-		case CL_PROFILING_INFO_NOT_AVAILABLE:
-			return "CL_PROFILING_INFO_NOT_AVAILABLE";
-		case CL_MEM_COPY_OVERLAP:
-			return "CL_MEM_COPY_OVERLAP";
-		case CL_IMAGE_FORMAT_MISMATCH:
-			return "CL_IMAGE_FORMAT_MISMATCH";
-		case CL_IMAGE_FORMAT_NOT_SUPPORTED:
-			return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
-		case CL_BUILD_PROGRAM_FAILURE:
-			return "CL_BUILD_PROGRAM_FAILURE";
-		case CL_MAP_FAILURE:
-			return "CL_MAP_FAILURE";
-		case CL_MISALIGNED_SUB_BUFFER_OFFSET:
-			return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
-		case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
-			return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
-	#ifdef CL_VERSION_1_2
-		case CL_COMPILE_PROGRAM_FAILURE:
-			return "CL_COMPILE_PROGRAM_FAILURE";
-		case CL_LINKER_NOT_AVAILABLE:
-			return "CL_LINKER_NOT_AVAILABLE";
-		case CL_LINK_PROGRAM_FAILURE:
-			return "CL_LINK_PROGRAM_FAILURE";
-		case CL_DEVICE_PARTITION_FAILED:
-			return "CL_DEVICE_PARTITION_FAILED";
-		case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
-			return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
-	#endif
-		case CL_INVALID_VALUE:
-			return "CL_INVALID_VALUE";
-		case CL_INVALID_DEVICE_TYPE:
-			return "CL_INVALID_DEVICE_TYPE";
-		case CL_INVALID_PLATFORM:
-			return "CL_INVALID_PLATFORM";
-		case CL_INVALID_DEVICE:
-			return "CL_INVALID_DEVICE";
-		case CL_INVALID_CONTEXT:
-			return "CL_INVALID_CONTEXT";
-		case CL_INVALID_QUEUE_PROPERTIES:
-			return "CL_INVALID_QUEUE_PROPERTIES";
-		case CL_INVALID_COMMAND_QUEUE:
-			return "CL_INVALID_COMMAND_QUEUE";
-		case CL_INVALID_HOST_PTR:
-			return "CL_INVALID_HOST_PTR";
-		case CL_INVALID_MEM_OBJECT:
-			return "CL_INVALID_MEM_OBJECT";
-		case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-			return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
-		case CL_INVALID_IMAGE_SIZE:
-			return "CL_INVALID_IMAGE_SIZE";
-		case CL_INVALID_SAMPLER:
-			return "CL_INVALID_SAMPLER";
-		case CL_INVALID_BINARY:
-			return "CL_INVALID_BINARY";
-		case CL_INVALID_BUILD_OPTIONS:
-			return "CL_INVALID_BUILD_OPTIONS";
-		case CL_INVALID_PROGRAM:
-			return "CL_INVALID_PROGRAM";
-		case CL_INVALID_PROGRAM_EXECUTABLE:
-			return "CL_INVALID_PROGRAM_EXECUTABLE";
-		case CL_INVALID_KERNEL_NAME:
-			return "CL_INVALID_KERNEL_NAME";
-		case CL_INVALID_KERNEL_DEFINITION:
-			return "CL_INVALID_KERNEL_DEFINITION";
-		case CL_INVALID_KERNEL:
-			return "CL_INVALID_KERNEL";
-		case CL_INVALID_ARG_INDEX:
-			return "CL_INVALID_ARG_INDEX";
-		case CL_INVALID_ARG_VALUE:
-			return "CL_INVALID_ARG_VALUE";
-		case CL_INVALID_ARG_SIZE:
-			return "CL_INVALID_ARG_SIZE";
-		case CL_INVALID_KERNEL_ARGS:
-			return "CL_INVALID_KERNEL_ARGS";
-		case CL_INVALID_WORK_DIMENSION:
-			return "CL_INVALID_WORK_DIMENSION";
-		case CL_INVALID_WORK_GROUP_SIZE:
-			return "CL_INVALID_WORK_GROUP_SIZE";
-		case CL_INVALID_WORK_ITEM_SIZE:
-			return "CL_INVALID_WORK_ITEM_SIZE";
-		case CL_INVALID_GLOBAL_OFFSET:
-			return "CL_INVALID_GLOBAL_OFFSET";
-		case CL_INVALID_EVENT_WAIT_LIST:
-			return "CL_INVALID_EVENT_WAIT_LIST";
-		case CL_INVALID_EVENT:
-			return "CL_INVALID_EVENT";
-		case CL_INVALID_OPERATION:
-			return "CL_INVALID_OPERATION";
-		case CL_INVALID_GL_OBJECT:
-			return "CL_INVALID_GL_OBJECT";
-		case CL_INVALID_BUFFER_SIZE:
-			return "CL_INVALID_BUFFER_SIZE";
-		case CL_INVALID_MIP_LEVEL:
-			return "CL_INVALID_MIP_LEVEL";
-		case CL_INVALID_GLOBAL_WORK_SIZE:
-			return "CL_INVALID_GLOBAL_WORK_SIZE";
-		case CL_INVALID_PROPERTY:
-			return "CL_INVALID_PROPERTY";
-	#ifdef CL_VERSION_1_2
-		case CL_INVALID_IMAGE_DESCRIPTOR:
-			return "CL_INVALID_IMAGE_DESCRIPTOR";
-		case CL_INVALID_COMPILER_OPTIONS:
-			return "CL_INVALID_COMPILER_OPTIONS";
-		case CL_INVALID_LINKER_OPTIONS:
-			return "CL_INVALID_LINKER_OPTIONS";
-		case CL_INVALID_DEVICE_PARTITION_COUNT:
-			return "CL_INVALID_DEVICE_PARTITION_COUNT";
-	#endif
-	#if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2)
-		case CL_INVALID_PIPE_SIZE:
-			return "CL_INVALID_PIPE_SIZE";
-		case CL_INVALID_DEVICE_QUEUE:
-			return "CL_INVALID_DEVICE_QUEUE";
-	#endif
-		default:
-			return "UNKNOWN_ERROR";
-		}
+	case CL_SUCCESS:
+		return "CL_SUCCESS";
+	case CL_DEVICE_NOT_FOUND:
+		return "CL_DEVICE_NOT_FOUND";
+	case CL_DEVICE_NOT_AVAILABLE:
+		return "CL_DEVICE_NOT_AVAILABLE";
+	case CL_COMPILER_NOT_AVAILABLE:
+		return "CL_COMPILER_NOT_AVAILABLE";
+	case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+		return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+	case CL_OUT_OF_RESOURCES:
+		return "CL_OUT_OF_RESOURCES";
+	case CL_OUT_OF_HOST_MEMORY:
+		return "CL_OUT_OF_HOST_MEMORY";
+	case CL_PROFILING_INFO_NOT_AVAILABLE:
+		return "CL_PROFILING_INFO_NOT_AVAILABLE";
+	case CL_MEM_COPY_OVERLAP:
+		return "CL_MEM_COPY_OVERLAP";
+	case CL_IMAGE_FORMAT_MISMATCH:
+		return "CL_IMAGE_FORMAT_MISMATCH";
+	case CL_IMAGE_FORMAT_NOT_SUPPORTED:
+		return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+	case CL_BUILD_PROGRAM_FAILURE:
+		return "CL_BUILD_PROGRAM_FAILURE";
+	case CL_MAP_FAILURE:
+		return "CL_MAP_FAILURE";
+	case CL_MISALIGNED_SUB_BUFFER_OFFSET:
+		return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
+	case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
+		return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
+#ifdef CL_VERSION_1_2
+	case CL_COMPILE_PROGRAM_FAILURE:
+		return "CL_COMPILE_PROGRAM_FAILURE";
+	case CL_LINKER_NOT_AVAILABLE:
+		return "CL_LINKER_NOT_AVAILABLE";
+	case CL_LINK_PROGRAM_FAILURE:
+		return "CL_LINK_PROGRAM_FAILURE";
+	case CL_DEVICE_PARTITION_FAILED:
+		return "CL_DEVICE_PARTITION_FAILED";
+	case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
+		return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
+#endif
+	case CL_INVALID_VALUE:
+		return "CL_INVALID_VALUE";
+	case CL_INVALID_DEVICE_TYPE:
+		return "CL_INVALID_DEVICE_TYPE";
+	case CL_INVALID_PLATFORM:
+		return "CL_INVALID_PLATFORM";
+	case CL_INVALID_DEVICE:
+		return "CL_INVALID_DEVICE";
+	case CL_INVALID_CONTEXT:
+		return "CL_INVALID_CONTEXT";
+	case CL_INVALID_QUEUE_PROPERTIES:
+		return "CL_INVALID_QUEUE_PROPERTIES";
+	case CL_INVALID_COMMAND_QUEUE:
+		return "CL_INVALID_COMMAND_QUEUE";
+	case CL_INVALID_HOST_PTR:
+		return "CL_INVALID_HOST_PTR";
+	case CL_INVALID_MEM_OBJECT:
+		return "CL_INVALID_MEM_OBJECT";
+	case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+		return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+	case CL_INVALID_IMAGE_SIZE:
+		return "CL_INVALID_IMAGE_SIZE";
+	case CL_INVALID_SAMPLER:
+		return "CL_INVALID_SAMPLER";
+	case CL_INVALID_BINARY:
+		return "CL_INVALID_BINARY";
+	case CL_INVALID_BUILD_OPTIONS:
+		return "CL_INVALID_BUILD_OPTIONS";
+	case CL_INVALID_PROGRAM:
+		return "CL_INVALID_PROGRAM";
+	case CL_INVALID_PROGRAM_EXECUTABLE:
+		return "CL_INVALID_PROGRAM_EXECUTABLE";
+	case CL_INVALID_KERNEL_NAME:
+		return "CL_INVALID_KERNEL_NAME";
+	case CL_INVALID_KERNEL_DEFINITION:
+		return "CL_INVALID_KERNEL_DEFINITION";
+	case CL_INVALID_KERNEL:
+		return "CL_INVALID_KERNEL";
+	case CL_INVALID_ARG_INDEX:
+		return "CL_INVALID_ARG_INDEX";
+	case CL_INVALID_ARG_VALUE:
+		return "CL_INVALID_ARG_VALUE";
+	case CL_INVALID_ARG_SIZE:
+		return "CL_INVALID_ARG_SIZE";
+	case CL_INVALID_KERNEL_ARGS:
+		return "CL_INVALID_KERNEL_ARGS";
+	case CL_INVALID_WORK_DIMENSION:
+		return "CL_INVALID_WORK_DIMENSION";
+	case CL_INVALID_WORK_GROUP_SIZE:
+		return "CL_INVALID_WORK_GROUP_SIZE";
+	case CL_INVALID_WORK_ITEM_SIZE:
+		return "CL_INVALID_WORK_ITEM_SIZE";
+	case CL_INVALID_GLOBAL_OFFSET:
+		return "CL_INVALID_GLOBAL_OFFSET";
+	case CL_INVALID_EVENT_WAIT_LIST:
+		return "CL_INVALID_EVENT_WAIT_LIST";
+	case CL_INVALID_EVENT:
+		return "CL_INVALID_EVENT";
+	case CL_INVALID_OPERATION:
+		return "CL_INVALID_OPERATION";
+	case CL_INVALID_GL_OBJECT:
+		return "CL_INVALID_GL_OBJECT";
+	case CL_INVALID_BUFFER_SIZE:
+		return "CL_INVALID_BUFFER_SIZE";
+	case CL_INVALID_MIP_LEVEL:
+		return "CL_INVALID_MIP_LEVEL";
+	case CL_INVALID_GLOBAL_WORK_SIZE:
+		return "CL_INVALID_GLOBAL_WORK_SIZE";
+	case CL_INVALID_PROPERTY:
+		return "CL_INVALID_PROPERTY";
+#ifdef CL_VERSION_1_2
+	case CL_INVALID_IMAGE_DESCRIPTOR:
+		return "CL_INVALID_IMAGE_DESCRIPTOR";
+	case CL_INVALID_COMPILER_OPTIONS:
+		return "CL_INVALID_COMPILER_OPTIONS";
+	case CL_INVALID_LINKER_OPTIONS:
+		return "CL_INVALID_LINKER_OPTIONS";
+	case CL_INVALID_DEVICE_PARTITION_COUNT:
+		return "CL_INVALID_DEVICE_PARTITION_COUNT";
+#endif
+#if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2)
+	case CL_INVALID_PIPE_SIZE:
+		return "CL_INVALID_PIPE_SIZE";
+	case CL_INVALID_DEVICE_QUEUE:
+		return "CL_INVALID_DEVICE_QUEUE";
+#endif
+	default:
+		return "UNKNOWN_ERROR";
 	}
 }
+} // namespace
 
 uint32_t getNumPlatforms();
 int getAMDPlatformIdx();
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 12478aefb..d17b79215 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -198,7 +198,7 @@ inline void keccakf1600_1(ulong st[25])
 	}
 }
 )==="
-R"===(
+	R"===(
 
 void keccakf1600_2(__local ulong *st)
 {
@@ -372,7 +372,7 @@ inline int4 _mm_alignr_epi8(int4 a, const uint rot)
 #endif
 
 )==="
-R"===(
+	R"===(
 
 void CNKeccak(ulong *output, ulong *input)
 {
@@ -416,7 +416,7 @@ void AESExpandKey256(uint *keybuf)
 }
 
 )==="
-R"===(
+	R"===(
 
 #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
 
@@ -577,7 +577,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 }
 
 )==="
-R"===(
+	R"===(
 
 // __NV_CL_C_VERSION checks if NVIDIA opencl is used
 #if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION))
@@ -868,8 +868,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 )==="
 R"===(
+#if defined(__clang__)
+#	if __has_builtin(__builtin_amdgcn_ds_bpermute)
+#		define HAS_AMD_BPERMUTE  1
+#	endif
+#endif
 
-__attribute__((reqd_work_group_size(8, 8, 1)))
+__attribute__((reqd_work_group_size(8, WORKSIZE, 1)))
 __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states,
 
 #if (ALGO == cryptonight_gpu)
@@ -878,88 +883,123 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	__global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads)
 #endif
 {
-	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
-	uint ExpandedKey2[40];
-	uint4 text;
+    __local uint AES0[256], AES1[256], AES2[256], AES3[256];
+    uint ExpandedKey2[40];
+    uint4 text;
 
-	const uint gIdx = getIdx();
+    uint gIdx = get_global_id(1) - get_global_offset(1);
+    uint groupIdx = get_local_id(1);
+    uint lIdx = get_local_id(0);
 
-	for (int i = get_local_id(1) * 8 + get_local_id(0); i < 256; i += 8 * 8) {
-		const uint tmp = AES0_C[i];
-		AES0[i] = tmp;
-		AES1[i] = rotate(tmp, 8U);
-		AES2[i] = rotate(tmp, 16U);
-		AES3[i] = rotate(tmp, 24U);
-	}
+    for (int i = groupIdx * 8 + lIdx; i < 256; i += get_local_size(0) * get_local_size(1)) {
+        const uint tmp = AES0_C[i];
+        AES0[i] = tmp;
+        AES1[i] = rotate(tmp, 8U);
+        AES2[i] = rotate(tmp, 16U);
+        AES3[i] = rotate(tmp, 24U);
+    }
 
-	barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
 
 #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2  || ALGO == cryptonight_superfast)
-	__local uint4 xin1[8][8];
-	__local uint4 xin2[8][8];
+    __local uint4 xin1[WORKSIZE][8];
+    __local uint4 xin2[WORKSIZE][8];
 #endif
 
 #if(COMP_MODE==1)
-	// do not use early return here
-	if(gIdx < Threads)
+    // do not use early return here
+    if(gIdx < Threads)
 #endif
-	{
-		states += 25 * gIdx;
+    {
+        states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (MEMORY >> 4);
+        Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
-		Scratchpad += gIdx;
+                Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
-		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
+        Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
 #elif(STRIDED_INDEX==3)
-		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
+                Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
 #endif
 
-		#if defined(__Tahiti__) || defined(__Pitcairn__)
+        #if defined(__Tahiti__) || defined(__Pitcairn__)
 
-		for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4];
-		text = vload4(get_local_id(1) + 4, (__global uint *)states);
+        for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4];
+        text = vload4(lIdx + 4, (__global uint *)states);
 
-		#else
+        #else
+        text = vload4(lIdx + 4, (__global uint *)states);
+        ((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states);
 
-		text = vload4(get_local_id(1) + 4, (__global uint *)states);
-		((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states);
+        #endif
 
-		#endif
+        AESExpandKey256(ExpandedKey2);
+    }
 
-		AESExpandKey256(ExpandedKey2);
-	}
-
-	barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
 
 #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
-	__local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)];
-	__local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)];
-	__local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)];
-	__local uint4* xin2_load = &xin2[(get_local_id(1) + 1) % 8][get_local_id(0)];
-	*xin2_store = (uint4)(0, 0, 0, 0);
+#	if (HAS_AMD_BPERMUTE == 1)
+	int lane = (groupIdx * 8 + ((lIdx + 1) % 8)) << 2;
+	uint4 tmp = (uint4)(0, 0, 0, 0);
+#	else
+    __local uint4* xin1_store = &xin1[groupIdx][lIdx];
+    __local uint4* xin1_load = &xin1[groupIdx][(lIdx + 1) % 8];
+    __local uint4* xin2_store = &xin2[groupIdx][lIdx];
+    __local uint4* xin2_load = &xin2[groupIdx][(lIdx + 1) % 8];
+    *xin2_store = (uint4)(0, 0, 0, 0);
+#	endif
 #endif
 
 #if(COMP_MODE == 1)
-	// do not use early return here
-	if (gIdx < Threads)
+    // do not use early return here
+    if (gIdx < Threads)
 #endif
-	{
+    {
 
 #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
+
+#	if	(HAS_AMD_BPERMUTE == 1)
+        #pragma unroll 2
+        for(int i = 0, i1 = lIdx; i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4))
+        {
+            text ^= Scratchpad[IDX((uint)i1)];
+			text ^= tmp;
+
+            #pragma unroll 10
+            for(int j = 0; j < 10; ++j)
+                text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+
+            text.s0 ^= __builtin_amdgcn_ds_bpermute(lane, text.s0);
+            text.s1 ^= __builtin_amdgcn_ds_bpermute(lane, text.s1);
+            text.s2 ^= __builtin_amdgcn_ds_bpermute(lane, text.s2);
+            text.s3 ^= __builtin_amdgcn_ds_bpermute(lane, text.s3);
+			//__builtin_amdgcn_s_waitcnt(0);
+            text ^= Scratchpad[IDX((uint)i1 + 8u)];
+
+            #pragma unroll 10
+            for(int j = 0; j < 10; ++j)
+                text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+            tmp.s0 = __builtin_amdgcn_ds_bpermute(lane, text.s0);
+            tmp.s1 = __builtin_amdgcn_ds_bpermute(lane, text.s1);
+            tmp.s2 = __builtin_amdgcn_ds_bpermute(lane, text.s2);
+            tmp.s3 = __builtin_amdgcn_ds_bpermute(lane, text.s3);
+			//__builtin_amdgcn_s_waitcnt(0);
+        }
+
+        text ^= tmp;
+#	else
+
 		#pragma unroll 2
-		for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4))
+		for(int i = 0, i1 = lIdx; i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4))
 		{
 			text ^= Scratchpad[IDX((uint)i1)];
 			barrier(CLK_LOCAL_MEM_FENCE);
 			text ^= *xin2_load;
-
 			#pragma unroll 10
 			for(int j = 0; j < 10; ++j)
 			    text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
-
 			*xin1_store = text;
-
 			text ^= Scratchpad[IDX((uint)i1 + 8u)];
 			barrier(CLK_LOCAL_MEM_FENCE);
 			text ^= *xin1_load;
@@ -971,87 +1011,96 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			*xin2_store = text;
 		}
 
-		barrier(CLK_LOCAL_MEM_FENCE);
-		text ^= *xin2_load;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        text ^= *xin2_load;
+#	endif
 
 #else
-		#pragma unroll 2
-		for (int i = 0; i < (MEMORY >> 7); ++i) {
-			text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))];
-
-			#pragma unroll 10
-			for(int j = 0; j < 10; ++j)
-			    text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
-		}
+        #pragma unroll 2
+        for (int i = 0; i < (MEMORY >> 7); ++i)
+        {
+            text ^= Scratchpad[IDX((uint)((i << 3) + lIdx))];
+
+            #pragma unroll 10
+            for(int j = 0; j < 10; ++j)
+                text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+        }
 #endif
-	}
+    }
 
 #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
-	/* Also left over threads performe this loop.
-	 * The left over thread results will be ignored
-	 */
-	#pragma unroll 16
-	for(size_t i = 0; i < 16; i++)
-	{
-		#pragma unroll 10
-		for (int j = 0; j < 10; ++j) {
-			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
-		}
-
-		barrier(CLK_LOCAL_MEM_FENCE);
-		*xin1_store = text;
-		barrier(CLK_LOCAL_MEM_FENCE);
-		text ^= *xin1_load;
-	}
+    /* Leftover threads also perform this loop.
+     * The results of the leftover threads are ignored.
+     */
+    #pragma unroll 16
+    for(size_t i = 0; i < 16; i++)
+    {
+        #pragma unroll 10
+        for (int j = 0; j < 10; ++j) {
+            text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+        }
+#if (HAS_AMD_BPERMUTE == 1)
+	    text.s0 ^= __builtin_amdgcn_ds_bpermute(lane, text.s0);
+        text.s1 ^= __builtin_amdgcn_ds_bpermute(lane, text.s1);
+        text.s2 ^= __builtin_amdgcn_ds_bpermute(lane, text.s2);
+        text.s3 ^= __builtin_amdgcn_ds_bpermute(lane, text.s3);
+		//__builtin_amdgcn_s_waitcnt(0);
+#else
+        barrier(CLK_LOCAL_MEM_FENCE);
+        *xin1_store = text;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        text ^= *xin1_load;
+#endif
+    }
 #endif
 
-	__local ulong State_buf[8 * 25];
+    __local ulong State_buf[8 * 25];
 #if(COMP_MODE==1)
-	// do not use early return here
-	if(gIdx < Threads)
+    // do not use early return here
+    if(gIdx < Threads)
 #endif
-	{
-		vstore2(as_ulong2(text), get_local_id(1) + 4, states);
-	}
+    {
+        vstore2(as_ulong2(text), lIdx + 4, states);
+    }
 
-	barrier(CLK_GLOBAL_MEM_FENCE);
+    barrier(CLK_GLOBAL_MEM_FENCE);
 
 #if(COMP_MODE==1)
-	// do not use early return here
-	if(gIdx < Threads)
+    // do not use early return here
+    if(gIdx < Threads)
 #endif
-	{
-		if(!get_local_id(1))
-		{
-			__local ulong* State = State_buf + get_local_id(0) * 25;
+    {
+        if(!lIdx)
+        {
+            __local ulong* State = State_buf + groupIdx * 25;
 
-			for(int i = 0; i < 25; ++i) State[i] = states[i];
+            for(int i = 0; i < 25; ++i) State[i] = states[i];
 
-			keccakf1600_2(State);
+            keccakf1600_2(State);
 
 #if (ALGO == cryptonight_gpu)
 			if(State[3] <= Target)
 			{
 				ulong outIdx = atomic_inc(output + 0xFF);
 				if(outIdx < 0xFF)
-					output[outIdx] = get_global_id(0);
+					output[outIdx] = get_global_id(1);
 			}
 #else
-			for(int i = 0; i < 25; ++i) states[i] = State[i];
+            for(int i = 0; i < 25; ++i) states[i] = State[i];
 
-			uint StateSwitch = State[0] & 3;
-			__global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1;
-			__global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3;
-			__global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2;
-			destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx;
+            uint StateSwitch = State[0] & 3;
+            __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1;
+            __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3;
+            __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2;
+            destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx;
 #endif
-		}
-	}
-	mem_fence(CLK_GLOBAL_MEM_FENCE);
+        }
+    }
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
 )==="
-R"===(
+	R"===(
 
 #define VSWAP8(x)	(((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \
 		  | (((x) >>  8) & 0x00000000FF000000UL) | (((x) <<  8) & 0x000000FF00000000UL) \
@@ -1307,7 +1356,42 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 #endif
 		ulong H[8], M[8];
 
-		for (uint i = 0; i < 3; ++i) {
+		// BUG: AMD driver 19.7.X crashes if this is written as a loop,
+		// so the three iterations (i = 0, 1, 2) are unrolled by hand.
+		uint i = 0;
+		{
+			((ulong8 *)M)[0] = vload8(i, states);
+
+			for (uint x = 0; x < 8; ++x) {
+			    H[x] = M[x] ^ State[x];
+			}
+
+			PERM_SMALL_P(H);
+			PERM_SMALL_Q(M);
+
+			for (uint x = 0; x < 8; ++x)
+			{
+			    State[x] ^= H[x] ^ M[x];
+			}
+		}
+		i = 1;
+		{
+			((ulong8 *)M)[0] = vload8(i, states);
+
+			for (uint x = 0; x < 8; ++x) {
+			    H[x] = M[x] ^ State[x];
+			}
+
+			PERM_SMALL_P(H);
+			PERM_SMALL_Q(M);
+
+			for (uint x = 0; x < 8; ++x)
+			{
+			    State[x] ^= H[x] ^ M[x];
+			}
+		}
+		i = 2;
+		{
 			((ulong8 *)M)[0] = vload8(i, states);
 
 			for (uint x = 0; x < 8; ++x) {
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
index e87819760..bb37581f2 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
@@ -84,7 +84,7 @@ inline void single_comupte_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4
 }
 
 )==="
-R"===(
+	R"===(
 
 static const __constant uint look[16][4] = {
 	{0, 1, 2, 3},
@@ -220,7 +220,7 @@ __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, u
 }
 
 )==="
-R"===(
+	R"===(
 
 static const __constant uint skip[3] = {
 	20,22,22
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl
similarity index 88%
rename from xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl
rename to xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl
index 9edb774ad..cdb5aef3e 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl
@@ -1,4 +1,5 @@
 R"===(
+
 /*
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -15,29 +16,15 @@ R"===(
  *
  */
 
-#define cryptonight_r_wow 15
-#define cryptonight_r 16
-
-#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT)
-
-#if(STRIDED_INDEX==0)
-#   define IDX(x)	(x)
-#elif(STRIDED_INDEX==1)
-#	define IDX(x)   (mul24(((uint)(x)), Threads))
-#elif(STRIDED_INDEX==2)
-#   define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
-#elif(STRIDED_INDEX==3)
-#	define IDX(x)   ((x) * WORKSIZE)
-#endif
-
+#ifndef SCRATCHPAD_CHUNK
 // __NV_CL_C_VERSION checks if NVIDIA opencl is used
-#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
-#	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
-#	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
-#else
-#	define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)])
+#	if((ALGO == cryptonight_r_wow || ALGO == cryptonight_r) && defined(__NV_CL_C_VERSION))
+#		define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
+#		define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
+#	else
+#		define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)])
+#	endif
 #endif
-
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *states, uint Threads)
 {
@@ -162,7 +149,9 @@ __kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *stat
 #endif
 #define ROT_BITS 32
 
-	XMRSTAK_INCLUDE_RANDOM_MATH
+XMRSTAK_INCLUDE_RANDOM_MATH
+
+#undef ROT_BITS
 
 #if (ALGO == cryptonight_r)
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl
new file mode 100644
index 000000000..2c318fcbf
--- /dev/null
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl
@@ -0,0 +1,33 @@
+R"===(
+/*
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#define cryptonight_r_wow 15
+#define cryptonight_r 16
+
+#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT)
+
+#if(STRIDED_INDEX==0)
+#   define IDX(x)	(x)
+#elif(STRIDED_INDEX==1)
+#	define IDX(x)   (mul24(((uint)(x)), Threads))
+#elif(STRIDED_INDEX==2)
+#   define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
+#elif(STRIDED_INDEX==3)
+#	define IDX(x)   ((x) * WORKSIZE)
+#endif
+
+)==="
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl
index 22603853f..02ce53e03 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl
@@ -125,7 +125,7 @@ static const __constant ulong T0_G[] =
 };
 
 )==="
-R"===(
+	R"===(
 
 static const __constant ulong T4_G[] =
 {
@@ -292,4 +292,3 @@ static const __constant ulong T4_G[] =
 		} while (0)
 
 )==="
-
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
index 73ef90882..17abc3bc8 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
@@ -90,8 +90,85 @@ ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s, con
 
 ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t)
 {
-	#pragma unroll
-	for(int i = 0; i < 18; ++i)
+	// BUG: AMD driver 19.7.X crashes if this is written as a loop,
+	// so the loop is unrolled by hand as a workaround.
+	int i = 0;
+	{
+		p = SkeinEvenRound(p, h, t, 0U, i);
+		++i;
+		ulong tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinOddRound(p, h, t, 1U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinEvenRound(p, h, t, 2U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinOddRound(p, h, t, 0U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinEvenRound(p, h, t, 1U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinOddRound(p, h, t, 2U, i);
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		++i;
+	}
+	{
+		p = SkeinEvenRound(p, h, t, 0U, i);
+		++i;
+		ulong tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinOddRound(p, h, t, 1U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinEvenRound(p, h, t, 2U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinOddRound(p, h, t, 0U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinEvenRound(p, h, t, 1U, i);
+		++i;
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		p = SkeinOddRound(p, h, t, 2U, i);
+		tmp = h.s0;
+		h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+		h.s7 = h8;
+		h8 = tmp;
+		++i;
+	}
 	{
 		p = SkeinEvenRound(p, h, t, 0U, i);
 		++i;
@@ -129,7 +206,6 @@ ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t)
 		h.s7 = h8;
 		h8 = tmp;
 	}
-
 	p += h;
 	p.s5 += t[0];
 	p.s6 += t[1];
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 120fb6898..075acbd49 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -5,18 +5,18 @@
 #include "autoAdjust.hpp"
 #include "jconf.hpp"
 
-#include "xmrstak/misc/console.hpp"
-#include "xmrstak/misc/configEditor.hpp"
-#include "xmrstak/params.hpp"
 #include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/misc/console.hpp"
+#include "xmrstak/params.hpp"
 
-#include <vector>
+#include <algorithm>
 #include <cstdio>
+#include <iostream>
 #include <sstream>
 #include <string>
-#include <iostream>
-#include  <algorithm>
+#include <vector>
 
 #if defined(__APPLE__)
 #include <OpenCL/cl.h>
@@ -24,7 +24,6 @@
 #include <CL/cl.h>
 #endif
 
-
 namespace xmrstak
 {
 namespace amd
@@ -32,11 +31,9 @@ namespace amd
 
 class autoAdjust
 {
-public:
-
+  public:
 	autoAdjust()
 	{
-
 	}
 
 	/** print the adjusted values if needed
@@ -50,18 +47,17 @@ class autoAdjust
 
 		if(platformIndex == -1)
 		{
-			printer::inst()->print_msg(L0,"WARNING: No AMD OpenCL platform found. Possible driver issues or wrong vendor driver.");
+			printer::inst()->print_msg(L0, "WARNING: No AMD OpenCL platform found. Possible driver issues or wrong vendor driver.");
 			return false;
 		}
 
 		devVec = getAMDDevices(platformIndex);
 
-
 		int deviceCount = devVec.size();
 
 		if(deviceCount == 0)
 		{
-			printer::inst()->print_msg(L0,"WARNING: No AMD device found.");
+			printer::inst()->print_msg(L0, "WARNING: No AMD device found.");
 			return false;
 		}
 
@@ -69,17 +65,16 @@ class autoAdjust
 		return true;
 	}
 
-private:
-
+  private:
 	void generateThreadConfig(const int platformIndex)
 	{
 		// load the template of the backend config into a char variable
-		const char *tpl =
-			#include "./config.tpl"
-		;
+		const char* tpl =
+#include "./config.tpl"
+			;
 
 		configEditor configTpl{};
-		configTpl.set( std::string(tpl) );
+		configTpl.set(std::string(tpl));
 
 		constexpr size_t byteToMiB = 1024u * 1024u;
 
@@ -94,6 +89,42 @@ class autoAdjust
 		std::string conf;
 		for(auto& ctx : devVec)
 		{
+			std::string enabledGpus = params::inst().amdGpus;
+			bool enabled = true;
+			if (!enabledGpus.empty())
+			{
+				enabled = false;
+				std::stringstream ss(enabledGpus);
+
+				int i = -1;
+				while (ss >> i)
+				{
+					if (i == ctx.deviceIdx)
+					{
+						enabled = true;
+						break;
+					}
+
+					while (ss.peek() == ',' || ss.peek() == ' ')
+						ss.ignore();
+				}
+			}
+
+			// check if cryptonight_monero_v8 is selected for the user or dev pool
+			bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end());
+
+			// true for all cryptonight_heavy derivatives since we check the user and dev pool
+			bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end();
+
+			// true for cryptonight_gpu as main user pool algorithm
+			bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu;
+
+			bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r;
+
+			bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow;
+
+			// 8 threads per block (this is a good value for most GPUs)
+			uint32_t default_workSize = 8;
 			size_t minFreeMem = 128u * byteToMiB;
 			/* 1000 is a magic selected limit, the reason is that more than 2GiB memory
 			 * sowing down the memory performance because of TLB cache misses
@@ -107,42 +138,31 @@ class autoAdjust
 				// UNKNOWN
 				ctx.name.compare("gfx900") == 0 ||
 				ctx.name.compare("gfx903") == 0 ||
-				ctx.name.compare("gfx905") == 0
-			)
+				ctx.name.compare("gfx905") == 0 ||
+				// Radeon VII
+				ctx.name.compare("gfx906") == 0 ||
+				ctx.name.compare("Fiji") == 0)
 			{
 				/* Increase the number of threads for AMD VEGA gpus.
 				 * Limit the number of threads based on the issue: https://github.com/fireice-uk/xmr-stak/issues/5#issuecomment-339425089
 				 * to avoid out of memory errors
 				 */
 				maxThreads = 2024u;
+
+				if(useCryptonight_gpu)
+					default_workSize = 16u;
 			}
 
 			// NVIDIA optimizations
 			if(
-				ctx.isNVIDIA && (
-					ctx.name.find("P100") != std::string::npos ||
-				    ctx.name.find("V100") != std::string::npos
-				)
-			)
+				ctx.isNVIDIA && (ctx.name.find("P100") != std::string::npos ||
+									ctx.name.find("V100") != std::string::npos))
 			{
 				// do not limit the number of threads
 				maxThreads = 40000u;
 				minFreeMem = 512u * byteToMiB;
 			}
 
-			// check if cryptonight_monero_v8 is selected for the user or dev pool
-			bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end());
-
-			// true for all cryptonight_heavy derivates since we check the user and dev pool
-			bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end();
-
-			// true for cryptonight_gpu as main user pool algorithm
-			bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu;
-
-			bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r;
-
-			bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow;
-
 			// set strided index to default
 			ctx.stridedIndex = 1;
 
@@ -164,6 +184,7 @@ class autoAdjust
 			}
 
 			uint32_t numUnroll = 8;
+			uint32_t numThreads = 1u;
 
 			if(useCryptonight_gpu)
 			{
@@ -171,7 +192,11 @@ class autoAdjust
 				// @todo check again after all optimizations
 				maxThreads = ctx.computeUnits * 6 * 8;
 				ctx.stridedIndex = 0;
-				numUnroll = 1;
+				// do not change unroll for AMD RX5700 but set 2 threads per gpu
+				if(ctx.name.compare("gfx1010") == 0)
+					numThreads = 2;
+				else
+					numUnroll = 1;
 			}
 
 			// keep 128MiB memory free (value is randomly chosen) from the max available memory
@@ -179,7 +204,6 @@ class autoAdjust
 
 			size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem);
 
-			uint32_t numThreads = 1u;
 			if(ctx.isAMD && !useCryptonight_gpu)
 			{
 				numThreads = 2;
@@ -190,34 +214,42 @@ class autoAdjust
 			// 240byte extra memory is used per thread for meta data
 			size_t perThread = hashMemSize + 240u;
 			size_t maxIntensity = memPerThread / perThread;
-			size_t possibleIntensity = std::min( maxThreads , maxIntensity );
-			// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
-			size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8;
-			// in the case we use two threads per gpu we can be relax and need no multiple of the number of compute units
-			if(numThreads == 2)
-				intensity = (possibleIntensity / 8) * 8;
+			size_t possibleIntensity = std::min(maxThreads, maxIntensity);
+			// map intensity to a multiple of the compute unit count, default_workSize is the number of threads per work group
+			size_t intensity = (possibleIntensity / (default_workSize * ctx.computeUnits)) * ctx.computeUnits * default_workSize;
+
+			size_t computeUnitUtilization = ((possibleIntensity * 100)  / (default_workSize * ctx.computeUnits)) % 100;
+			// if we use two threads per gpu, or if the leftover work would fill at least 75% of the compute units,
+			// we can relax: the intensity only needs to be a multiple of the work size, not of the compute unit count
+			if(numThreads == 2 || computeUnitUtilization >= 75)
+				intensity = (possibleIntensity / default_workSize) * default_workSize;
 
 			//If the intensity is 0, then it's because the multiple of the unit count is greater than intensity
-			if (intensity == 0)
+			if(intensity == 0)
 			{
 				printer::inst()->print_msg(L0, "WARNING: Auto detected intensity unexpectedly low. Try to set the environment variable GPU_SINGLE_ALLOC_PERCENT.");
 				intensity = possibleIntensity;
-
 			}
-			if (intensity != 0)
+			if(intensity != 0)
 			{
+				if (!enabled)
+					conf += "/* Disabled\n";
+
 				for(uint32_t thd = 0; thd < numThreads; ++thd)
 				{
 					conf += "  // gpu: " + ctx.name + std::string("  compute units: ") + std::to_string(ctx.computeUnits) + "\n";
 					conf += "  // memory:" + std::to_string(memPerThread / byteToMiB) + "|" +
-						std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" +  std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n";
-					// set 8 threads per block (this is a good value for the most gpus)
+							std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n";
 					conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
-						"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-						"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
-						"    \"unroll\" : " + std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
-						"  },\n";
+							"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(default_workSize) + ",\n" +
+							"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
+																													   "    \"unroll\" : " +
+							std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
+							"  },\n";
 				}
+
+				if (!enabled)
+					conf += "*/\n";
 			}
 			else
 			{
@@ -225,8 +257,8 @@ class autoAdjust
 			}
 		}
 
-		configTpl.replace("PLATFORMINDEX",std::to_string(platformIndex));
-		configTpl.replace("GPUCONFIG",conf);
+		configTpl.replace("PLATFORMINDEX", std::to_string(platformIndex));
+		configTpl.replace("GPUCONFIG", conf);
 		configTpl.write(params::inst().configFileAMD);
 
 		const std::string backendName = xmrstak::params::inst().openCLVendor;
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index d3dc00d01..c5a63c56f 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -21,10 +21,9 @@
   *
   */
 
-
 #include "jconf.hpp"
-#include "xmrstak/misc/jext.hpp"
 #include "xmrstak/misc/console.hpp"
+#include "xmrstak/misc/jext.hpp"
 
 #ifdef _WIN32
 #define strcasecmp _stricmp
@@ -37,7 +36,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-
 namespace xmrstak
 {
 namespace amd
@@ -48,9 +46,14 @@ using namespace rapidjson;
 /*
  * This enum needs to match index in oConfigValues, otherwise we will get a runtime error
  */
-enum configEnum { aGpuThreadsConf, iPlatformIdx };
+enum configEnum
+{
+	aGpuThreadsConf, // position of the "gpu_threads_conf" entry in oConfigValues
+	iPlatformIdx // position of the "platform_index" entry in oConfigValues
+};
 
-struct configVal {
+struct configVal
+{
 	configEnum iName;
 	const char* sName;
 	Type iType;
@@ -59,24 +62,25 @@ struct configVal {
 // Same order as in configEnum, as per comment above
 // kNullType means any type
 configVal oConfigValues[] = {
-	{ aGpuThreadsConf, "gpu_threads_conf", kNullType },
-	{ iPlatformIdx, "platform_index", kNumberType }
-};
-
-constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
+	{aGpuThreadsConf, "gpu_threads_conf", kNullType},
+	{iPlatformIdx, "platform_index", kNumberType}};
 
+constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0]));
 
-enum optionalConfigEnum { iAutoTune };
+enum optionalConfigEnum
+{
+	iAutoTune // position of the "auto_tune" entry in oOptionalConfigValues
+};
 
-struct optionalConfigVal {
+struct optionalConfigVal
+{
 	optionalConfigEnum iName;
 	const char* sName;
 	Type iType;
 };
 
 optionalConfigVal oOptionalConfigValues[] = {
-	{ iAutoTune, "auto_tune", kNumberType }
-};
+	{iAutoTune, "auto_tune", kNumberType}};
 
 inline bool checkType(Type have, Type want)
 {
@@ -109,7 +113,7 @@ jconf::jconf()
 	prv = new opaque_private();
 }
 
-bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
+bool jconf::GetThreadConfig(size_t id, thd_cfg& cfg)
 {
 	if(id >= prv->configValues[aGpuThreadsConf]->Size())
 		return false;
@@ -176,7 +180,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 		return false;
 	}
 
-	if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18 )
+	if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18)
 	{
 		printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18");
 		return false;
@@ -215,7 +219,7 @@ size_t jconf::GetPlatformIdx()
 size_t jconf::GetAutoTune()
 {
 	const Value* value = GetObjectMember(prv->jsonDoc, oOptionalConfigValues[iAutoTune].sName);
-	if( value != nullptr && value->IsUint64())
+	if(value != nullptr && value->IsUint64())
 	{
 		return value->GetUint64();
 	}
@@ -233,22 +237,22 @@ size_t jconf::GetThreadCount()
 
 bool jconf::parse_config(const char* sFilename)
 {
-	FILE * pFile;
-	char * buffer;
+	FILE* pFile;
+	char* buffer;
 	size_t flen;
 
 	pFile = fopen(sFilename, "rb");
-	if (pFile == NULL)
+	if(pFile == NULL)
 	{
 		printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename);
 		return false;
 	}
 
-	fseek(pFile,0,SEEK_END);
+	fseek(pFile, 0, SEEK_END);
 	flen = ftell(pFile);
 	rewind(pFile);
 
-	if(flen >= 64*1024)
+	if(flen >= 64 * 1024)
 	{
 		fclose(pFile);
 		printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename);
@@ -262,7 +266,7 @@ bool jconf::parse_config(const char* sFilename)
 	}
 
 	buffer = (char*)malloc(flen + 3);
-	if(fread(buffer+1, flen, 1, pFile) != 1)
+	if(fread(buffer + 1, flen, 1, pFile) != 1)
 	{
 		free(buffer);
 		fclose(pFile);
@@ -284,7 +288,7 @@ bool jconf::parse_config(const char* sFilename)
 	buffer[flen] = '}';
 	buffer[flen + 1] = '\0';
 
-	prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2);
+	prv->jsonDoc.Parse<kParseCommentsFlag | kParseTrailingCommasFlag>(buffer, flen + 2);
 	free(buffer);
 
 	if(prv->jsonDoc.HasParseError())
@@ -294,7 +298,6 @@ bool jconf::parse_config(const char* sFilename)
 		return false;
 	}
 
-
 	if(!prv->jsonDoc.IsObject())
 	{ //This should never happen as we created the root ourselves
 		printer::inst()->print_msg(L0, "Invalid config file '%s'. No root?", sFilename);
@@ -326,7 +329,7 @@ bool jconf::parse_config(const char* sFilename)
 
 	size_t n_thd = prv->configValues[aGpuThreadsConf]->Size();
 	thd_cfg c;
-	for(size_t i=0; i < n_thd; i++)
+	for(size_t i = 0; i < n_thd; i++)
 	{
 		if(!GetThreadConfig(i, c))
 		{
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index 51a0c79ac..6f50c3059 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -12,16 +12,18 @@ namespace amd
 
 class jconf
 {
-public:
+  public:
 	static jconf* inst()
 	{
-		if (oInst == nullptr) oInst = new jconf;
+		if(oInst == nullptr)
+			oInst = new jconf;
 		return oInst;
 	};
 
 	bool parse_config(const char* sFilename = params::inst().configFileAMD.c_str());
 
-	struct thd_cfg {
+	struct thd_cfg
+	{
 		size_t index;
 		size_t intensity;
 		size_t w_size;
@@ -34,18 +36,17 @@ class jconf
 	};
 
 	size_t GetThreadCount();
-	bool GetThreadConfig(size_t id, thd_cfg &cfg);
+	bool GetThreadConfig(size_t id, thd_cfg& cfg);
 
 	size_t GetAutoTune();
 	size_t GetPlatformIdx();
 
-private:
+  private:
 	jconf();
 	static jconf* oInst;
 
 	struct opaque_private;
 	opaque_private* prv;
-
 };
 
 } // namespace amd
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 3be593175..3a65de8e2 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -22,23 +22,23 @@
   */
 
 #include "minethd.hpp"
-#include "autoAdjust.hpp"
 #include "amd_gpu/gpu.hpp"
+#include "autoAdjust.hpp"
 
-#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
 #include "xmrstak/backend/cpu/crypto/cryptonight.h"
-#include "xmrstak/misc/configEditor.hpp"
-#include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
+#include "xmrstak/backend/cpu/hwlocMemory.hpp"
 #include "xmrstak/backend/cpu/minethd.hpp"
 #include "xmrstak/jconf.hpp"
-#include "xmrstak/misc/executor.hpp"
+#include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/misc/environment.hpp"
+#include "xmrstak/misc/executor.hpp"
 #include "xmrstak/params.hpp"
-#include "xmrstak/backend/cpu/hwlocMemory.hpp"
 
 #include <assert.h>
-#include <cmath>
 #include <chrono>
+#include <cmath>
 #include <thread>
 #include <vector>
 
@@ -53,6 +53,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th
 	oWork = pWork;
 	bQuit = 0;
 	iThreadNo = (uint8_t)iNo;
+	this->iGpuIndex = cfg.index;
 	iJobNo = 0;
 	iHashCount = 0;
 	iTimestamp = 0;
@@ -72,15 +73,16 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th
 			printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
 }
 
-extern "C"  {
+extern "C"
+{
 #ifdef WIN32
-__declspec(dllexport)
+	__declspec(dllexport)
 #endif
-std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env)
-{
-	environment::inst(&env);
-	return amd::minethd::thread_starter(threadOffset, pWork);
-}
+		std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env)
+	{
+		environment::inst(&env);
+		return amd::minethd::thread_starter(threadOffset, pWork);
+	}
 } // extern "C"
 
 bool minethd::init_gpus()
@@ -137,7 +139,7 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	pvThreads->reserve(n);
 
 	jconf::thd_cfg cfg;
-	for (i = 0; i < n; i++)
+	for(i = 0; i < n; i++)
 	{
 		jconf::inst()->GetThreadConfig(i, cfg);
 
@@ -161,7 +163,6 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	return pvThreads;
 }
 
-
 void minethd::work_main()
 {
 	if(affinity >= 0) //-1 means no affinity
@@ -169,10 +170,9 @@ void minethd::work_main()
 
 	order_fix.set_value();
 	std::unique_lock<std::mutex> lck(thd_aff_set);
-	lck.release();
+	lck.unlock();
 	std::this_thread::yield();
 
-	uint64_t iCount = 0;
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
 
@@ -204,16 +204,16 @@ void minethd::work_main()
 	double bestHashrate = 0.0;
 	uint32_t bestIntensity = pGpuCtx->maxRawIntensity;
 
-	while (bQuit == 0)
+	while(bQuit == 0)
 	{
-		if (oWork.bStall)
+		if(oWork.bStall)
 		{
 			/* We are stalled here because the executor didn't find a job for us yet,
 			 * either because of network latency, or a socket problem. Since we are
 			 * raison d'etre of this software it us sensible to just wait until we have something
 			 */
 
-			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+			while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 				std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
 			globalStates::inst().consume_work(oWork, iJobNo);
@@ -267,14 +267,14 @@ void minethd::work_main()
 			uint64_t t0 = interleaveAdjustDelay(pGpuCtx, adjustInterleave);
 
 			cl_uint results[0x100];
-			memset(results,0,sizeof(cl_uint)*(0x100));
+			memset(results, 0, sizeof(cl_uint) * (0x100));
 
 			XMRRunJob(pGpuCtx, results, miner_algo);
 
 			for(size_t i = 0; i < results[0xFF]; i++)
 			{
-				uint8_t	bWorkBlob[128];
-				uint8_t	bResult[32];
+				uint8_t bWorkBlob[128];
+				uint8_t bResult[32];
 
 				memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
 				memset(bResult, 0, sizeof(job_result::bResult));
@@ -282,16 +282,13 @@ void minethd::work_main()
 				*(uint32_t*)(bWorkBlob + 39) = results[i];
 
 				cpu_ctx->hash_fn(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
-				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
+				if((*((uint64_t*)(bResult + 24))) < oWork.iTarget)
 					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
 					executor::inst()->push_event(ex_event("AMD Invalid Result", pGpuCtx->deviceIdx, oWork.iPoolId));
 			}
 
-			iCount += pGpuCtx->rawIntensity;
-			uint64_t iStamp = get_timestamp_ms();
-			iHashCount.store(iCount, std::memory_order_relaxed);
-			iTimestamp.store(iStamp, std::memory_order_relaxed);
+			updateStats(pGpuCtx->rawIntensity, oWork.iPoolId);
 
 			accRuntime += updateTimings(pGpuCtx, t0);
 
@@ -317,20 +314,18 @@ void minethd::work_main()
 						// lock intensity to the best values
 						autoTune = 0;
 						pGpuCtx->rawIntensity = bestIntensity;
-						printer::inst()->print_msg(L1,"OpenCL %u|%u: lock intensity at %u",
+						printer::inst()->print_msg(L1, "OpenCL %u|%u: lock intensity at %u",
 							pGpuCtx->deviceIdx,
 							pGpuCtx->idWorkerOnDevice,
-							bestIntensity
-						);
+							bestIntensity);
 					}
 					else
 					{
-						printer::inst()->print_msg(L1,"OpenCL %u|%u: auto-tune validate intensity %u|%u",
+						printer::inst()->print_msg(L1, "OpenCL %u|%u: auto-tune validate intensity %u|%u",
 							pGpuCtx->deviceIdx,
 							pGpuCtx->idWorkerOnDevice,
 							pGpuCtx->rawIntensity,
-							bestIntensity
-						);
+							bestIntensity);
 					}
 					// update gpu with new intensity
 					XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, cpu_ctx->cn_r_ctx.height);
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 402d63cd6..579abb1b5 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -3,27 +3,26 @@
 #include "amd_gpu/gpu.hpp"
 #include "jconf.hpp"
 #include "xmrstak/backend/cpu/crypto/cryptonight.h"
-#include "xmrstak/backend/miner_work.hpp"
 #include "xmrstak/backend/iBackend.hpp"
+#include "xmrstak/backend/miner_work.hpp"
 #include "xmrstak/misc/environment.hpp"
 
-#include <thread>
 #include <atomic>
 #include <future>
+#include <thread>
 
 namespace xmrstak
 {
 namespace amd
 {
 
-class minethd  : public iBackend
+class minethd : public iBackend
 {
-public:
-
+  public:
 	static std::vector<iBackend*>* thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool init_gpus();
 
-private:
+  private:
 	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&);
 
 	minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg);
diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp
index 0eea9fdd7..93a8fd9d6 100644
--- a/xmrstak/backend/backendConnector.cpp
+++ b/xmrstak/backend/backendConnector.cpp
@@ -21,31 +21,30 @@
   *
   */
 
-#include "iBackend.hpp"
 #include "backendConnector.hpp"
-#include "miner_work.hpp"
 #include "globalStates.hpp"
+#include "iBackend.hpp"
+#include "miner_work.hpp"
 #include "plugin.hpp"
-#include "xmrstak/misc/environment.hpp"
 #include "xmrstak/misc/console.hpp"
+#include "xmrstak/misc/environment.hpp"
 #include "xmrstak/params.hpp"
 
 #include "cpu/minethd.hpp"
 #ifndef CONF_NO_CUDA
-#	include "nvidia/minethd.hpp"
+#include "nvidia/minethd.hpp"
 #endif
 #ifndef CONF_NO_OPENCL
-#	include "amd/minethd.hpp"
+#include "amd/minethd.hpp"
 #endif
 
-#include <cstdlib>
 #include <assert.h>
-#include <cmath>
+#include <bitset>
 #include <chrono>
+#include <cmath>
+#include <cstdlib>
 #include <cstring>
 #include <thread>
-#include <bitset>
-
 
 namespace xmrstak
 {
@@ -82,31 +81,52 @@ std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork)
 #ifndef CONF_NO_CUDA
 	if(params::inst().useNVIDIA)
 	{
+		bool disableNvidia = false;
+
 		plugin nvidiaplugin;
-		std::vector<std::string> libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend_cuda9_2", "xmrstak_cuda_backend"};
+#ifdef XMRSTAK_DEV_RELEASE
+		std::vector<std::string> libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend"};
+#	ifndef _WIN32
+		auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+		bool cn_r_derivate =
+			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r) != neededAlgorithms.end() ||
+			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end();
+
+		if(cn_r_derivate)
+		{
+			disableNvidia = true;
+			printer::inst()->print_msg(L0, "WARNING: The linux release binaries not support cryptonight_r derived coins for NVIDIA.");		
+		}
+#	endif
+#else
+		std::vector<std::string> libNames = {"xmrstak_cuda_backend"};
+#endif
 		size_t numWorkers = 0u;
 
-		for( const auto & name : libNames)
+		if(!disableNvidia)
 		{
-			printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str());
-			nvidiaplugin.load("NVIDIA", name);
-			std::vector<iBackend*>* nvidiaThreads = nvidiaplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
-			if(nvidiaThreads != nullptr)
-			{
-				pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads));
-				numWorkers = nvidiaThreads->size();
-				delete nvidiaThreads;
-			}
-			else
-			{
-				// remove the plugin if we have found no GPUs
-				nvidiaplugin.unload();
-			}
-			// we found at leat one working GPU
-			if(numWorkers != 0)
+			for(const auto& name : libNames)
 			{
-				printer::inst()->print_msg(L0, "NVIDIA: use library '%s'", name.c_str());
-				break;
+				printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str());
+				nvidiaplugin.load("NVIDIA", name);
+				std::vector<iBackend*>* nvidiaThreads = nvidiaplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
+				if(nvidiaThreads != nullptr)
+				{
+					pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads));
+					numWorkers = nvidiaThreads->size();
+					delete nvidiaThreads;
+				}
+				else
+				{
+					// remove the plugin if we have found no GPUs
+					nvidiaplugin.unload();
+				}
+				// we found at least one working GPU
+				if(numWorkers != 0)
+				{
+					printer::inst()->print_msg(L0, "NVIDIA: use library '%s'", name.c_str());
+					break;
+				}
 			}
 		}
 		if(numWorkers == 0)
diff --git a/xmrstak/backend/backendConnector.hpp b/xmrstak/backend/backendConnector.hpp
index 66d873e48..1f2cb8ff6 100644
--- a/xmrstak/backend/backendConnector.hpp
+++ b/xmrstak/backend/backendConnector.hpp
@@ -3,19 +3,18 @@
 #include "iBackend.hpp"
 #include "miner_work.hpp"
 
-#include <thread>
-#include <vector>
 #include <atomic>
 #include <mutex>
-
+#include <thread>
+#include <vector>
 
 namespace xmrstak
 {
 
-	struct BackendConnector
-	{
-		static std::vector<iBackend*>* thread_starter(miner_work& pWork);
-		static bool self_test();
-	};
+struct BackendConnector
+{
+	static std::vector<iBackend*>* thread_starter(miner_work& pWork);
+	static bool self_test();
+};
 
 } // namespace xmrstak
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index ba0e6984f..98c145004 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -2,12 +2,12 @@
 
 #include "jconf.hpp"
 
-#include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/params.hpp"
-#include "xmrstak/backend/cryptonight.hpp"
-#include "xmrstak/backend/cpu/cpuType.hpp"
 #include <string>
 
 #ifdef _WIN32
@@ -16,7 +16,6 @@
 #include <unistd.h>
 #endif // _WIN32
 
-
 namespace xmrstak
 {
 namespace cpu
@@ -24,8 +23,7 @@ namespace cpu
 
 class autoAdjust
 {
-public:
-
+  public:
 	bool printConfig()
 	{
 		auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
@@ -42,10 +40,10 @@ class autoAdjust
 		configEditor configTpl{};
 
 		// load the template of the backend config into a char variable
-		const char *tpl =
-			#include "./config.tpl"
-		;
-		configTpl.set( std::string(tpl) );
+		const char* tpl =
+#include "./config.tpl"
+			;
+		configTpl.set(std::string(tpl));
 
 		std::string conf;
 
@@ -75,14 +73,14 @@ class autoAdjust
 				linux_layout ? "Linux" : "Windows");
 
 			uint32_t aff_id = 0;
-			for(uint32_t i=0; i < corecnt; i++)
+			for(uint32_t i = 0; i < corecnt; i++)
 			{
 				bool double_mode;
 
 				if(L3KB_size <= 0)
 					break;
 
-				double_mode = L3KB_size / hashMemSizeKB > (int32_t)(corecnt-i);
+				double_mode = L3KB_size / hashMemSizeKB > (int32_t)(corecnt - i);
 
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string(double_mode ? "true" : "false");
@@ -110,14 +108,14 @@ class autoAdjust
 		if(useCryptonight_gpu)
 			conf += "*/\n";
 
-		configTpl.replace("CPUCONFIG",conf);
+		configTpl.replace("CPUCONFIG", conf);
 		configTpl.write(params::inst().configFileCPU);
 		printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str());
 
 		return true;
 	}
 
-private:
+  private:
 	bool detectL3Size()
 	{
 		int32_t cpu_info[4];
@@ -125,8 +123,8 @@ class autoAdjust
 
 		::jconf::cpuid(0, 0, cpu_info);
 		memcpy(cpustr, &cpu_info[1], 4);
-		memcpy(cpustr+4, &cpu_info[3], 4);
-		memcpy(cpustr+8, &cpu_info[2], 4);
+		memcpy(cpustr + 4, &cpu_info[3], 4);
+		memcpy(cpustr + 8, &cpu_info[2], 4);
 
 		if(strcmp(cpustr, "GenuineIntel") == 0)
 		{
@@ -139,7 +137,8 @@ class autoAdjust
 			}
 
 			L3KB_size = ((get_masked(cpu_info[1], 31, 22) + 1) * (get_masked(cpu_info[1], 21, 12) + 1) *
-				(get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / 1024;
+							(get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) /
+						1024;
 
 			return true;
 		}
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index f09b1ebc0..f06b0d679 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -1,9 +1,9 @@
 #pragma once
 
-#include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/params.hpp"
-#include "xmrstak/backend/cryptonight.hpp"
 
 #ifdef _WIN32
 #include <windows.h>
@@ -16,17 +16,15 @@
 #include <hwloc.h>
 #include <stdio.h>
 
-
 namespace xmrstak
 {
 namespace cpu
 {
 
-class autoAdjust
+class autoAdjustHwloc
 {
 public:
-
-	autoAdjust()
+	autoAdjustHwloc()
 	{
 		auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
 
@@ -48,10 +46,10 @@ class autoAdjust
 		configEditor configTpl{};
 
 		// load the template of the backend config into a char variable
-		const char *tpl =
-			#include "./config.tpl"
-		;
-		configTpl.set( std::string(tpl) );
+		const char* tpl =
+#include "./config.tpl"
+			;
+		configTpl.set(std::string(tpl));
 
 		// if cryptonight_gpu is used we will disable cpu mining but provide a inactive config
 		bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu;
@@ -62,6 +60,7 @@ class autoAdjust
 			conf += "/*\n//CPU config is disabled by default because cryptonight_gpu is not suitable for CPU mining.\n";
 		}
 
+		bool is_successful = true;
 		try
 		{
 			std::vector<hwloc_obj_t> tlcs;
@@ -69,7 +68,7 @@ class autoAdjust
 			results.reserve(16);
 
 			findChildrenCaches(hwloc_get_root_obj(topology),
-				[&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); } );
+				[&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); });
 
 			if(tlcs.size() == 0)
 				throw(std::runtime_error("The CPU doesn't seem to have a cache."));
@@ -88,34 +87,32 @@ class autoAdjust
 		}
 		catch(const std::runtime_error& err)
 		{
-			// \todo add fallback to default auto adjust
-			conf += std::string("    { \"low_power_mode\" : false");
-			conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : false },\n");
-			printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what());
+			is_successful = false;
+			printer::inst()->print_msg(L0, "Autoconf with hwloc FAILED: %s. Trying basic autoconf.", err.what());
 		}
 
 		if(useCryptonight_gpu)
 			conf += "*/\n";
 
-		configTpl.replace("CPUCONFIG",conf);
+		configTpl.replace("CPUCONFIG", conf);
 		configTpl.write(params::inst().configFileCPU);
 		printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str());
 		/* Destroy topology object. */
 		hwloc_topology_destroy(topology);
 
-		return true;
+		return is_successful;
 	}
 
-private:
+  private:
 	size_t hashMemSize = 0;
 	size_t halfHashMemSize = 0;
 
 	std::vector<uint32_t> results;
 
-	template<typename func>
+	template <typename func>
 	inline void findChildrenByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda)
 	{
-		for(size_t i=0; i < obj->arity; i++)
+		for(size_t i = 0; i < obj->arity; i++)
 		{
 			if(obj->children[i]->type == type)
 				lambda(obj->children[i]);
@@ -133,10 +130,10 @@ class autoAdjust
 #endif // HWLOC_API_VERSION
 	}
 
-	template<typename func>
+	template <typename func>
 	inline void findChildrenCaches(hwloc_obj_t obj, func lambda)
 	{
-		for(size_t i=0; i < obj->arity; i++)
+		for(size_t i = 0; i < obj->arity; i++)
 		{
 			if(isCacheObject(obj->children[i]))
 				lambda(obj->children[i]);
@@ -159,7 +156,7 @@ class autoAdjust
 			throw(std::runtime_error("Cache object hasn't got attributes."));
 
 		size_t PUs = 0;
-		findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; } );
+		findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; });
 
 		//Strange case, but we will handle it silently, surely there must be one PU somewhere?
 		if(PUs == 0)
@@ -172,7 +169,7 @@ class autoAdjust
 				throw(std::runtime_error("The CPU doesn't seem to have a cache."));
 
 			//Try our luck with lower level caches
-			for(size_t i=0; i < obj->arity; i++)
+			for(size_t i = 0; i < obj->arity; i++)
 				processTopLevelCache(obj->children[i]);
 			return;
 		}
@@ -180,7 +177,7 @@ class autoAdjust
 		size_t cacheSize = obj->attr->cache.size;
 		if(isCacheExclusive(obj))
 		{
-			for(size_t i=0; i < obj->arity; i++)
+			for(size_t i = 0; i < obj->arity; i++)
 			{
 				hwloc_obj_t l2obj = obj->children[i];
 				//If L2 is exclusive and greater or equal to 2MB add room for one more hash
@@ -191,7 +188,7 @@ class autoAdjust
 
 		std::vector<hwloc_obj_t> cores;
 		cores.reserve(16);
-		findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); } );
+		findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); });
 
 		size_t cacheHashes = (cacheSize + halfHashMemSize) / hashMemSize;
 
diff --git a/xmrstak/backend/cpu/cpuType.cpp b/xmrstak/backend/cpu/cpuType.cpp
index c85682d4f..5e2519c3b 100644
--- a/xmrstak/backend/cpu/cpuType.cpp
+++ b/xmrstak/backend/cpu/cpuType.cpp
@@ -1,9 +1,9 @@
 
 #include "xmrstak/backend/cpu/cpuType.hpp"
 
+#include <cstdio>
 #include <cstring>
 #include <inttypes.h>
-#include <cstdio>
 
 #ifdef _WIN32
 #define strcasecmp _stricmp
@@ -16,64 +16,63 @@ namespace xmrstak
 {
 namespace cpu
 {
-	void cpuid(uint32_t eax, int32_t ecx, int32_t val[4])
-	{
-		std::memset(val, 0, sizeof(int32_t)*4);
-
-	#ifdef _WIN32
-		__cpuidex(val, eax, ecx);
-	#else
-		__cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]);
-	#endif
-	}
-
-	int32_t get_masked(int32_t val, int32_t h, int32_t l)
-	{
-		val &= (0x7FFFFFFF >> (31-(h-l))) << l;
-		return val >> l;
-	}
+void cpuid(uint32_t eax, int32_t ecx, int32_t val[4])
+{
+	std::memset(val, 0, sizeof(int32_t) * 4);
 
-	bool has_feature(int32_t val, int32_t bit)
-	{
-		int32_t mask = 1 << bit;
-		return (val & mask) != 0u;
+#ifdef _WIN32
+	__cpuidex(val, eax, ecx);
+#else
+	__cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]);
+#endif
+}
 
-	}
+int32_t get_masked(int32_t val, int32_t h, int32_t l)
+{
+	// Returns bits [h-1:l] of val shifted down to bit 0; unsigned math keeps a field that reaches bit 31 from overflowing or sign-extending.
+	return static_cast<int32_t>((static_cast<uint32_t>(val) >> l) & (0x7FFFFFFFu >> (31 - (h - l))));
+}
 
-	Model getModel()
-	{
-		int32_t cpu_info[4];
-		char cpustr[13] = {0};
+bool has_feature(int32_t val, int32_t bit)
+{
+	// True when bit `bit` of val is set; tested in unsigned arithmetic because `1 << 31` overflows a signed int.
+	return ((static_cast<uint32_t>(val) >> bit) & 1u) != 0u;
+}
 
-		cpuid(0, 0, cpu_info);
-		std::memcpy(cpustr, &cpu_info[1], 4);
-		std::memcpy(cpustr+4, &cpu_info[3], 4);
-		std::memcpy(cpustr+8, &cpu_info[2], 4);
+Model getModel()
+{
+	int32_t cpu_info[4];
+	char cpustr[13] = {0};
 
-		Model result;
+	cpuid(0, 0, cpu_info);
+	std::memcpy(cpustr, &cpu_info[1], 4);
+	std::memcpy(cpustr + 4, &cpu_info[3], 4);
+	std::memcpy(cpustr + 8, &cpu_info[2], 4);
 
-		cpuid(1, 0, cpu_info);
+	Model result;
 
-		result.family = get_masked(cpu_info[0], 12, 8);
-		result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4;
-		result.type_name = cpustr;
+	cpuid(1, 0, cpu_info);
 
-		// feature bits https://en.wikipedia.org/wiki/CPUID
-		// sse2
-		result.sse2 = has_feature(cpu_info[3], 26);
-		// aes-ni
-		result.aes = has_feature(cpu_info[2], 25);
-		// avx - 27 is the check if the OS overwrote cpu features
-		result.avx = has_feature(cpu_info[2], 28) && has_feature(cpu_info[2], 27) ;
+	result.family = get_masked(cpu_info[0], 12, 8);
+	result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4;
+	result.type_name = cpustr;
 
-		if(strcmp(cpustr, "AuthenticAMD") == 0)
-		{
-			if(result.family == 0xF)
-				result.family += get_masked(cpu_info[0], 28, 20);
-		}
+	// feature bits https://en.wikipedia.org/wiki/CPUID
+	// sse2
+	result.sse2 = has_feature(cpu_info[3], 26);
+	// aes-ni
+	result.aes = has_feature(cpu_info[2], 25);
+	// avx - 27 is the check if the OS overwrote cpu features
+	result.avx = has_feature(cpu_info[2], 28) && has_feature(cpu_info[2], 27);
 
-		return result;
+	if(strcmp(cpustr, "AuthenticAMD") == 0)
+	{
+		if(result.family == 0xF)
+			result.family += get_masked(cpu_info[0], 28, 20);
 	}
 
+	return result;
+}
+
 } // namespace cpu
 } // namespace xmrstak
diff --git a/xmrstak/backend/cpu/cpuType.hpp b/xmrstak/backend/cpu/cpuType.hpp
index 7f6bfaf51..2bafa4105 100644
--- a/xmrstak/backend/cpu/cpuType.hpp
+++ b/xmrstak/backend/cpu/cpuType.hpp
@@ -1,32 +1,30 @@
 #pragma once
 
-#include <string>
 #include <cstdint>
-
+#include <string>
 
 namespace xmrstak
 {
 namespace cpu
 {
-	struct Model
-	{
-		uint32_t family = 0u;
-		uint32_t model = 0u;
-		bool aes = false;
-		bool sse2 = false;
-		bool avx = false;
-		std::string type_name = "unknown";
-	};
+struct Model
+{
+	uint32_t family = 0u; // CPUID leaf 1 EAX[11:8]; getModel() adds EAX[27:20] for "AuthenticAMD" when this is 0xF
+	uint32_t model = 0u; // CPUID leaf 1 EAX[7:4] | (EAX[19:16] << 4)
+	bool aes = false; // CPUID leaf 1 ECX bit 25
+	bool sse2 = false; // CPUID leaf 1 EDX bit 26
+	bool avx = false; // CPUID leaf 1 ECX bit 28 and bit 27 both set
+	std::string type_name = "unknown"; // 12-character vendor string read from CPUID leaf 0
+};
 
-	Model getModel();
+Model getModel();
 
-	/** Mask bits between h and l and return the value
+/** Return bits [h-1:l] of val shifted down to bit 0 (h is exclusive, l is inclusive)
 	 *
 	 * This enables us to put in values exactly like in the manual
 	 * For example EBX[30:22] is get_masked(cpu_info[1], 31, 22)
 	 */
-	int32_t get_masked(int32_t val, int32_t h, int32_t l);
+int32_t get_masked(int32_t val, int32_t h, int32_t l);
 
-	
 } // namespace cpu
 } // namespace xmrstak
diff --git a/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp b/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp
index 2fc1a8baa..5d55987ac 100644
--- a/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp
+++ b/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp
@@ -1,77 +1,87 @@
 #include <cstring>
 
-typedef void(*void_func)();
+typedef void (*void_func)();
 
-#include "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.h"
-#include "cryptonight_aesni.h"
 #include "cryptonight.h"
+#include "cryptonight_aesni.h"
+#include "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.h"
 #include "xmrstak/misc/console.hpp"
 
-static inline void add_code(uint8_t* &p, void (*p1)(), void (*p2)())
+static inline void add_code(uint8_t*& p, void (*p1)(), void (*p2)())
 {
-    const ptrdiff_t size = reinterpret_cast<const uint8_t*>(p2) - reinterpret_cast<const uint8_t*>(p1);
-    if (size > 0) {
-        memcpy(p, reinterpret_cast<void*>(p1), size);
-        p += size;
-    }
+	const ptrdiff_t size = reinterpret_cast<const uint8_t*>(p2) - reinterpret_cast<const uint8_t*>(p1); // byte distance between the two function addresses
+	if(size > 0) // nothing is copied when p2 does not lie after p1
+	{
+		memcpy(p, reinterpret_cast<void*>(p1), size); // copy the bytes in [p1, p2) to the output cursor
+		p += size; // advance the caller's cursor past the copied bytes
+	}
 }
 
-static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, int selected_asm)
+static inline void add_random_math(uint8_t*& p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, int selected_asm)
 {
-    uint32_t prev_rot_src = (uint32_t)(-1);
-
-    for (int i = 0;; ++i) {
-        const V4_Instruction inst = code[i];
-        if (inst.opcode == RET) {
-            break;
-        }
-
-        uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
-        uint8_t dst_index = inst.dst_index;
-        uint8_t src_index = inst.src_index;
-
-        const uint32_t a = inst.dst_index;
-        const uint32_t b = inst.src_index;
-        const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
-
-        switch (inst.opcode) {
-        case ROR:
-        case ROL:
-            if (b != prev_rot_src) {
-                prev_rot_src = b;
-                add_code(p, instructions_mov[c], instructions_mov[c + 1]);
-            }
-            break;
-        }
-
-        if (a == prev_rot_src) {
-            prev_rot_src = (uint32_t)(-1);
-        }
-
-        void_func begin = instructions[c];
+	uint32_t prev_rot_src = (uint32_t)(-1); // source index last handled by the ROR/ROL branch below; (uint32_t)(-1) means none
+
+	for(int i = 0;; ++i) // NOTE(review): code_size is never read here; the loop ends only at a RET opcode
+	{
+		const V4_Instruction inst = code[i];
+		if(inst.opcode == RET)
+		{
+			break;
+		}
+
+		uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2); // MUL keeps its opcode value, every other opcode is shifted up by 2
+		uint8_t dst_index = inst.dst_index;
+		uint8_t src_index = inst.src_index;
+
+		const uint32_t a = inst.dst_index;
+		const uint32_t b = inst.src_index;
+		const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS)); // table index: opcode | dst | src bit fields, with src == 8 encoded as dst
+
+		switch(inst.opcode)
+		{
+		case ROR:
+		case ROL:
+			if(b != prev_rot_src) // emit the instructions_mov fragment only when the source index changed
+			{
+				prev_rot_src = b;
+				add_code(p, instructions_mov[c], instructions_mov[c + 1]);
+			}
+			break;
+		}
+
+		if(a == prev_rot_src) // presumably the destination gets overwritten, making the cached source stale - TODO confirm
+		{
+			prev_rot_src = (uint32_t)(-1);
+		}
+
+		void_func begin = instructions[c]; // the fragment for entry c spans [instructions[c], instructions[c + 1])
 
 		// AMD == 2
-        if ((selected_asm == 2) && (inst.opcode == MUL && !is_64_bit)) {
-            // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
-            // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
-            uint8_t* prefix = reinterpret_cast<uint8_t*>(begin);
-
-            if (*prefix == 0x49) {
-                *(p++) = 0x41;
-            }
-
-            begin = reinterpret_cast<void_func>(prefix + 1);
-        }
-
-        add_code(p, begin, instructions[c + 1]);
-
-        if (inst.opcode == ADD) {
-            *(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C;
-            if (is_64_bit) {
-                prev_rot_src = (uint32_t)(-1);
-            }
-        }
-    }
+		if((selected_asm == 2) && (inst.opcode == MUL && !is_64_bit))
+		{
+			// AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
+			// Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
+			uint8_t* prefix = reinterpret_cast<uint8_t*>(begin);
+
+			if(*prefix == 0x49)
+			{
+				*(p++) = 0x41; // write the replacement prefix byte directly to the output
+			}
+
+			begin = reinterpret_cast<void_func>(prefix + 1); // the copy below starts one byte past the fragment's first byte
+		}
+
+		add_code(p, begin, instructions[c + 1]);
+
+		if(inst.opcode == ADD)
+		{
+			*(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C; // patch inst.C into the 32-bit slot starting 4 (7 when is_64_bit) bytes before p
+			if(is_64_bit)
+			{
+				prev_rot_src = (uint32_t)(-1);
+			}
+		}
+	}
 }
 
 void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size)
@@ -84,14 +94,14 @@ void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size)
 	else
 		unprotectExecutableMemory(ctx->fun_data, allocation_size);
 
-    uint8_t* p0 = ctx->fun_data;
-    uint8_t* p = p0;
+	uint8_t* p0 = ctx->fun_data;
+	uint8_t* p = p0;
 	if(ctx->fun_data != nullptr)
 	{
 
 		if(N == 2)
 		{
-		    add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
+			add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
 			add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version);
 			add_code(p, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
 			add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version);
diff --git a/xmrstak/backend/cpu/crypto/c_blake256.c b/xmrstak/backend/cpu/crypto/c_blake256.c
index e5fadfe74..93d9cadbb 100644
--- a/xmrstak/backend/cpu/crypto/c_blake256.c
+++ b/xmrstak/backend/cpu/crypto/c_blake256.c
@@ -8,66 +8,67 @@
  * HMAC is specified by RFC 2104.
  */
 
-#include <string.h>
-#include <stdio.h>
-#include <stdint.h>
 #include "c_blake256.h"
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
 
-#define U8TO32(p) \
-	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) |    \
-	 ((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))
-#define U32TO8(p, v) \
-	(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
-	(p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );
+#define U8TO32(p)                                              \
+	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
+		((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3])))
+#define U32TO8(p, v) /* store v big-endian in p[0..3]; one expression, so safe under an unbraced if/else */ \
+	((p)[0] = (uint8_t)((v) >> 24), \
+	 (p)[1] = (uint8_t)((v) >> 16), \
+	 (p)[2] = (uint8_t)((v) >> 8),  \
+	 (p)[3] = (uint8_t)((v)))
 
 const uint8_t sigma[][16] = {
-	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15},
-	{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3},
-	{11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4},
-	{ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8},
-	{ 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13},
-	{ 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9},
-	{12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11},
-	{13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10},
-	{ 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5},
-	{10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0},
-	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15},
-	{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3},
-	{11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4},
-	{ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8}
-};
+	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+	{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+	{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+	{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}};
 
 const uint32_t cst[16] = {
 	0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344,
 	0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89,
 	0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
-	0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917
-};
+	0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917};
 
 static const uint8_t padding[] = {
-	0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-};
-
+	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
-void blake256_compress(state *S, const uint8_t *block) {
+void blake256_compress(state* S, const uint8_t* block)
+{
 	uint32_t v[16], m[16], i;
 
-#define ROT(x,n) (((x)<<(32-n))|((x)>>(n)))
-#define G(a,b,c,d,e)                                      \
-	v[a] += (m[sigma[i][e]] ^ cst[sigma[i][e+1]]) + v[b]; \
-	v[d] = ROT(v[d] ^ v[a],16);                           \
-	v[c] += v[d];                                         \
-	v[b] = ROT(v[b] ^ v[c],12);                           \
-	v[a] += (m[sigma[i][e+1]] ^ cst[sigma[i][e]])+v[b];   \
-	v[d] = ROT(v[d] ^ v[a], 8);                           \
-	v[c] += v[d];                                         \
+#define ROT(x, n) (((x) << (32 - (n))) | ((x) >> (n))) /* rotate 32-bit x right by n; n is parenthesized so expression arguments expand correctly */
+#define G(a, b, c, d, e)                                    \
+	v[a] += (m[sigma[i][e]] ^ cst[sigma[i][e + 1]]) + v[b]; \
+	v[d] = ROT(v[d] ^ v[a], 16);                            \
+	v[c] += v[d];                                           \
+	v[b] = ROT(v[b] ^ v[c], 12);                            \
+	v[a] += (m[sigma[i][e + 1]] ^ cst[sigma[i][e]]) + v[b]; \
+	v[d] = ROT(v[d] ^ v[a], 8);                             \
+	v[c] += v[d];                                           \
 	v[b] = ROT(v[b] ^ v[c], 7);
 
-	for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4);
-	for (i = 0; i < 8;  ++i) v[i] = S->h[i];
-	v[ 8] = S->s[0] ^ 0x243F6A88;
-	v[ 9] = S->s[1] ^ 0x85A308D3;
+	for(i = 0; i < 16; ++i)
+		m[i] = U8TO32(block + i * 4);
+	for(i = 0; i < 8; ++i)
+		v[i] = S->h[i];
+	v[8] = S->s[0] ^ 0x243F6A88;
+	v[9] = S->s[1] ^ 0x85A308D3;
 	v[10] = S->s[2] ^ 0x13198A2E;
 	v[11] = S->s[3] ^ 0x03707344;
 	v[12] = 0xA4093822;
@@ -75,29 +76,34 @@ void blake256_compress(state *S, const uint8_t *block) {
 	v[14] = 0x082EFA98;
 	v[15] = 0xEC4E6C89;
 
-	if (S->nullt == 0) {
+	if(S->nullt == 0)
+	{
 		v[12] ^= S->t[0];
 		v[13] ^= S->t[0];
 		v[14] ^= S->t[1];
 		v[15] ^= S->t[1];
 	}
 
-	for (i = 0; i < 14; ++i) {
-		G(0, 4,  8, 12,  0);
-		G(1, 5,  9, 13,  2);
-		G(2, 6, 10, 14,  4);
-		G(3, 7, 11, 15,  6);
-		G(3, 4,  9, 14, 14);
-		G(2, 7,  8, 13, 12);
-		G(0, 5, 10, 15,  8);
+	for(i = 0; i < 14; ++i)
+	{
+		G(0, 4, 8, 12, 0);
+		G(1, 5, 9, 13, 2);
+		G(2, 6, 10, 14, 4);
+		G(3, 7, 11, 15, 6);
+		G(3, 4, 9, 14, 14);
+		G(2, 7, 8, 13, 12);
+		G(0, 5, 10, 15, 8);
 		G(1, 6, 11, 12, 10);
 	}
 
-	for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i];
-	for (i = 0; i < 8;  ++i) S->h[i] ^= S->s[i % 4];
+	for(i = 0; i < 16; ++i)
+		S->h[i % 8] ^= v[i];
+	for(i = 0; i < 8; ++i)
+		S->h[i] ^= S->s[i % 4];
 }
 
-void blake256_init(state *S) {
+void blake256_init(state* S)
+{
 	S->h[0] = 0x6A09E667;
 	S->h[1] = 0xBB67AE85;
 	S->h[2] = 0x3C6EF372;
@@ -110,7 +116,8 @@ void blake256_init(state *S) {
 	S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0;
 }
 
-void blake224_init(state *S) {
+void blake224_init(state* S)
+{
 	S->h[0] = 0xC1059ED8;
 	S->h[1] = 0x367CD507;
 	S->h[2] = 0x3070DD17;
@@ -124,57 +131,75 @@ void blake224_init(state *S) {
 }
 
 // datalen = number of bits
-void blake256_update(state *S, const uint8_t *data, uint32_t datalen) {
+void blake256_update(state* S, const uint8_t* data, uint32_t datalen)
+{
 	int left = S->buflen >> 3;
 	int fill = 64 - left;
 
-	if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) {
-		memcpy((void *) (S->buf + left), (void *) data, fill);
+	if(left && ((datalen >> 3) >= (unsigned)fill)) // compare the full byte count: masking it with 0x3F skipped this flush and the buffered bytes were later dropped
+	{
+		memcpy((void*)(S->buf + left), (void*)data, fill); // top up the buffered partial block to 64 bytes
 		S->t[0] += 512;
-		if (S->t[0] == 0) S->t[1]++;
+		if(S->t[0] == 0) // t[0] wrapped around, carry into t[1]
+			S->t[1]++;
 		blake256_compress(S, S->buf);
 		data += fill;
 		datalen -= (fill << 3);
 		left = 0;
 	}
 
-	while (datalen >= 512) {
+	while(datalen >= 512) // compress whole 512-bit blocks straight from the input
+	{
 		S->t[0] += 512;
-		if (S->t[0] == 0) S->t[1]++;
+		if(S->t[0] == 0)
+			S->t[1]++;
 		blake256_compress(S, data);
 		data += 64;
 		datalen -= 512;
 	}
 
-	if (datalen > 0) {
-		memcpy((void *) (S->buf + left), (void *) data, datalen >> 3);
+	if(datalen > 0) // keep the remaining bits buffered for the next call
+	{
+		memcpy((void*)(S->buf + left), (void*)data, datalen >> 3);
 		S->buflen = (left << 3) + datalen;
-	} else {
+	}
+	else
+	{
 		S->buflen = 0;
 	}
 }
 
 // datalen = number of bits
-void blake224_update(state *S, const uint8_t *data, uint32_t datalen) {
+void blake224_update(state* S, const uint8_t* data, uint32_t datalen)
+{
 	blake256_update(S, data, datalen);
 }
 
-void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) {
+void blake256_final_h(state* S, uint8_t* digest, uint8_t pa, uint8_t pb)
+{
 	uint8_t msglen[8];
 	uint32_t lo = S->t[0] + S->buflen, hi = S->t[1];
-	if (lo < (unsigned) S->buflen) hi++;
+	if(lo < (unsigned)S->buflen)
+		hi++;
 	U32TO8(msglen + 0, hi);
 	U32TO8(msglen + 4, lo);
 
-	if (S->buflen == 440) { /* one padding byte */
+	if(S->buflen == 440)
+	{ /* one padding byte */
 		S->t[0] -= 8;
 		blake256_update(S, &pa, 8);
-	} else {
-		if (S->buflen < 440) { /* enough space to fill the block  */
-			if (S->buflen == 0) S->nullt = 1;
+	}
+	else
+	{
+		if(S->buflen < 440)
+		{ /* enough space to fill the block  */
+			if(S->buflen == 0)
+				S->nullt = 1;
 			S->t[0] -= 440 - S->buflen;
 			blake256_update(S, padding, 440 - S->buflen);
-		} else { /* need 2 compressions */
+		}
+		else
+		{ /* need 2 compressions */
 			S->t[0] -= 512 - S->buflen;
 			blake256_update(S, padding, 512 - S->buflen);
 			S->t[0] -= 440;
@@ -187,9 +212,9 @@ void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) {
 	S->t[0] -= 64;
 	blake256_update(S, msglen, 64);
 
-	U32TO8(digest +  0, S->h[0]);
-	U32TO8(digest +  4, S->h[1]);
-	U32TO8(digest +  8, S->h[2]);
+	U32TO8(digest + 0, S->h[0]);
+	U32TO8(digest + 4, S->h[1]);
+	U32TO8(digest + 8, S->h[2]);
 	U32TO8(digest + 12, S->h[3]);
 	U32TO8(digest + 16, S->h[4]);
 	U32TO8(digest + 20, S->h[5]);
@@ -197,16 +222,19 @@ void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) {
 	U32TO8(digest + 28, S->h[7]);
 }
 
-void blake256_final(state *S, uint8_t *digest) {
+void blake256_final(state* S, uint8_t* digest)
+{
 	blake256_final_h(S, digest, 0x81, 0x01);
 }
 
-void blake224_final(state *S, uint8_t *digest) {
+void blake224_final(state* S, uint8_t* digest)
+{
 	blake256_final_h(S, digest, 0x80, 0x00);
 }
 
 // inlen = number of bytes
-void blake256_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) {
+void blake256_hash(uint8_t* out, const uint8_t* in, uint32_t inlen)
+{
 	state S;
 	blake256_init(&S);
 	blake256_update(&S, in, inlen * 8);
@@ -214,7 +242,8 @@ void blake256_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) {
 }
 
 // inlen = number of bytes
-void blake224_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) {
+void blake224_hash(uint8_t* out, const uint8_t* in, uint32_t inlen)
+{
 	state S;
 	blake224_init(&S);
 	blake224_update(&S, in, inlen * 8);
@@ -222,13 +251,15 @@ void blake224_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) {
 }
 
 // keylen = number of bytes
-void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
-	const uint8_t *key = _key;
+void hmac_blake256_init(hmac_state* S, const uint8_t* _key, uint64_t keylen)
+{
+	const uint8_t* key = _key;
 	uint8_t keyhash[32];
 	uint8_t pad[64];
 	uint64_t i;
 
-	if (keylen > 64) {
+	if(keylen > 64)
+	{
 		blake256_hash(keyhash, key, keylen);
 		key = keyhash;
 		keylen = 32;
@@ -236,14 +267,16 @@ void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
 
 	blake256_init(&S->inner);
 	memset(pad, 0x36, 64);
-	for (i = 0; i < keylen; ++i) {
+	for(i = 0; i < keylen; ++i)
+	{
 		pad[i] ^= key[i];
 	}
 	blake256_update(&S->inner, pad, 512);
 
 	blake256_init(&S->outer);
 	memset(pad, 0x5c, 64);
-	for (i = 0; i < keylen; ++i) {
+	for(i = 0; i < keylen; ++i)
+	{
 		pad[i] ^= key[i];
 	}
 	blake256_update(&S->outer, pad, 512);
@@ -252,13 +285,15 @@ void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
 }
 
 // keylen = number of bytes
-void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
-	const uint8_t *key = _key;
+void hmac_blake224_init(hmac_state* S, const uint8_t* _key, uint64_t keylen)
+{
+	const uint8_t* key = _key;
 	uint8_t keyhash[32];
 	uint8_t pad[64];
 	uint64_t i;
 
-	if (keylen > 64) {
+	if(keylen > 64)
+	{
 		blake256_hash(keyhash, key, keylen);
 		key = keyhash;
 		keylen = 28;
@@ -266,14 +301,16 @@ void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
 
 	blake224_init(&S->inner);
 	memset(pad, 0x36, 64);
-	for (i = 0; i < keylen; ++i) {
+	for(i = 0; i < keylen; ++i)
+	{
 		pad[i] ^= key[i];
 	}
 	blake224_update(&S->inner, pad, 512);
 
 	blake224_init(&S->outer);
 	memset(pad, 0x5c, 64);
-	for (i = 0; i < keylen; ++i) {
+	for(i = 0; i < keylen; ++i)
+	{
 		pad[i] ^= key[i];
 	}
 	blake224_update(&S->outer, pad, 512);
@@ -282,18 +319,21 @@ void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
 }
 
 // datalen = number of bits
-void hmac_blake256_update(hmac_state *S, const uint8_t *data, uint32_t datalen) {
-  // update the inner state
-  blake256_update(&S->inner, data, datalen);
+void hmac_blake256_update(hmac_state* S, const uint8_t* data, uint32_t datalen)
+{
+	// update the inner state
+	blake256_update(&S->inner, data, datalen);
 }
 
 // datalen = number of bits
-void hmac_blake224_update(hmac_state *S, const uint8_t *data, uint32_t datalen) {
-  // update the inner state
-  blake224_update(&S->inner, data, datalen);
+void hmac_blake224_update(hmac_state* S, const uint8_t* data, uint32_t datalen)
+{
+	// update the inner state
+	blake224_update(&S->inner, data, datalen);
 }
 
-void hmac_blake256_final(hmac_state *S, uint8_t *digest) {
+void hmac_blake256_final(hmac_state* S, uint8_t* digest)
+{
 	uint8_t ihash[32];
 	blake256_final(&S->inner, ihash);
 	blake256_update(&S->outer, ihash, 256);
@@ -301,7 +341,8 @@ void hmac_blake256_final(hmac_state *S, uint8_t *digest) {
 	memset(ihash, 0, 32);
 }
 
-void hmac_blake224_final(hmac_state *S, uint8_t *digest) {
+void hmac_blake224_final(hmac_state* S, uint8_t* digest)
+{
 	uint8_t ihash[32];
 	blake224_final(&S->inner, ihash);
 	blake224_update(&S->outer, ihash, 224);
@@ -310,7 +351,8 @@ void hmac_blake224_final(hmac_state *S, uint8_t *digest) {
 }
 
 // keylen = number of bytes; inlen = number of bytes
-void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint32_t inlen) {
+void hmac_blake256_hash(uint8_t* out, const uint8_t* key, uint64_t keylen, const uint8_t* in, uint32_t inlen)
+{
 	hmac_state S;
 	hmac_blake256_init(&S, key, keylen);
 	hmac_blake256_update(&S, in, inlen * 8);
@@ -318,7 +360,8 @@ void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const
 }
 
 // keylen = number of bytes; inlen = number of bytes
-void hmac_blake224_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint32_t inlen) {
+void hmac_blake224_hash(uint8_t* out, const uint8_t* key, uint64_t keylen, const uint8_t* in, uint32_t inlen)
+{
 	hmac_state S;
 	hmac_blake224_init(&S, key, keylen);
 	hmac_blake224_update(&S, in, inlen * 8);
diff --git a/xmrstak/backend/cpu/crypto/c_blake256.h b/xmrstak/backend/cpu/crypto/c_blake256.h
index 06c7917af..9f63f88f4 100644
--- a/xmrstak/backend/cpu/crypto/c_blake256.h
+++ b/xmrstak/backend/cpu/crypto/c_blake256.h
@@ -3,41 +3,43 @@
 
 #include <stdint.h>
 
-typedef struct {
-  uint32_t h[8], s[4], t[2];
-  int buflen, nullt;
-  uint8_t buf[64];
+typedef struct
+{
+	uint32_t h[8], s[4], t[2]; /* h: hash state words; s: words zeroed by blake256_init and XORed in by blake256_compress; t: bit counter, t[0] low word with carry into t[1] */
+	int buflen, nullt; /* buflen: number of bits currently held in buf; nullt: nonzero makes blake256_compress skip mixing in t */
+	uint8_t buf[64]; /* bytes of a not-yet-complete 64-byte block */
 } state;
 
-typedef struct {
-  state inner;
-  state outer;
+typedef struct
+{
+	state inner; /* receives the key XORed into a pad of 0x36 bytes, then the message data */
+	state outer; /* receives the key XORed into a pad of 0x5c bytes, then the inner digest */
 } hmac_state;
 
-void blake256_init(state *);
-void blake224_init(state *);
+void blake256_init(state*);
+void blake224_init(state*);
 
-void blake256_update(state *, const uint8_t *, uint32_t);
-void blake224_update(state *, const uint8_t *, uint32_t);
+void blake256_update(state*, const uint8_t*, uint32_t);
+void blake224_update(state*, const uint8_t*, uint32_t);
 
-void blake256_final(state *, uint8_t *);
-void blake224_final(state *, uint8_t *);
+void blake256_final(state*, uint8_t*);
+void blake224_final(state*, uint8_t*);
 
-void blake256_hash(uint8_t *, const uint8_t *, uint32_t);
-void blake224_hash(uint8_t *, const uint8_t *, uint32_t);
+void blake256_hash(uint8_t*, const uint8_t*, uint32_t);
+void blake224_hash(uint8_t*, const uint8_t*, uint32_t);
 
 /* HMAC functions: */
 
-void hmac_blake256_init(hmac_state *, const uint8_t *, uint64_t);
-void hmac_blake224_init(hmac_state *, const uint8_t *, uint64_t);
+void hmac_blake256_init(hmac_state*, const uint8_t*, uint64_t);
+void hmac_blake224_init(hmac_state*, const uint8_t*, uint64_t);
 
-void hmac_blake256_update(hmac_state *, const uint8_t *, uint32_t);
-void hmac_blake224_update(hmac_state *, const uint8_t *, uint32_t);
+void hmac_blake256_update(hmac_state*, const uint8_t*, uint32_t);
+void hmac_blake224_update(hmac_state*, const uint8_t*, uint32_t);
 
-void hmac_blake256_final(hmac_state *, uint8_t *);
-void hmac_blake224_final(hmac_state *, uint8_t *);
+void hmac_blake256_final(hmac_state*, uint8_t*);
+void hmac_blake224_final(hmac_state*, uint8_t*);
 
-void hmac_blake256_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint32_t);
-void hmac_blake224_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint32_t);
+void hmac_blake256_hash(uint8_t*, const uint8_t*, uint64_t, const uint8_t*, uint32_t);
+void hmac_blake224_hash(uint8_t*, const uint8_t*, uint64_t, const uint8_t*, uint32_t);
 
 #endif /* _BLAKE256_H_ */
diff --git a/xmrstak/backend/cpu/crypto/c_groestl.c b/xmrstak/backend/cpu/crypto/c_groestl.c
index 5b3523e79..bae9a9f11 100644
--- a/xmrstak/backend/cpu/crypto/c_groestl.c
+++ b/xmrstak/backend/cpu/crypto/c_groestl.c
@@ -14,178 +14,185 @@
 #define P_TYPE 0
 #define Q_TYPE 1
 
-const uint8_t shift_Values[2][8] = {{0,1,2,3,4,5,6,7},{1,3,5,7,0,2,4,6}};
-
-const uint8_t indices_cyclic[15] = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6};
-
-
-#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
-															v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
-															v1 = temp_var;}
-
-
-#define COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t)				\
-   tu = T[2*(uint32_t)x[4*c0+0]];			    \
-   tl = T[2*(uint32_t)x[4*c0+0]+1];		    \
-   tv1 = T[2*(uint32_t)x[4*c1+1]];			\
-   tv2 = T[2*(uint32_t)x[4*c1+1]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,1,t)	\
-   tu ^= tv1;						\
-   tl ^= tv2;						\
-   tv1 = T[2*(uint32_t)x[4*c2+2]];			\
-   tv2 = T[2*(uint32_t)x[4*c2+2]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,2,t)	\
-   tu ^= tv1;						\
-   tl ^= tv2;   					\
-   tv1 = T[2*(uint32_t)x[4*c3+3]];			\
-   tv2 = T[2*(uint32_t)x[4*c3+3]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,3,t)	\
-   tu ^= tv1;						\
-   tl ^= tv2;						\
-   tl ^= T[2*(uint32_t)x[4*c4+0]];			\
-   tu ^= T[2*(uint32_t)x[4*c4+0]+1];			\
-   tv1 = T[2*(uint32_t)x[4*c5+1]];			\
-   tv2 = T[2*(uint32_t)x[4*c5+1]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,1,t)	\
-   tl ^= tv1;						\
-   tu ^= tv2;						\
-   tv1 = T[2*(uint32_t)x[4*c6+2]];			\
-   tv2 = T[2*(uint32_t)x[4*c6+2]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,2,t)	\
-   tl ^= tv1;						\
-   tu ^= tv2;   					\
-   tv1 = T[2*(uint32_t)x[4*c7+3]];			\
-   tv2 = T[2*(uint32_t)x[4*c7+3]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,3,t)	\
-   tl ^= tv1;						\
-   tu ^= tv2;						\
-   y[i] = tu;						\
-   y[i+1] = tl;
+const uint8_t shift_Values[2][8] = {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 3, 5, 7, 0, 2, 4, 6}};
 
+const uint8_t indices_cyclic[15] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6};
+
+#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var)                        \
+	{                                                                             \
+		temp_var = (v1 << (8 * amount_bytes)) | (v2 >> (8 * (4 - amount_bytes))); \
+		v2 = (v2 << (8 * amount_bytes)) | (v1 >> (8 * (4 - amount_bytes)));       \
+		v1 = temp_var;                                                            \
+	} /* With 32-bit v1/v2: rotates the 64-bit value v1:v2 left by amount_bytes bytes, temp_var is scratch. amount_bytes must be 1..3 (0 or 4 would shift a 32-bit word by 32 bits). */
+
+#define COLUMN(x, y, i, c0, c1, c2, c3, c4, c5, c6, c7, tv1, tv2, tu, tl, t) \
+	tu = T[2 * (uint32_t)x[4 * c0 + 0]];                                     \
+	tl = T[2 * (uint32_t)x[4 * c0 + 0] + 1];                                 \
+	tv1 = T[2 * (uint32_t)x[4 * c1 + 1]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c1 + 1] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 1, t)                                       \
+	tu ^= tv1;                                                               \
+	tl ^= tv2;                                                               \
+	tv1 = T[2 * (uint32_t)x[4 * c2 + 2]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c2 + 2] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 2, t)                                       \
+	tu ^= tv1;                                                               \
+	tl ^= tv2;                                                               \
+	tv1 = T[2 * (uint32_t)x[4 * c3 + 3]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c3 + 3] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 3, t)                                       \
+	tu ^= tv1;                                                               \
+	tl ^= tv2;                                                               \
+	tl ^= T[2 * (uint32_t)x[4 * c4 + 0]];                                    \
+	tu ^= T[2 * (uint32_t)x[4 * c4 + 0] + 1];                                \
+	tv1 = T[2 * (uint32_t)x[4 * c5 + 1]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c5 + 1] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 1, t)                                       \
+	tl ^= tv1;                                                               \
+	tu ^= tv2;                                                               \
+	tv1 = T[2 * (uint32_t)x[4 * c6 + 2]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c6 + 2] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 2, t)                                       \
+	tl ^= tv1;                                                               \
+	tu ^= tv2;                                                               \
+	tv1 = T[2 * (uint32_t)x[4 * c7 + 3]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c7 + 3] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 3, t)                                       \
+	tl ^= tv1;                                                               \
+	tu ^= tv2;                                                               \
+	y[i] = tu;                                                               \
+	y[i + 1] = tl; /* Stores into y[i], y[i + 1] the XOR of eight pairs from table T (defined elsewhere), indexed by the bytes x[4 * cK + (K % 4)] and rotated by 0..3 bytes. The c4..c7 pairs are XORed in with the halves exchanged (even T entry into tl, odd into tu). */
 
 /* compute one round of P (short variants) */
-static void RND512P(uint8_t *x, uint32_t *y, uint32_t r) {
-  uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
-  uint32_t* x32 = (uint32_t*)x;
-  x32[ 0] ^= 0x00000000^r;
-  x32[ 2] ^= 0x00000010^r;
-  x32[ 4] ^= 0x00000020^r;
-  x32[ 6] ^= 0x00000030^r;
-  x32[ 8] ^= 0x00000040^r;
-  x32[10] ^= 0x00000050^r;
-  x32[12] ^= 0x00000060^r;
-  x32[14] ^= 0x00000070^r;
-  COLUMN(x,y, 0,  0,  2,  4,  6,  9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 2,  2,  4,  6,  8, 11, 13, 15,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 4,  4,  6,  8, 10, 13, 15,  1,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 6,  6,  8, 10, 12, 15,  1,  3,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 8,  8, 10, 12, 14,  1,  3,  5,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,10, 10, 12, 14,  0,  3,  5,  7,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,12, 12, 14,  0,  2,  5,  7,  9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,14, 14,  0,  2,  4,  7,  9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+static void RND512P(uint8_t* x, uint32_t* y, uint32_t r) /* one P round: x is modified in place, the result is written to y[0..15]; r is XORed into the constants (F512 passes 0..9) */
+{
+	uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; /* scratch variables handed to COLUMN */
+	uint32_t* x32 = (uint32_t*)x; /* word view of x, used only for the constant XORs below */
+	x32[0] ^= 0x00000000 ^ r; /* even word 2*j gets ((j << 4) ^ r) XORed in, j = 0..7 */
+	x32[2] ^= 0x00000010 ^ r;
+	x32[4] ^= 0x00000020 ^ r;
+	x32[6] ^= 0x00000030 ^ r;
+	x32[8] ^= 0x00000040 ^ r;
+	x32[10] ^= 0x00000050 ^ r;
+	x32[12] ^= 0x00000060 ^ r;
+	x32[14] ^= 0x00000070 ^ r;
+	COLUMN(x, y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); /* third argument is i: each call stores y[i] and y[i + 1] */
+	COLUMN(x, y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
 }
 
 /* compute one round of Q (short variants) */
-static void RND512Q(uint8_t *x, uint32_t *y, uint32_t r) {
-  uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
-  uint32_t* x32 = (uint32_t*)x;
-  x32[ 0] = ~x32[ 0];
-  x32[ 1] ^= 0xffffffff^r;
-  x32[ 2] = ~x32[ 2];
-  x32[ 3] ^= 0xefffffff^r;
-  x32[ 4] = ~x32[ 4];
-  x32[ 5] ^= 0xdfffffff^r;
-  x32[ 6] = ~x32[ 6];
-  x32[ 7] ^= 0xcfffffff^r;
-  x32[ 8] = ~x32[ 8];
-  x32[ 9] ^= 0xbfffffff^r;
-  x32[10] = ~x32[10];
-  x32[11] ^= 0xafffffff^r;
-  x32[12] = ~x32[12];
-  x32[13] ^= 0x9fffffff^r;
-  x32[14] = ~x32[14];
-  x32[15] ^= 0x8fffffff^r;
-  COLUMN(x,y, 0,  2,  6, 10, 14,  1,  5,  9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 2,  4,  8, 12,  0,  3,  7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 4,  6, 10, 14,  2,  5,  9, 13,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 6,  8, 12,  0,  4,  7, 11, 15,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 8, 10, 14,  2,  6,  9, 13,  1,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,10, 12,  0,  4,  8, 11, 15,  3,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,12, 14,  2,  6, 10, 13,  1,  5,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,14,  0,  4,  8, 12, 15,  3,  7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+static void RND512Q(uint8_t* x, uint32_t* y, uint32_t r) /* one Q round: x is modified in place, the result is written to y[0..15]; r is XORed into the constants (F512 passes 0x00000000..0x09000000) */
+{
+	uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; /* scratch variables handed to COLUMN */
+	uint32_t* x32 = (uint32_t*)x; /* word view of x, used only for the constant step below */
+	x32[0] = ~x32[0]; /* every even word is complemented */
+	x32[1] ^= 0xffffffff ^ r; /* odd word 2*j+1 gets (~(j << 28) ^ r) XORed in, j = 0..7 */
+	x32[2] = ~x32[2];
+	x32[3] ^= 0xefffffff ^ r;
+	x32[4] = ~x32[4];
+	x32[5] ^= 0xdfffffff ^ r;
+	x32[6] = ~x32[6];
+	x32[7] ^= 0xcfffffff ^ r;
+	x32[8] = ~x32[8];
+	x32[9] ^= 0xbfffffff ^ r;
+	x32[10] = ~x32[10];
+	x32[11] ^= 0xafffffff ^ r;
+	x32[12] = ~x32[12];
+	x32[13] ^= 0x9fffffff ^ r;
+	x32[14] = ~x32[14];
+	x32[15] ^= 0x8fffffff ^ r;
+	COLUMN(x, y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); /* third argument is i: each call stores y[i] and y[i + 1] */
+	COLUMN(x, y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
 }
 
 /* compute compression function (short variants) */
-static void F512(uint32_t *h, const uint32_t *m) {
-  int i;
-  uint32_t Ptmp[2*COLS512];
-  uint32_t Qtmp[2*COLS512];
-  uint32_t y[2*COLS512];
-  uint32_t z[2*COLS512];
-
-  for (i = 0; i < 2*COLS512; i++) {
-	z[i] = m[i];
-	Ptmp[i] = h[i]^m[i];
-  }
-
-  /* compute Q(m) */
-  RND512Q((uint8_t*)z, y, 0x00000000);
-  RND512Q((uint8_t*)y, z, 0x01000000);
-  RND512Q((uint8_t*)z, y, 0x02000000);
-  RND512Q((uint8_t*)y, z, 0x03000000);
-  RND512Q((uint8_t*)z, y, 0x04000000);
-  RND512Q((uint8_t*)y, z, 0x05000000);
-  RND512Q((uint8_t*)z, y, 0x06000000);
-  RND512Q((uint8_t*)y, z, 0x07000000);
-  RND512Q((uint8_t*)z, y, 0x08000000);
-  RND512Q((uint8_t*)y, Qtmp, 0x09000000);
-
-  /* compute P(h+m) */
-  RND512P((uint8_t*)Ptmp, y, 0x00000000);
-  RND512P((uint8_t*)y, z, 0x00000001);
-  RND512P((uint8_t*)z, y, 0x00000002);
-  RND512P((uint8_t*)y, z, 0x00000003);
-  RND512P((uint8_t*)z, y, 0x00000004);
-  RND512P((uint8_t*)y, z, 0x00000005);
-  RND512P((uint8_t*)z, y, 0x00000006);
-  RND512P((uint8_t*)y, z, 0x00000007);
-  RND512P((uint8_t*)z, y, 0x00000008);
-  RND512P((uint8_t*)y, Ptmp, 0x00000009);
-
-  /* compute P(h+m) + Q(m) + h */
-  for (i = 0; i < 2*COLS512; i++) {
-	h[i] ^= Ptmp[i]^Qtmp[i];
-  }
-}
+static void F512(uint32_t* h, const uint32_t* m) /* h: 2 * COLS512 chaining words, updated in place; m: 2 * COLS512 message words, read only */
+{
+	int i;
+	uint32_t Ptmp[2 * COLS512]; /* starts as h ^ m, ends as the output of the P rounds */
+	uint32_t Qtmp[2 * COLS512]; /* receives the output of the Q rounds */
+	uint32_t y[2 * COLS512]; /* y and z alternate as input and output of successive rounds */
+	uint32_t z[2 * COLS512];
+
+	for(i = 0; i < 2 * COLS512; i++)
+	{
+		z[i] = m[i]; /* private copy: the round functions modify their input buffer */
+		Ptmp[i] = h[i] ^ m[i];
+	}
 
+	/* compute Q(m) */
+	RND512Q((uint8_t*)z, y, 0x00000000); /* ten rounds; the round index sits in the top byte of r */
+	RND512Q((uint8_t*)y, z, 0x01000000);
+	RND512Q((uint8_t*)z, y, 0x02000000);
+	RND512Q((uint8_t*)y, z, 0x03000000);
+	RND512Q((uint8_t*)z, y, 0x04000000);
+	RND512Q((uint8_t*)y, z, 0x05000000);
+	RND512Q((uint8_t*)z, y, 0x06000000);
+	RND512Q((uint8_t*)y, z, 0x07000000);
+	RND512Q((uint8_t*)z, y, 0x08000000);
+	RND512Q((uint8_t*)y, Qtmp, 0x09000000); /* last round writes straight into Qtmp */
+
+	/* compute P(h+m) */
+	RND512P((uint8_t*)Ptmp, y, 0x00000000); /* ten rounds; the round index sits in the low byte of r */
+	RND512P((uint8_t*)y, z, 0x00000001);
+	RND512P((uint8_t*)z, y, 0x00000002);
+	RND512P((uint8_t*)y, z, 0x00000003);
+	RND512P((uint8_t*)z, y, 0x00000004);
+	RND512P((uint8_t*)y, z, 0x00000005);
+	RND512P((uint8_t*)z, y, 0x00000006);
+	RND512P((uint8_t*)y, z, 0x00000007);
+	RND512P((uint8_t*)z, y, 0x00000008);
+	RND512P((uint8_t*)y, Ptmp, 0x00000009); /* last round writes back into Ptmp */
+
+	/* compute P(h+m) + Q(m) + h */
+	for(i = 0; i < 2 * COLS512; i++)
+	{
+		h[i] ^= Ptmp[i] ^ Qtmp[i]; /* "+" in the comments above is XOR */
+	}
+}
 
 /* digest up to msglen bytes of input (full blocks only) */
-static void Transform(groestlHashState *ctx,
-	       const uint8_t *input,
-	       int msglen) {
+static void Transform(groestlHashState* ctx,
+	const uint8_t* input,
+	int msglen) /* bytes available at input; a trailing partial block (< SIZE512 bytes) is left untouched */
+{
 
-  /* digest message, one block at a time */
-  for (; msglen >= SIZE512;
-	   msglen -= SIZE512, input += SIZE512) {
-	F512(ctx->chaining,(uint32_t*)input);
+	/* digest message, one block at a time */
+	for(; msglen >= SIZE512;
+		msglen -= SIZE512, input += SIZE512)
+	{
+		F512(ctx->chaining, (uint32_t*)input); /* NOTE(review): input is read through a uint32_t*; presumably callers pass suitably aligned data - confirm */
 
-	/* increment block counter */
-	ctx->block_counter1++;
-	if (ctx->block_counter1 == 0) ctx->block_counter2++;
-  }
+		/* increment block counter */
+		ctx->block_counter1++;
+		if(ctx->block_counter1 == 0) /* counter1 wrapped around: carry into counter2 */
+			ctx->block_counter2++;
+	}
 }
 
 /* given state h, do h <- P(h)+h */
-static void OutputTransformation(groestlHashState *ctx) {
-  int j;
-  uint32_t temp[2*COLS512];
-  uint32_t y[2*COLS512];
-  uint32_t z[2*COLS512];
-
-
-
-	for (j = 0; j < 2*COLS512; j++) {
-	  temp[j] = ctx->chaining[j];
+static void OutputTransformation(groestlHashState* ctx)
+{
+	int j;
+	uint32_t temp[2 * COLS512];
+	uint32_t y[2 * COLS512];
+	uint32_t z[2 * COLS512];
+
+	for(j = 0; j < 2 * COLS512; j++)
+	{
+		temp[j] = ctx->chaining[j];
 	}
 	RND512P((uint8_t*)temp, y, 0x00000000);
 	RND512P((uint8_t*)y, z, 0x00000001);
@@ -197,75 +204,84 @@ static void OutputTransformation(groestlHashState *ctx) {
 	RND512P((uint8_t*)y, z, 0x00000007);
 	RND512P((uint8_t*)z, y, 0x00000008);
 	RND512P((uint8_t*)y, temp, 0x00000009);
-	for (j = 0; j < 2*COLS512; j++) {
-	  ctx->chaining[j] ^= temp[j];
+	for(j = 0; j < 2 * COLS512; j++)
+	{
+		ctx->chaining[j] ^= temp[j];
 	}
 }
 
 /* initialise context */
-static void Init(groestlHashState* ctx) {
-  int i = 0;
-  /* allocate memory for state and data buffer */
-
-  for(;i<(SIZE512/sizeof(uint32_t));i++)
-  {
-	ctx->chaining[i] = 0;
-  }
-
-  /* set initial value */
-  ctx->chaining[2*COLS512-1] = u32BIG((uint32_t)HASH_BIT_LEN);
-
-  /* set other variables */
-  ctx->buf_ptr = 0;
-  ctx->block_counter1 = 0;
-  ctx->block_counter2 = 0;
-  ctx->bits_in_last_byte = 0;
+static void Init(groestlHashState* ctx) /* resets ctx to the state of a fresh hash computation */
+{
+	int i = 0;
+	/* clear the chaining state (nothing is allocated here) */
+
+	for(; i < (SIZE512 / sizeof(uint32_t)); i++)
+	{
+		ctx->chaining[i] = 0;
+	}
+
+	/* set initial value */
+	ctx->chaining[2 * COLS512 - 1] = u32BIG((uint32_t)HASH_BIT_LEN); /* only the last word is non-zero: HASH_BIT_LEN passed through u32BIG() */
+
+	/* set other variables */
+	ctx->buf_ptr = 0;
+	ctx->block_counter1 = 0;
+	ctx->block_counter2 = 0;
+	ctx->bits_in_last_byte = 0;
 }
 
 /* update state with databitlen bits of input */
 static void Update(groestlHashState* ctx,
-		  const BitSequence* input,
-		  DataLength databitlen) {
-  int index = 0;
-  int msglen = (int)(databitlen/8);
-  int rem = (int)(databitlen%8);
+	const BitSequence* input,
+	DataLength databitlen) /* length of input in bits */
+{
+	int index = 0; /* next unread byte of input */
+	int msglen = (int)(databitlen / 8); /* whole bytes; NOTE(review): narrowed to int - assumes the byte count fits, confirm for very large inputs */
+	int rem = (int)(databitlen % 8); /* leftover bits (0..7) after the last whole byte */
 
-  /* if the buffer contains data that has not yet been digested, first
+	/* if the buffer contains data that has not yet been digested, first
 	 add data to buffer until full */
-  if (ctx->buf_ptr) {
-	while (ctx->buf_ptr < SIZE512 && index < msglen) {
-	  ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
-	}
-	if (ctx->buf_ptr < SIZE512) {
-	  /* buffer still not full, return */
-	  if (rem) {
-	ctx->bits_in_last_byte = rem;
-	ctx->buffer[(int)ctx->buf_ptr++] = input[index];
-	  }
-	  return;
+	if(ctx->buf_ptr)
+	{
+		while(ctx->buf_ptr < SIZE512 && index < msglen)
+		{
+			ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+		}
+		if(ctx->buf_ptr < SIZE512)
+		{
+			/* buffer still not full, return */
+			if(rem)
+			{
+				ctx->bits_in_last_byte = rem;
+				ctx->buffer[(int)ctx->buf_ptr++] = input[index]; /* partial byte is stored as-is; Final() masks off the unused bits */
+			}
+			return;
+		}
+
+		/* digest buffer */
+		ctx->buf_ptr = 0;
+		Transform(ctx, ctx->buffer, SIZE512);
 	}
 
-	/* digest buffer */
-	ctx->buf_ptr = 0;
-	Transform(ctx, ctx->buffer, SIZE512);
-  }
+	/* digest bulk of message */
+	Transform(ctx, input + index, msglen - index); /* consumes whole SIZE512-byte blocks only */
+	index += ((msglen - index) / SIZE512) * SIZE512; /* skip the bytes Transform() just digested */
 
-  /* digest bulk of message */
-  Transform(ctx, input+index, msglen-index);
-  index += ((msglen-index)/SIZE512)*SIZE512;
-
-  /* store remaining data in buffer */
-  while (index < msglen) {
-	ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
-  }
+	/* store remaining data in buffer */
+	while(index < msglen)
+	{
+		ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+	}
 
-  /* if non-integral number of bytes have been supplied, store
+	/* if non-integral number of bytes have been supplied, store
 	 remaining bits in last byte, together with information about
 	 number of bits */
-  if (rem) {
-	ctx->bits_in_last_byte = rem;
-	ctx->buffer[(int)ctx->buf_ptr++] = input[index];
-  }
+	if(rem)
+	{
+		ctx->bits_in_last_byte = rem;
+		ctx->buffer[(int)ctx->buf_ptr++] = input[index]; /* partial byte is stored as-is; Final() masks off the unused bits */
+	}
 }
 
 #define BILB ctx->bits_in_last_byte
@@ -273,80 +289,92 @@ static void Update(groestlHashState* ctx,
 /* finalise: process remaining data (including padding), perform
    output transformation, and write hash result to 'output' */
 static void Final(groestlHashState* ctx,
-		 BitSequence* output) {
-  int i, j = 0, hashbytelen = HASH_BIT_LEN/8;
-  uint8_t *s = (BitSequence*)ctx->chaining;
-
-  /* pad with '1'-bit and first few '0'-bits */
-  if (BILB) {
-	ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
-	ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
-	BILB = 0;
-  }
-  else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
-
-  /* pad with '0'-bits */
-  if (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) {
-	/* padding requires two blocks */
-	while (ctx->buf_ptr < SIZE512) {
-	  ctx->buffer[(int)ctx->buf_ptr++] = 0;
+	BitSequence* output) /* receives HASH_BIT_LEN / 8 digest bytes */
+{
+	int i, j = 0, hashbytelen = HASH_BIT_LEN / 8;
+	uint8_t* s = (BitSequence*)ctx->chaining; /* byte view of the chaining state */
+
+	/* pad with '1'-bit and first few '0'-bits */
+	if(BILB)
+	{
+		ctx->buffer[(int)ctx->buf_ptr - 1] &= ((1 << BILB) - 1) << (8 - BILB); /* keep only the BILB most significant bits of the partial byte */
+		ctx->buffer[(int)ctx->buf_ptr - 1] ^= 0x1 << (7 - BILB); /* set the bit right after them as the '1' padding bit */
+		BILB = 0;
+	}
+	else
+		ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
+
+	/* pad with '0'-bits */
+	if(ctx->buf_ptr > SIZE512 - LENGTHFIELDLEN)
+	{
+		/* padding requires two blocks */
+		while(ctx->buf_ptr < SIZE512)
+		{
+			ctx->buffer[(int)ctx->buf_ptr++] = 0;
+		}
+		/* digest first padding block */
+		Transform(ctx, ctx->buffer, SIZE512);
+		ctx->buf_ptr = 0;
 	}
-	/* digest first padding block */
+	while(ctx->buf_ptr < SIZE512 - LENGTHFIELDLEN)
+	{
+		ctx->buffer[(int)ctx->buf_ptr++] = 0;
+	}
+
+	/* length padding */
+	ctx->block_counter1++; /* count the final padding block before the counter is encoded */
+	if(ctx->block_counter1 == 0)
+		ctx->block_counter2++;
+	ctx->buf_ptr = SIZE512;
+
+	while(ctx->buf_ptr > SIZE512 - (int)sizeof(uint32_t))
+	{
+		ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1; /* low byte first while walking backwards: counter1 ends up big-endian in the last 4 bytes */
+		ctx->block_counter1 >>= 8;
+	}
+	while(ctx->buf_ptr > SIZE512 - LENGTHFIELDLEN)
+	{
+		ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2; /* counter2 fills the remaining bytes of the length field the same way */
+		ctx->block_counter2 >>= 8;
+	}
+	/* digest final padding block */
 	Transform(ctx, ctx->buffer, SIZE512);
-	ctx->buf_ptr = 0;
-  }
-  while (ctx->buf_ptr < SIZE512-LENGTHFIELDLEN) {
-	ctx->buffer[(int)ctx->buf_ptr++] = 0;
-  }
-
-  /* length padding */
-  ctx->block_counter1++;
-  if (ctx->block_counter1 == 0) ctx->block_counter2++;
-  ctx->buf_ptr = SIZE512;
-
-  while (ctx->buf_ptr > SIZE512-(int)sizeof(uint32_t)) {
-	ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1;
-	ctx->block_counter1 >>= 8;
-  }
-  while (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) {
-	ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2;
-	ctx->block_counter2 >>= 8;
-  }
-  /* digest final padding block */
-  Transform(ctx, ctx->buffer, SIZE512);
-  /* perform output transformation */
-  OutputTransformation(ctx);
-
-  /* store hash result in output */
-  for (i = SIZE512-hashbytelen; i < SIZE512; i++,j++) {
-	output[j] = s[i];
-  }
-
-  /* zeroise relevant variables and deallocate memory */
-  for (i = 0; i < COLS512; i++) {
-	ctx->chaining[i] = 0;
-  }
-  for (i = 0; i < SIZE512; i++) {
-	ctx->buffer[i] = 0;
-  }
+	/* perform output transformation */
+	OutputTransformation(ctx);
+
+	/* store hash result in output */
+	for(i = SIZE512 - hashbytelen; i < SIZE512; i++, j++) /* the digest is the last hashbytelen bytes of the state */
+	{
+		output[j] = s[i];
+	}
+
+	/* zeroise the whole chaining state (2 * COLS512 words) and the data buffer; no memory is freed here */
+	for(i = 0; i < 2 * COLS512; i++)
+	{
+		ctx->chaining[i] = 0;
+	}
+	for(i = 0; i < SIZE512; i++)
+	{
+		ctx->buffer[i] = 0;
+	}
 }
 
 /* hash bit sequence */
 void groestl(const BitSequence* data,
-		DataLength databitlen,
-		BitSequence* hashval) {
+	DataLength databitlen, /* length of data in bits */
+	BitSequence* hashval) /* output buffer for HASH_BIT_LEN / 8 digest bytes */
+{
 
-  groestlHashState context;
+	groestlHashState context; /* local context, used for this one call only */
 
-  /* initialise */
+	/* initialise */
 	Init(&context);
 
+	/* process message */
+	Update(&context, data, databitlen);
 
-  /* process message */
-  Update(&context, data, databitlen);
-
-  /* finalise */
-  Final(&context, hashval);
+	/* finalise */
+	Final(&context, hashval); /* pads, writes the digest to hashval and clears the context buffers */
 }
 /*
 static int crypto_hash(unsigned char *out,
diff --git a/xmrstak/backend/cpu/crypto/c_groestl.h b/xmrstak/backend/cpu/crypto/c_groestl.h
index 47044b462..5322a2e2e 100644
--- a/xmrstak/backend/cpu/crypto/c_groestl.h
+++ b/xmrstak/backend/cpu/crypto/c_groestl.h
@@ -1,10 +1,10 @@
 #ifndef __hash_h
 #define __hash_h
 /*
-#include "crypto_uint8.h"
+#include "crypto_hash.h"
 #include "crypto_uint32.h"
 #include "crypto_uint64.h"
-#include "crypto_hash.h"
+#include "crypto_uint8.h"
 
 typedef crypto_uint8 uint8_t;
 typedef crypto_uint32 uint32_t;
@@ -19,29 +19,28 @@ typedef crypto_uint64 uint64_t;
 #define LENGTHFIELDLEN ROWS
 #define COLS512 8
 
-#define SIZE512 (ROWS*COLS512)
+#define SIZE512 (ROWS * COLS512)
 
 #define ROUNDS512 10
 #define HASH_BIT_LEN 256
 
-#define ROTL32(v, n) ((((v)<<(n))|((v)>>(32-(n))))&li_32(ffffffff))
-
+#define ROTL32(v, n) ((((v) << (n)) | ((v) >> (32 - (n)))) & li_32(ffffffff)) /* rotate v left by n bits, result masked to 32 bits; n should be 1..31 for a 32-bit v */
 
 #define li_32(h) 0x##h##u
-#define EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n)))
-#define u32BIG(a)				\
-  ((ROTL32(a,8) & li_32(00FF00FF)) |		\
-   (ROTL32(a,24) & li_32(FF00FF00)))
-
+#define EXT_BYTE(var, n) ((uint8_t)((uint32_t)(var) >> (8 * n))) /* byte n of var, 0 = least significant; NOTE(review): n is not parenthesised, so pass a simple expression */
+#define u32BIG(a)                       \
+	((ROTL32(a, 8) & li_32(00FF00FF)) | \
+		(ROTL32(a, 24) & li_32(FF00FF00))) /* reverses the byte order of the 32-bit value a */
 
 /* NIST API begin */
-typedef struct {
-  uint32_t chaining[SIZE512/sizeof(uint32_t)];            /* actual state */
-  uint32_t block_counter1,
-  block_counter2;         /* message block counter(s) */
-  BitSequence buffer[SIZE512];      /* data buffer */
-  int buf_ptr;              /* data buffer pointer */
-  int bits_in_last_byte;    /* no. of message bits in last byte of
+typedef struct
+{
+	uint32_t chaining[SIZE512 / sizeof(uint32_t)]; /* chaining value: the actual hash state */
+	uint32_t block_counter1,
+		block_counter2;			 /* message block counter: counter1 is the low word, counter2 takes the carry */
+	BitSequence buffer[SIZE512]; /* data buffer for a not yet digested partial block */
+	int buf_ptr;				 /* number of bytes currently in buffer (index of the next free byte) */
+	int bits_in_last_byte;		 /* no. of message bits in last byte of
                                 data buffer */
 } groestlHashState;
 
diff --git a/xmrstak/backend/cpu/crypto/c_jh.c b/xmrstak/backend/cpu/crypto/c_jh.c
index 0256a0fa2..e50886dee 100644
--- a/xmrstak/backend/cpu/crypto/c_jh.c
+++ b/xmrstak/backend/cpu/crypto/c_jh.c
@@ -23,345 +23,400 @@ typedef uint64_t uint64;
 
 /*define data alignment for different C compilers*/
 #if defined(__GNUC__)
-	  #define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
+#define DATA_ALIGN16(x) x __attribute__((aligned(16)))
 #else
-	  #define DATA_ALIGN16(x) __declspec(align(16)) x
+#define DATA_ALIGN16(x) __declspec(align(16)) x
 #endif
 
-
-typedef struct {
-	int hashbitlen;	   	              /*the message digest size*/
-	unsigned long long databitlen;    /*the message size in bits*/
-	unsigned long long datasize_in_buffer;      /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/
-	DATA_ALIGN16(uint64 x[8][2]);     /*the 1024-bit state, ( x[i][0] || x[i][1] ) is the ith row of the state in the pseudocode*/
-	unsigned char buffer[64];         /*the 512-bit message block to be hashed;*/
+typedef struct
+{
+	int hashbitlen;						   /*the message digest size*/
+	unsigned long long databitlen;		   /*the message size in bits*/
+	unsigned long long datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/
+	DATA_ALIGN16(uint64 x[8][2]);		   /*the 1024-bit state, ( x[i][0] || x[i][1] ) is the ith row of the state in the pseudocode*/
+	unsigned char buffer[64];			   /*the 512-bit message block to be hashed;*/
 } hashState;
 
-
 /*The initial hash value H(0)*/
-const unsigned char JH224_H0[128]={0x2d,0xfe,0xdd,0x62,0xf9,0x9a,0x98,0xac,0xae,0x7c,0xac,0xd6,0x19,0xd6,0x34,0xe7,0xa4,0x83,0x10,0x5,0xbc,0x30,0x12,0x16,0xb8,0x60,0x38,0xc6,0xc9,0x66,0x14,0x94,0x66,0xd9,0x89,0x9f,0x25,0x80,0x70,0x6f,0xce,0x9e,0xa3,0x1b,0x1d,0x9b,0x1a,0xdc,0x11,0xe8,0x32,0x5f,0x7b,0x36,0x6e,0x10,0xf9,0x94,0x85,0x7f,0x2,0xfa,0x6,0xc1,0x1b,0x4f,0x1b,0x5c,0xd8,0xc8,0x40,0xb3,0x97,0xf6,0xa1,0x7f,0x6e,0x73,0x80,0x99,0xdc,0xdf,0x93,0xa5,0xad,0xea,0xa3,0xd3,0xa4,0x31,0xe8,0xde,0xc9,0x53,0x9a,0x68,0x22,0xb4,0xa9,0x8a,0xec,0x86,0xa1,0xe4,0xd5,0x74,0xac,0x95,0x9c,0xe5,0x6c,0xf0,0x15,0x96,0xd,0xea,0xb5,0xab,0x2b,0xbf,0x96,0x11,0xdc,0xf0,0xdd,0x64,0xea,0x6e};
-const unsigned char JH256_H0[128]={0xeb,0x98,0xa3,0x41,0x2c,0x20,0xd3,0xeb,0x92,0xcd,0xbe,0x7b,0x9c,0xb2,0x45,0xc1,0x1c,0x93,0x51,0x91,0x60,0xd4,0xc7,0xfa,0x26,0x0,0x82,0xd6,0x7e,0x50,0x8a,0x3,0xa4,0x23,0x9e,0x26,0x77,0x26,0xb9,0x45,0xe0,0xfb,0x1a,0x48,0xd4,0x1a,0x94,0x77,0xcd,0xb5,0xab,0x26,0x2,0x6b,0x17,0x7a,0x56,0xf0,0x24,0x42,0xf,0xff,0x2f,0xa8,0x71,0xa3,0x96,0x89,0x7f,0x2e,0x4d,0x75,0x1d,0x14,0x49,0x8,0xf7,0x7d,0xe2,0x62,0x27,0x76,0x95,0xf7,0x76,0x24,0x8f,0x94,0x87,0xd5,0xb6,0x57,0x47,0x80,0x29,0x6c,0x5c,0x5e,0x27,0x2d,0xac,0x8e,0xd,0x6c,0x51,0x84,0x50,0xc6,0x57,0x5,0x7a,0xf,0x7b,0xe4,0xd3,0x67,0x70,0x24,0x12,0xea,0x89,0xe3,0xab,0x13,0xd3,0x1c,0xd7,0x69};
-const unsigned char JH384_H0[128]={0x48,0x1e,0x3b,0xc6,0xd8,0x13,0x39,0x8a,0x6d,0x3b,0x5e,0x89,0x4a,0xde,0x87,0x9b,0x63,0xfa,0xea,0x68,0xd4,0x80,0xad,0x2e,0x33,0x2c,0xcb,0x21,0x48,0xf,0x82,0x67,0x98,0xae,0xc8,0x4d,0x90,0x82,0xb9,0x28,0xd4,0x55,0xea,0x30,0x41,0x11,0x42,0x49,0x36,0xf5,0x55,0xb2,0x92,0x48,0x47,0xec,0xc7,0x25,0xa,0x93,0xba,0xf4,0x3c,0xe1,0x56,0x9b,0x7f,0x8a,0x27,0xdb,0x45,0x4c,0x9e,0xfc,0xbd,0x49,0x63,0x97,0xaf,0xe,0x58,0x9f,0xc2,0x7d,0x26,0xaa,0x80,0xcd,0x80,0xc0,0x8b,0x8c,0x9d,0xeb,0x2e,0xda,0x8a,0x79,0x81,0xe8,0xf8,0xd5,0x37,0x3a,0xf4,0x39,0x67,0xad,0xdd,0xd1,0x7a,0x71,0xa9,0xb4,0xd3,0xbd,0xa4,0x75,0xd3,0x94,0x97,0x6c,0x3f,0xba,0x98,0x42,0x73,0x7f};
-const unsigned char JH512_H0[128]={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b};
+const unsigned char JH224_H0[128] = {0x2d, 0xfe, 0xdd, 0x62, 0xf9, 0x9a, 0x98, 0xac, 0xae, 0x7c, 0xac, 0xd6, 0x19, 0xd6, 0x34, 0xe7, 0xa4, 0x83, 0x10, 0x5, 0xbc, 0x30, 0x12, 0x16, 0xb8, 0x60, 0x38, 0xc6, 0xc9, 0x66, 0x14, 0x94, 0x66, 0xd9, 0x89, 0x9f, 0x25, 0x80, 0x70, 0x6f, 0xce, 0x9e, 0xa3, 0x1b, 0x1d, 0x9b, 0x1a, 0xdc, 0x11, 0xe8, 0x32, 0x5f, 0x7b, 0x36, 0x6e, 0x10, 0xf9, 0x94, 0x85, 0x7f, 0x2, 0xfa, 0x6, 0xc1, 0x1b, 0x4f, 0x1b, 0x5c, 0xd8, 0xc8, 0x40, 0xb3, 0x97, 0xf6, 0xa1, 0x7f, 0x6e, 0x73, 0x80, 0x99, 0xdc, 0xdf, 0x93, 0xa5, 0xad, 0xea, 0xa3, 0xd3, 0xa4, 0x31, 0xe8, 0xde, 0xc9, 0x53, 0x9a, 0x68, 0x22, 0xb4, 0xa9, 0x8a, 0xec, 0x86, 0xa1, 0xe4, 0xd5, 0x74, 0xac, 0x95, 0x9c, 0xe5, 0x6c, 0xf0, 0x15, 0x96, 0xd, 0xea, 0xb5, 0xab, 0x2b, 0xbf, 0x96, 0x11, 0xdc, 0xf0, 0xdd, 0x64, 0xea, 0x6e};
+const unsigned char JH256_H0[128] = {0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1, 0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3, 0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77, 0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8, 0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62, 0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c, 0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf, 0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69};
+const unsigned char JH384_H0[128] = {0x48, 0x1e, 0x3b, 0xc6, 0xd8, 0x13, 0x39, 0x8a, 0x6d, 0x3b, 0x5e, 0x89, 0x4a, 0xde, 0x87, 0x9b, 0x63, 0xfa, 0xea, 0x68, 0xd4, 0x80, 0xad, 0x2e, 0x33, 0x2c, 0xcb, 0x21, 0x48, 0xf, 0x82, 0x67, 0x98, 0xae, 0xc8, 0x4d, 0x90, 0x82, 0xb9, 0x28, 0xd4, 0x55, 0xea, 0x30, 0x41, 0x11, 0x42, 0x49, 0x36, 0xf5, 0x55, 0xb2, 0x92, 0x48, 0x47, 0xec, 0xc7, 0x25, 0xa, 0x93, 0xba, 0xf4, 0x3c, 0xe1, 0x56, 0x9b, 0x7f, 0x8a, 0x27, 0xdb, 0x45, 0x4c, 0x9e, 0xfc, 0xbd, 0x49, 0x63, 0x97, 0xaf, 0xe, 0x58, 0x9f, 0xc2, 0x7d, 0x26, 0xaa, 0x80, 0xcd, 0x80, 0xc0, 0x8b, 0x8c, 0x9d, 0xeb, 0x2e, 0xda, 0x8a, 0x79, 0x81, 0xe8, 0xf8, 0xd5, 0x37, 0x3a, 0xf4, 0x39, 0x67, 0xad, 0xdd, 0xd1, 0x7a, 0x71, 0xa9, 0xb4, 0xd3, 0xbd, 0xa4, 0x75, 0xd3, 0x94, 0x97, 0x6c, 0x3f, 0xba, 0x98, 0x42, 0x73, 0x7f};
+const unsigned char JH512_H0[128] = {0x6f, 0xd1, 0x4b, 0x96, 0x3e, 0x0, 0xaa, 0x17, 0x63, 0x6a, 0x2e, 0x5, 0x7a, 0x15, 0xd5, 0x43, 0x8a, 0x22, 0x5e, 0x8d, 0xc, 0x97, 0xef, 0xb, 0xe9, 0x34, 0x12, 0x59, 0xf2, 0xb3, 0xc3, 0x61, 0x89, 0x1d, 0xa0, 0xc1, 0x53, 0x6f, 0x80, 0x1e, 0x2a, 0xa9, 0x5, 0x6b, 0xea, 0x2b, 0x6d, 0x80, 0x58, 0x8e, 0xcc, 0xdb, 0x20, 0x75, 0xba, 0xa6, 0xa9, 0xf, 0x3a, 0x76, 0xba, 0xf8, 0x3b, 0xf7, 0x1, 0x69, 0xe6, 0x5, 0x41, 0xe3, 0x4a, 0x69, 0x46, 0xb5, 0x8a, 0x8e, 0x2e, 0x6f, 0xe6, 0x5a, 0x10, 0x47, 0xa7, 0xd0, 0xc1, 0x84, 0x3c, 0x24, 0x3b, 0x6e, 0x71, 0xb1, 0x2d, 0x5a, 0xc1, 0x99, 0xcf, 0x57, 0xf6, 0xec, 0x9d, 0xb1, 0xf8, 0x56, 0xa7, 0x6, 0x88, 0x7c, 0x57, 0x16, 0xb1, 0x56, 0xe3, 0xc2, 0xfc, 0xdf, 0xe6, 0x85, 0x17, 0xfb, 0x54, 0x5a, 0x46, 0x78, 0xcc, 0x8c, 0xdd, 0x4b};
 
 /*42 round constants, each round constant is 32-byte (256-bit)*/
-const unsigned char E8_bitslice_roundconstant[42][32]={
-{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40},
-{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31},
-{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc},
-{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3},
-{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23},
-{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97},
-{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14},
-{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4},
-{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36},
-{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f},
-{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b},
-{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62},
-{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5},
-{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f},
-{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a},
-{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf},
-{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0},
-{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a},
-{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6},
-{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67},
-{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18},
-{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e},
-{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1},
-{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83},
-{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef},
-{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65},
-{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c},
-{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71},
-{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0},
-{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f},
-{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad},
-{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6},
-{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63},
-{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f},
-{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a},
-{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5},
-{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48},
-{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e},
-{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7},
-{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde},
-{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a},
-{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}};
-
-
-static void E8(hashState *state);  /*The bijective function E8, in bitslice form*/
-static void F8(hashState *state);  /*The compression function F8 */
+const unsigned char E8_bitslice_roundconstant[42][32] = {
+	{0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40},
+	{0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31},
+	{0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc},
+	{0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3},
+	{0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23},
+	{0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97},
+	{0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14},
+	{0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4},
+	{0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36},
+	{0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f},
+	{0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b},
+	{0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62},
+	{0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5},
+	{0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f},
+	{0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a},
+	{0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf},
+	{0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0},
+	{0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a},
+	{0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6},
+	{0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67},
+	{0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18},
+	{0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e},
+	{0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1},
+	{0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83},
+	{0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef},
+	{0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65},
+	{0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c},
+	{0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71},
+	{0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0},
+	{0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f},
+	{0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad},
+	{0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6},
+	{0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63},
+	{0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f},
+	{0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a},
+	{0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5},
+	{0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48},
+	{0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e},
+	{0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7},
+	{0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde},
+	{0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a},
+	{0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}};
+
+static void E8(hashState* state); /*The bijective function E8, in bitslice form*/
+static void F8(hashState* state); /*The compression function F8 */
 
 /*The API functions*/
-static HashReturn Init(hashState *state, int hashbitlen);
-static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);
-static HashReturn Final(hashState *state, BitSequence *hashval);
-HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval);
+static HashReturn Init(hashState* state, int hashbitlen);
+static HashReturn Update(hashState* state, const BitSequence* data, DataLength databitlen);
+static HashReturn Final(hashState* state, BitSequence* hashval);
+HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval);
 
 /*swapping bit 2i with bit 2i+1 of 64-bit x*/
-#define SWAP1(x)   (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1));
+#define SWAP1(x) (x) = ((((x)&0x5555555555555555ULL) << 1) | (((x)&0xaaaaaaaaaaaaaaaaULL) >> 1));
 /*swapping bits 4i||4i+1 with bits 4i+2||4i+3 of 64-bit x*/
-#define SWAP2(x)   (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2));
+#define SWAP2(x) (x) = ((((x)&0x3333333333333333ULL) << 2) | (((x)&0xccccccccccccccccULL) >> 2));
 /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of 64-bit x*/
-#define SWAP4(x)   (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4));
+#define SWAP4(x) (x) = ((((x)&0x0f0f0f0f0f0f0f0fULL) << 4) | (((x)&0xf0f0f0f0f0f0f0f0ULL) >> 4));
 /*swapping bits 16i||16i+1||......||16i+7  with bits 16i+8||16i+9||......||16i+15 of 64-bit x*/
-#define SWAP8(x)   (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8));
+#define SWAP8(x) (x) = ((((x)&0x00ff00ff00ff00ffULL) << 8) | (((x)&0xff00ff00ff00ff00ULL) >> 8));
 /*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 64-bit x*/
-#define SWAP16(x)  (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16));
+#define SWAP16(x) (x) = ((((x)&0x0000ffff0000ffffULL) << 16) | (((x)&0xffff0000ffff0000ULL) >> 16));
 /*swapping bits 64i||64i+1||......||64i+31 with bits 64i+32||64i+33||......||64i+63 of 64-bit x*/
-#define SWAP32(x)  (x) = (((x) << 32) | ((x) >> 32));
+#define SWAP32(x) (x) = (((x) << 32) | ((x) >> 32));
 
 /*The MDS transform*/
-#define L(m0,m1,m2,m3,m4,m5,m6,m7) \
-	  (m4) ^= (m1);                \
-	  (m5) ^= (m2);                \
-	  (m6) ^= (m0) ^ (m3);         \
-	  (m7) ^= (m0);                \
-	  (m0) ^= (m5);                \
-	  (m1) ^= (m6);                \
-	  (m2) ^= (m4) ^ (m7);         \
-	  (m3) ^= (m4);
+#define L(m0, m1, m2, m3, m4, m5, m6, m7) \
+	(m4) ^= (m1);                         \
+	(m5) ^= (m2);                         \
+	(m6) ^= (m0) ^ (m3);                  \
+	(m7) ^= (m0);                         \
+	(m0) ^= (m5);                         \
+	(m1) ^= (m6);                         \
+	(m2) ^= (m4) ^ (m7);                  \
+	(m3) ^= (m4);
 
 /*Two Sboxes are computed in parallel, each Sbox implements S0 and S1, selected by a constant bit*/
 /*The reason to compute two Sboxes in parallel is to try to fully utilize the parallel processing power*/
-#define SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1)   \
-	  m3  = ~(m3);                  \
-	  m7  = ~(m7);                  \
-	  m0 ^= ((~(m2)) & (cc0));      \
-	  m4 ^= ((~(m6)) & (cc1));      \
-	  temp0 = (cc0) ^ ((m0) & (m1));\
-	  temp1 = (cc1) ^ ((m4) & (m5));\
-	  m0 ^= ((m2) & (m3));          \
-	  m4 ^= ((m6) & (m7));          \
-	  m3 ^= ((~(m1)) & (m2));       \
-	  m7 ^= ((~(m5)) & (m6));       \
-	  m1 ^= ((m0) & (m2));          \
-	  m5 ^= ((m4) & (m6));          \
-	  m2 ^= ((m0) & (~(m3)));       \
-	  m6 ^= ((m4) & (~(m7)));       \
-	  m0 ^= ((m1) | (m3));          \
-	  m4 ^= ((m5) | (m7));          \
-	  m3 ^= ((m1) & (m2));          \
-	  m7 ^= ((m5) & (m6));          \
-	  m1 ^= (temp0 & (m0));         \
-	  m5 ^= (temp1 & (m4));         \
-	  m2 ^= temp0;                  \
-	  m6 ^= temp1;
+#define SS(m0, m1, m2, m3, m4, m5, m6, m7, cc0, cc1) \
+	m3 = ~(m3);                                      \
+	m7 = ~(m7);                                      \
+	m0 ^= ((~(m2)) & (cc0));                         \
+	m4 ^= ((~(m6)) & (cc1));                         \
+	temp0 = (cc0) ^ ((m0) & (m1));                   \
+	temp1 = (cc1) ^ ((m4) & (m5));                   \
+	m0 ^= ((m2) & (m3));                             \
+	m4 ^= ((m6) & (m7));                             \
+	m3 ^= ((~(m1)) & (m2));                          \
+	m7 ^= ((~(m5)) & (m6));                          \
+	m1 ^= ((m0) & (m2));                             \
+	m5 ^= ((m4) & (m6));                             \
+	m2 ^= ((m0) & (~(m3)));                          \
+	m6 ^= ((m4) & (~(m7)));                          \
+	m0 ^= ((m1) | (m3));                             \
+	m4 ^= ((m5) | (m7));                             \
+	m3 ^= ((m1) & (m2));                             \
+	m7 ^= ((m5) & (m6));                             \
+	m1 ^= (temp0 & (m0));                            \
+	m5 ^= (temp1 & (m4));                            \
+	m2 ^= temp0;                                     \
+	m6 ^= temp1;
 
 /*The bijective function E8, in bitslice form*/
-static void E8(hashState *state)
+static void E8(hashState* state)
 {
-	  uint64 i,roundnumber,temp0,temp1;
-
-	  for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) {
-			/*round 7*roundnumber+0: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP1(state->x[1][i]); SWAP1(state->x[3][i]); SWAP1(state->x[5][i]); SWAP1(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+1: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP2(state->x[1][i]); SWAP2(state->x[3][i]); SWAP2(state->x[5][i]); SWAP2(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+2: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP4(state->x[1][i]); SWAP4(state->x[3][i]); SWAP4(state->x[5][i]); SWAP4(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+3: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP8(state->x[1][i]); SWAP8(state->x[3][i]); SWAP8(state->x[5][i]); SWAP8(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+4: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP16(state->x[1][i]); SWAP16(state->x[3][i]); SWAP16(state->x[5][i]); SWAP16(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+5: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP32(state->x[1][i]); SWAP32(state->x[3][i]); SWAP32(state->x[5][i]); SWAP32(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+6: Sbox and MDS layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			}
-			/*round 7*roundnumber+6: swapping layer*/
-			for (i = 1; i < 8; i = i+2) {
-				  temp0 = state->x[i][0]; state->x[i][0] = state->x[i][1]; state->x[i][1] = temp0;
-			}
-	  }
-
+	uint64 i, roundnumber, temp0, temp1;
+
+	for(roundnumber = 0; roundnumber < 42; roundnumber = roundnumber + 7)
+	{
+		/*round 7*roundnumber+0: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 0])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 0])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP1(state->x[1][i]);
+			SWAP1(state->x[3][i]);
+			SWAP1(state->x[5][i]);
+			SWAP1(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+1: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 1])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 1])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP2(state->x[1][i]);
+			SWAP2(state->x[3][i]);
+			SWAP2(state->x[5][i]);
+			SWAP2(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+2: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 2])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 2])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP4(state->x[1][i]);
+			SWAP4(state->x[3][i]);
+			SWAP4(state->x[5][i]);
+			SWAP4(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+3: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 3])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 3])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP8(state->x[1][i]);
+			SWAP8(state->x[3][i]);
+			SWAP8(state->x[5][i]);
+			SWAP8(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+4: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 4])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 4])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP16(state->x[1][i]);
+			SWAP16(state->x[3][i]);
+			SWAP16(state->x[5][i]);
+			SWAP16(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+5: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 5])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 5])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP32(state->x[1][i]);
+			SWAP32(state->x[3][i]);
+			SWAP32(state->x[5][i]);
+			SWAP32(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+6: Sbox and MDS layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 6])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 6])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+		}
+		/*round 7*roundnumber+6: swapping layer*/
+		for(i = 1; i < 8; i = i + 2)
+		{
+			temp0 = state->x[i][0];
+			state->x[i][0] = state->x[i][1];
+			state->x[i][1] = temp0;
+		}
+	}
 }
 
 /*The compression function F8 */
-static void F8(hashState *state)
+static void F8(hashState* state)
 {
-	  uint64  i;
+	uint64 i;
 
-	  /*xor the 512-bit message with the fist half of the 1024-bit hash state*/
-	  for (i = 0; i < 8; i++)  state->x[i >> 1][i & 1] ^= ((uint64*)state->buffer)[i];
+	/*xor the 512-bit message block in state->buffer (read as eight uint64 words) into the first half of the 1024-bit hash state, x[0..3]*/
+	for(i = 0; i < 8; i++)
+		state->x[i >> 1][i & 1] ^= ((uint64*)state->buffer)[i];
 
-	  /*the bijective function E8 */
-	  E8(state);
+	/*run the bijective function E8 over the state, in place*/
+	E8(state);
 
-	  /*xor the 512-bit message with the second half of the 1024-bit hash state*/
-	  for (i = 0; i < 8; i++)  state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64*)state->buffer)[i];
+	/*xor the same 512-bit message block into the second half of the 1024-bit hash state, x[4..7]*/
+	for(i = 0; i < 8; i++)
+		state->x[(8 + i) >> 1][(8 + i) & 1] ^= ((uint64*)state->buffer)[i];
 }
 
 /*before hashing a message, initialize the hash state as H0 */
-static HashReturn Init(hashState *state, int hashbitlen)
+static HashReturn Init(hashState* state, int hashbitlen)
 {
-	  state->databitlen = 0;
-	  state->datasize_in_buffer = 0;
-
-	  /*initialize the initial hash value of JH*/
-	  state->hashbitlen = hashbitlen;
-
-	  /*load the initial hash value into state*/
-	  switch (hashbitlen)
-	  {
-			case 224: memcpy(state->x,JH224_H0,128); break;
-			case 256: memcpy(state->x,JH256_H0,128); break;
-			case 384: memcpy(state->x,JH384_H0,128); break;
-			case 512: memcpy(state->x,JH512_H0,128); break;
-	  }
-
-	  return(SUCCESS);
+	state->databitlen = 0; /*total message length absorbed so far, in bits*/
+	state->datasize_in_buffer = 0; /*bits currently held in the partial-block buffer*/
+
+	/*remember the requested digest size in bits; Final() switches on it to pick the truncation*/
+	state->hashbitlen = hashbitlen;
+
+	/*load the 128-byte initial hash value H0 for the requested digest size; any other hashbitlen leaves state->x unset (no default case)*/
+	switch(hashbitlen)
+	{
+	case 224:
+		memcpy(state->x, JH224_H0, 128);
+		break;
+	case 256:
+		memcpy(state->x, JH256_H0, 128);
+		break;
+	case 384:
+		memcpy(state->x, JH384_H0, 128);
+		break;
+	case 512:
+		memcpy(state->x, JH512_H0, 128);
+		break;
+	}
+
+	return (SUCCESS);
 }
 
-
 /*hash each 512-bit message block, except the last partial block*/
-static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
+static HashReturn Update(hashState* state, const BitSequence* data, DataLength databitlen)
 {
-	  DataLength index; /*the starting address of the data to be compressed*/
-
-	  state->databitlen += databitlen;
-	  index = 0;
-
-	  /*if there is remaining data in the buffer, fill it to a full message block first*/
-	  /*we assume that the size of the data in the buffer is the multiple of 8 bits if it is not at the end of a message*/
-
-	  /*There is data in the buffer, but the incoming data is insufficient for a full block*/
-	  if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512)  ) {
-			if ( (databitlen & 7) == 0 ) {
-				 memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)) ;
-		    }
-			else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1) ;
-			state->datasize_in_buffer += databitlen;
-			databitlen = 0;
-	  }
-
-	  /*There is data in the buffer, and the incoming data is sufficient for a full block*/
-	  if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512)  ) {
-	        memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
-	        index = 64-(state->datasize_in_buffer >> 3);
-	        databitlen = databitlen - (512 - state->datasize_in_buffer);
-	        F8(state);
-	        state->datasize_in_buffer = 0;
-	  }
-
-	  /*hash the remaining full message blocks*/
-	  for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) {
-			memcpy(state->buffer, data+index, 64);
-			F8(state);
-	  }
-
-	  /*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/
-	  if ( databitlen > 0) {
-			if ((databitlen & 7) == 0)
-				  memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3);
-			else
-				  memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1);
-			state->datasize_in_buffer = databitlen;
-	  }
-
-	  return(SUCCESS);
+	DataLength index; /*byte offset into data of the next input not yet consumed*/
+
+	state->databitlen += databitlen;
+	index = 0;
+
+	/*if there is remaining data in the buffer, fill it to a full message block first*/
+	/*we assume that the size of the data in the buffer is a multiple of 8 bits if it is not at the end of a message*/
+
+	/*There is data in the buffer, but the incoming data is insufficient for a full block: append only the bytes actually supplied (a trailing partial byte counts as one byte), never more than data holds*/
+	if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) < 512))
+	{
+		if((databitlen & 7) == 0)
+		{
+			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, databitlen >> 3);
+		}
+		else
+			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, (databitlen >> 3) + 1);
+		state->datasize_in_buffer += databitlen;
+		databitlen = 0;
+	}
+
+	/*There is data in the buffer, and the incoming data is sufficient for a full block: top the buffer up to 64 bytes and compress it*/
+	if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) >= 512))
+	{
+		memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3));
+		index = 64 - (state->datasize_in_buffer >> 3);
+		databitlen = databitlen - (512 - state->datasize_in_buffer);
+		F8(state);
+		state->datasize_in_buffer = 0;
+	}
+
+	/*hash the remaining full message blocks, 64 bytes at a time*/
+	for(; databitlen >= 512; index = index + 64, databitlen = databitlen - 512)
+	{
+		memcpy(state->buffer, data + index, 64);
+		F8(state);
+	}
+
+	/*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/
+	if(databitlen > 0)
+	{
+		if((databitlen & 7) == 0)
+			memcpy(state->buffer, data + index, (databitlen & 0x1ff) >> 3);
+		else
+			memcpy(state->buffer, data + index, ((databitlen & 0x1ff) >> 3) + 1);
+		state->datasize_in_buffer = databitlen;
+	}
+
+	return (SUCCESS);
 }
 
 /*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/
-static HashReturn Final(hashState *state, BitSequence *hashval)
+static HashReturn Final(hashState* state, BitSequence* hashval)
 {
-	  unsigned int i;
-
-	  if ( (state->databitlen & 0x1ff) == 0 ) {
-			/*pad the message when databitlen is multiple of 512 bits, then process the padded block*/
-			memset(state->buffer, 0, 64);
-			state->buffer[0]  = 0x80;
-			state->buffer[63] = state->databitlen & 0xff;
-			state->buffer[62] = (state->databitlen >> 8)  & 0xff;
-			state->buffer[61] = (state->databitlen >> 16) & 0xff;
-			state->buffer[60] = (state->databitlen >> 24) & 0xff;
-			state->buffer[59] = (state->databitlen >> 32) & 0xff;
-			state->buffer[58] = (state->databitlen >> 40) & 0xff;
-			state->buffer[57] = (state->databitlen >> 48) & 0xff;
-			state->buffer[56] = (state->databitlen >> 56) & 0xff;
-			F8(state);
-	  }
-	  else {
-		    /*set the rest of the bytes in the buffer to 0*/
-			if ( (state->datasize_in_buffer & 7) == 0)
-				  for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++)  state->buffer[i] = 0;
-			else
-				  for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++)  state->buffer[i] = 0;
-
-			/*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/
-			state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7));
-
-			F8(state);
-			memset(state->buffer, 0, 64);
-			state->buffer[63] = state->databitlen & 0xff;
-			state->buffer[62] = (state->databitlen >> 8) & 0xff;
-			state->buffer[61] = (state->databitlen >> 16) & 0xff;
-			state->buffer[60] = (state->databitlen >> 24) & 0xff;
-			state->buffer[59] = (state->databitlen >> 32) & 0xff;
-			state->buffer[58] = (state->databitlen >> 40) & 0xff;
-			state->buffer[57] = (state->databitlen >> 48) & 0xff;
-			state->buffer[56] = (state->databitlen >> 56) & 0xff;
-			F8(state);
-	  }
-
-	  /*truncating the final hash value to generate the message digest*/
-	  switch(state->hashbitlen) {
-			case 224: memcpy(hashval,(unsigned char*)state->x+64+36,28);  break;
-			case 256: memcpy(hashval,(unsigned char*)state->x+64+32,32);  break;
-			case 384: memcpy(hashval,(unsigned char*)state->x+64+16,48);  break;
-			case 512: memcpy(hashval,(unsigned char*)state->x+64,64);     break;
-	  }
-
-	  return(SUCCESS);
+	unsigned int i;
+
+	if((state->databitlen & 0x1ff) == 0)
+	{
+		/*databitlen is a multiple of 512 bits: a single padding block holds the 0x80 marker in byte 0 and the 64-bit big-endian bit length in bytes 56..63*/
+		memset(state->buffer, 0, 64);
+		state->buffer[0] = 0x80;
+		state->buffer[63] = state->databitlen & 0xff;
+		state->buffer[62] = (state->databitlen >> 8) & 0xff;
+		state->buffer[61] = (state->databitlen >> 16) & 0xff;
+		state->buffer[60] = (state->databitlen >> 24) & 0xff;
+		state->buffer[59] = (state->databitlen >> 32) & 0xff;
+		state->buffer[58] = (state->databitlen >> 40) & 0xff;
+		state->buffer[57] = (state->databitlen >> 48) & 0xff;
+		state->buffer[56] = (state->databitlen >> 56) & 0xff;
+		F8(state);
+	}
+	else
+	{
+		/*zero the buffer bytes after the last message byte, keeping a trailing partial byte; NOTE(review): alignment is tested on datasize_in_buffer but the index comes from databitlen -- presumably equal modulo 8, confirm*/
+		if((state->datasize_in_buffer & 7) == 0)
+			for(i = (state->databitlen & 0x1ff) >> 3; i < 64; i++)
+				state->buffer[i] = 0;
+		else
+			for(i = ((state->databitlen & 0x1ff) >> 3) + 1; i < 64; i++)
+				state->buffer[i] = 0;
+
+		/*databitlen is not a multiple of 512 bits: set the padding 1-bit right after the last message bit and hash that block, then hash a second block carrying only the big-endian bit length in bytes 56..63*/
+		state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7 - (state->databitlen & 7));
+
+		F8(state);
+		memset(state->buffer, 0, 64);
+		state->buffer[63] = state->databitlen & 0xff;
+		state->buffer[62] = (state->databitlen >> 8) & 0xff;
+		state->buffer[61] = (state->databitlen >> 16) & 0xff;
+		state->buffer[60] = (state->databitlen >> 24) & 0xff;
+		state->buffer[59] = (state->databitlen >> 32) & 0xff;
+		state->buffer[58] = (state->databitlen >> 40) & 0xff;
+		state->buffer[57] = (state->databitlen >> 48) & 0xff;
+		state->buffer[56] = (state->databitlen >> 56) & 0xff;
+		F8(state);
+	}
+
+	/*truncate: the digest is the last hashbitlen/8 bytes of the 128-byte state; nothing is written to hashval for any other hashbitlen*/
+	switch(state->hashbitlen)
+	{
+	case 224:
+		memcpy(hashval, (unsigned char*)state->x + 64 + 36, 28);
+		break;
+	case 256:
+		memcpy(hashval, (unsigned char*)state->x + 64 + 32, 32);
+		break;
+	case 384:
+		memcpy(hashval, (unsigned char*)state->x + 64 + 16, 48);
+		break;
+	case 512:
+		memcpy(hashval, (unsigned char*)state->x + 64, 64);
+		break;
+	}
+
+	return (SUCCESS);
 }
 
 /* hash a message,
    three inputs: message digest size in bits (hashbitlen); message (data); message length in bits (databitlen)
    one output:   message digest (hashval)
 */
-HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval)
+HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval)
 {
-	  hashState state;
-
-	  if ( hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512 ) {
-			Init(&state, hashbitlen);
-			Update(&state, data, databitlen);
-			Final(&state, hashval);
-			return SUCCESS;
-	  }
-	  else
-			return(BAD_HASHLEN);
+	hashState state; /*working state, local to this call*/
+
+	if(hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512)
+	{
+		Init(&state, hashbitlen); /*load H0 for the requested digest size*/
+		Update(&state, data, databitlen); /*absorb the whole message in one call*/
+		Final(&state, hashval); /*pad, then write hashbitlen/8 digest bytes to hashval*/
+		return SUCCESS;
+	}
+	else
+		return (BAD_HASHLEN); /*unsupported digest size; hashval is not written*/
 }
diff --git a/xmrstak/backend/cpu/crypto/c_jh.h b/xmrstak/backend/cpu/crypto/c_jh.h
index d10d40fe5..34d30e6b4 100644
--- a/xmrstak/backend/cpu/crypto/c_jh.h
+++ b/xmrstak/backend/cpu/crypto/c_jh.h
@@ -16,4 +16,4 @@
 
 #include "hash.h"
 
-HashReturn jh_hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
+HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval);
diff --git a/xmrstak/backend/cpu/crypto/c_keccak.c b/xmrstak/backend/cpu/crypto/c_keccak.c
index 63c16147d..0af6b02ef 100644
--- a/xmrstak/backend/cpu/crypto/c_keccak.c
+++ b/xmrstak/backend/cpu/crypto/c_keccak.c
@@ -2,8 +2,8 @@
 // 19-Nov-11  Markku-Juhani O. Saarinen <mjos@iki.fi>
 // A baseline Keccak (3rd round) implementation.
 
-#include <stdint.h>
 #include <memory.h>
+#include <stdint.h>
 
 #define HASH_DATA_AREA 136
 #define KECCAK_ROUNDS 24
@@ -13,16 +13,15 @@
 #endif
 
 const uint64_t keccakf_rndc[24] =
-{
-	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
-	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
-	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
-	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
-	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
-	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
-	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
-	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
-};
+	{
+		0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+		0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+		0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+		0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+		0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+		0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+		0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+		0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
 
 // update the state with given number of rounds
 
@@ -31,7 +30,8 @@ void keccakf(uint64_t st[25], int rounds)
 	int i, j, round;
 	uint64_t t, bc[5];
 
-	for (round = 0; round < rounds; ++round) {
+	for(round = 0; round < rounds; ++round)
+	{
 
 		// Theta
 		bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
@@ -40,10 +40,11 @@ void keccakf(uint64_t st[25], int rounds)
 		bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
 		bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
 
-		for (i = 0; i < 5; ++i) {
+		for(i = 0; i < 5; ++i)
+		{
 			t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
-			st[i     ] ^= t;
-			st[i +  5] ^= t;
+			st[i] ^= t;
+			st[i + 5] ^= t;
 			st[i + 10] ^= t;
 			st[i + 15] ^= t;
 			st[i + 20] ^= t;
@@ -51,81 +52,81 @@ void keccakf(uint64_t st[25], int rounds)
 
 		// Rho Pi
 		t = st[1];
-		st[ 1] = ROTL64(st[ 6], 44);
-		st[ 6] = ROTL64(st[ 9], 20);
-		st[ 9] = ROTL64(st[22], 61);
+		st[1] = ROTL64(st[6], 44);
+		st[6] = ROTL64(st[9], 20);
+		st[9] = ROTL64(st[22], 61);
 		st[22] = ROTL64(st[14], 39);
 		st[14] = ROTL64(st[20], 18);
-		st[20] = ROTL64(st[ 2], 62);
-		st[ 2] = ROTL64(st[12], 43);
+		st[20] = ROTL64(st[2], 62);
+		st[2] = ROTL64(st[12], 43);
 		st[12] = ROTL64(st[13], 25);
-		st[13] = ROTL64(st[19],  8);
+		st[13] = ROTL64(st[19], 8);
 		st[19] = ROTL64(st[23], 56);
 		st[23] = ROTL64(st[15], 41);
-		st[15] = ROTL64(st[ 4], 27);
-		st[ 4] = ROTL64(st[24], 14);
-		st[24] = ROTL64(st[21],  2);
-		st[21] = ROTL64(st[ 8], 55);
-		st[ 8] = ROTL64(st[16], 45);
-		st[16] = ROTL64(st[ 5], 36);
-		st[ 5] = ROTL64(st[ 3], 28);
-		st[ 3] = ROTL64(st[18], 21);
+		st[15] = ROTL64(st[4], 27);
+		st[4] = ROTL64(st[24], 14);
+		st[24] = ROTL64(st[21], 2);
+		st[21] = ROTL64(st[8], 55);
+		st[8] = ROTL64(st[16], 45);
+		st[16] = ROTL64(st[5], 36);
+		st[5] = ROTL64(st[3], 28);
+		st[3] = ROTL64(st[18], 21);
 		st[18] = ROTL64(st[17], 15);
 		st[17] = ROTL64(st[11], 10);
-		st[11] = ROTL64(st[ 7],  6);
-		st[ 7] = ROTL64(st[10],  3);
+		st[11] = ROTL64(st[7], 6);
+		st[7] = ROTL64(st[10], 3);
 		st[10] = ROTL64(t, 1);
 
 		//  Chi
 		// unrolled loop, where only last iteration is different
 		j = 0;
-		bc[0] = st[j    ];
+		bc[0] = st[j];
 		bc[1] = st[j + 1];
 
-		st[j    ] ^= (~st[j + 1]) & st[j + 2];
+		st[j] ^= (~st[j + 1]) & st[j + 2];
 		st[j + 1] ^= (~st[j + 2]) & st[j + 3];
 		st[j + 2] ^= (~st[j + 3]) & st[j + 4];
 		st[j + 3] ^= (~st[j + 4]) & bc[0];
 		st[j + 4] ^= (~bc[0]) & bc[1];
 
 		j = 5;
-		bc[0] = st[j    ];
+		bc[0] = st[j];
 		bc[1] = st[j + 1];
 
-		st[j    ] ^= (~st[j + 1]) & st[j + 2];
+		st[j] ^= (~st[j + 1]) & st[j + 2];
 		st[j + 1] ^= (~st[j + 2]) & st[j + 3];
 		st[j + 2] ^= (~st[j + 3]) & st[j + 4];
 		st[j + 3] ^= (~st[j + 4]) & bc[0];
 		st[j + 4] ^= (~bc[0]) & bc[1];
 
 		j = 10;
-		bc[0] = st[j    ];
+		bc[0] = st[j];
 		bc[1] = st[j + 1];
 
-		st[j    ] ^= (~st[j + 1]) & st[j + 2];
+		st[j] ^= (~st[j + 1]) & st[j + 2];
 		st[j + 1] ^= (~st[j + 2]) & st[j + 3];
 		st[j + 2] ^= (~st[j + 3]) & st[j + 4];
 		st[j + 3] ^= (~st[j + 4]) & bc[0];
 		st[j + 4] ^= (~bc[0]) & bc[1];
 
 		j = 15;
-		bc[0] = st[j    ];
+		bc[0] = st[j];
 		bc[1] = st[j + 1];
 
-		st[j    ] ^= (~st[j + 1]) & st[j + 2];
+		st[j] ^= (~st[j + 1]) & st[j + 2];
 		st[j + 1] ^= (~st[j + 2]) & st[j + 3];
 		st[j + 2] ^= (~st[j + 3]) & st[j + 4];
 		st[j + 3] ^= (~st[j + 4]) & bc[0];
 		st[j + 4] ^= (~bc[0]) & bc[1];
 
 		j = 20;
-		bc[0] = st[j    ];
+		bc[0] = st[j];
 		bc[1] = st[j + 1];
 		bc[2] = st[j + 2];
 		bc[3] = st[j + 3];
 		bc[4] = st[j + 4];
 
-		st[j    ] ^= (~bc[1]) & bc[2];
+		st[j] ^= (~bc[1]) & bc[2];
 		st[j + 1] ^= (~bc[2]) & bc[3];
 		st[j + 2] ^= (~bc[3]) & bc[4];
 		st[j + 3] ^= (~bc[4]) & bc[0];
@@ -139,7 +140,7 @@ void keccakf(uint64_t st[25], int rounds)
 // compute a keccak hash (md) of given byte length from "in"
 typedef uint64_t state_t[25];
 
-void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen)
+void keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen)
 {
 	state_t st;
 	uint8_t temp[144];
@@ -150,9 +151,10 @@ void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen)
 
 	memset(st, 0, sizeof(st));
 
-	for ( ; inlen >= rsiz; inlen -= rsiz, in += rsiz) {
-		for (i = 0; i < rsizw; i++)
-			st[i] ^= ((uint64_t *) in)[i];
+	for(; inlen >= rsiz; inlen -= rsiz, in += rsiz)
+	{
+		for(i = 0; i < rsizw; i++)
+			st[i] ^= ((uint64_t*)in)[i];
 		keccakf(st, KECCAK_ROUNDS);
 	}
 
@@ -162,15 +164,15 @@ void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen)
 	memset(temp + inlen, 0, rsiz - inlen);
 	temp[rsiz - 1] |= 0x80;
 
-	for (i = 0; i < rsizw; i++)
-		st[i] ^= ((uint64_t *) temp)[i];
+	for(i = 0; i < rsizw; i++)
+		st[i] ^= ((uint64_t*)temp)[i];
 
 	keccakf(st, KECCAK_ROUNDS);
 
 	memcpy(md, st, mdlen);
 }
 
-void keccak1600(const uint8_t *in, int inlen, uint8_t *md)
+void keccak1600(const uint8_t* in, int inlen, uint8_t* md)
 {
 	keccak(in, inlen, md, sizeof(state_t));
 }
diff --git a/xmrstak/backend/cpu/crypto/c_keccak.h b/xmrstak/backend/cpu/crypto/c_keccak.h
index 4f7f85729..b7a26065e 100644
--- a/xmrstak/backend/cpu/crypto/c_keccak.h
+++ b/xmrstak/backend/cpu/crypto/c_keccak.h
@@ -16,11 +16,11 @@
 #endif
 
 // compute a keccak hash (md) of given byte length from "in"
-int keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen);
+int keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen);
 
 // update the state
 void keccakf(uint64_t st[25], int norounds);
 
-void keccak1600(const uint8_t *in, int inlen, uint8_t *md);
+void keccak1600(const uint8_t* in, int inlen, uint8_t* md);
 
 #endif
diff --git a/xmrstak/backend/cpu/crypto/c_skein.c b/xmrstak/backend/cpu/crypto/c_skein.c
index e2d54425f..4b8cbb388 100644
--- a/xmrstak/backend/cpu/crypto/c_skein.c
+++ b/xmrstak/backend/cpu/crypto/c_skein.c
@@ -8,11 +8,11 @@
 **
 ************************************************************************/
 
-#define  SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
+#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
 
-#include <stddef.h>                          /* get size_t definition */
-#include <string.h>      /* get the memcpy/memset functions */
-#include "c_skein.h"       /* get the Skein API definitions   */
+#include "c_skein.h" /* get the Skein API definitions   */
+#include <stddef.h>  /* get size_t definition */
+#include <string.h>  /* get the memcpy/memset functions */
 
 #define DISABLE_UNUSED 0
 
@@ -24,72 +24,72 @@
 #define SKEIN_512_NIST_MAX_HASHBITS (512)
 #endif
 
-#define  SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */
+#define SKEIN_MODIFIER_WORDS (2) /* number of modifier (tweak) words */
 
-#define  SKEIN_256_STATE_WORDS ( 4)
-#define  SKEIN_512_STATE_WORDS ( 8)
-#define  SKEIN1024_STATE_WORDS (16)
-#define  SKEIN_MAX_STATE_WORDS (16)
+#define SKEIN_256_STATE_WORDS (4)
+#define SKEIN_512_STATE_WORDS (8)
+#define SKEIN1024_STATE_WORDS (16)
+#define SKEIN_MAX_STATE_WORDS (16)
 
-#define  SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
-#define  SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define  SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_STATE_BYTES (8 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BYTES (8 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BYTES (8 * SKEIN1024_STATE_WORDS)
 
-#define  SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)
-#define  SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
-#define  SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_STATE_BITS (64 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BITS (64 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BITS (64 * SKEIN1024_STATE_WORDS)
 
-#define  SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
-#define  SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define  SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_BLOCK_BYTES (8 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_BLOCK_BYTES (8 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_BLOCK_BYTES (8 * SKEIN1024_STATE_WORDS)
 
-#define SKEIN_RND_SPECIAL       (1000u)
-#define SKEIN_RND_KEY_INITIAL   (SKEIN_RND_SPECIAL+0u)
-#define SKEIN_RND_KEY_INJECT    (SKEIN_RND_SPECIAL+1u)
-#define SKEIN_RND_FEED_FWD      (SKEIN_RND_SPECIAL+2u)
+#define SKEIN_RND_SPECIAL (1000u)
+#define SKEIN_RND_KEY_INITIAL (SKEIN_RND_SPECIAL + 0u)
+#define SKEIN_RND_KEY_INJECT (SKEIN_RND_SPECIAL + 1u)
+#define SKEIN_RND_FEED_FWD (SKEIN_RND_SPECIAL + 2u)
 
 typedef struct
 {
-  size_t  hashBitLen;                      /* size of hash result, in bits */
-  size_t  bCnt;                            /* current byte count in buffer b[] */
-  u64b_t  T[SKEIN_MODIFIER_WORDS];         /* tweak words: T[0]=byte cnt, T[1]=flags */
+	size_t hashBitLen;				/* size of hash result, in bits */
+	size_t bCnt;					/* current byte count in buffer b[] */
+	u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */
 } Skein_Ctxt_Hdr_t;
 
-typedef struct                               /*  256-bit Skein hash context structure */
+typedef struct /*  256-bit Skein hash context structure */
 {
-  Skein_Ctxt_Hdr_t h;                      /* common header context variables */
-  u64b_t  X[SKEIN_256_STATE_WORDS];        /* chaining variables */
-  u08b_t  b[SKEIN_256_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
+	Skein_Ctxt_Hdr_t h;				 /* common header context variables */
+	u64b_t X[SKEIN_256_STATE_WORDS]; /* chaining variables */
+	u08b_t b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
 } Skein_256_Ctxt_t;
 
-typedef struct                               /*  512-bit Skein hash context structure */
+typedef struct /*  512-bit Skein hash context structure */
 {
-  Skein_Ctxt_Hdr_t h;                      /* common header context variables */
-  u64b_t  X[SKEIN_512_STATE_WORDS];        /* chaining variables */
-  u08b_t  b[SKEIN_512_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
+	Skein_Ctxt_Hdr_t h;				 /* common header context variables */
+	u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */
+	u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
 } Skein_512_Ctxt_t;
 
-typedef struct                               /* 1024-bit Skein hash context structure */
+typedef struct /* 1024-bit Skein hash context structure */
 {
-  Skein_Ctxt_Hdr_t h;                      /* common header context variables */
-  u64b_t  X[SKEIN1024_STATE_WORDS];        /* chaining variables */
-  u08b_t  b[SKEIN1024_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
+	Skein_Ctxt_Hdr_t h;				 /* common header context variables */
+	u64b_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */
+	u08b_t b[SKEIN1024_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
 } Skein1024_Ctxt_t;
 
 /*   Skein APIs for (incremental) "straight hashing" */
 #if SKEIN_256_NIST_MAX_HASH_BITS
-static int  Skein_256_Init  (Skein_256_Ctxt_t *ctx, size_t hashBitLen);
+static int Skein_256_Init(Skein_256_Ctxt_t* ctx, size_t hashBitLen);
 #endif
-static int  Skein_512_Init  (Skein_512_Ctxt_t *ctx, size_t hashBitLen);
-static int  Skein1024_Init  (Skein1024_Ctxt_t *ctx, size_t hashBitLen);
+static int Skein_512_Init(Skein_512_Ctxt_t* ctx, size_t hashBitLen);
+static int Skein1024_Init(Skein1024_Ctxt_t* ctx, size_t hashBitLen);
 
-static int  Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
-static int  Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
-static int  Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+static int Skein_256_Update(Skein_256_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt);
+static int Skein_512_Update(Skein_512_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt);
+static int Skein1024_Update(Skein1024_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt);
 
-static int  Skein_256_Final (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
-static int  Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
-static int  Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+static int Skein_256_Final(Skein_256_Ctxt_t* ctx, u08b_t* hashVal);
+static int Skein_512_Final(Skein_512_Ctxt_t* ctx, u08b_t* hashVal);
+static int Skein1024_Final(Skein1024_Ctxt_t* ctx, u08b_t* hashVal);
 
 /*
 **   Skein APIs for "extended" initialization: MAC keys, tree hashing.
@@ -126,7 +126,7 @@ static int  Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
 #define SKEIN_TREE_HASH (1)
 #endif
 #if 0
-#if  SKEIN_TREE_HASH
+#if SKEIN_TREE_HASH
 static int  Skein_256_Output   (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
 static int  Skein_512_Output   (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
 static int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
@@ -142,128 +142,146 @@ static int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
 ******************************************************************/
 
 /* tweak word T[1]: bit field starting positions */
-#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
+#define SKEIN_T1_BIT(BIT) ((BIT)-64) /* offset 64 because it's the second word  */
 
-#define SKEIN_T1_POS_TREE_LVL   SKEIN_T1_BIT(112)       /* bits 112..118: level in hash tree       */
-#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
-#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
-#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
-#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
+#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree       */
+#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119)  /* bit  119     : partial final input byte */
+#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field               */
+#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126)	/* bit  126     : first block flag         */
+#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127)	/* bit  127     : final block flag         */
 
 /* tweak word T[1]: flag bit definition(s) */
-#define SKEIN_T1_FLAG_FIRST     (((u64b_t)  1 ) << SKEIN_T1_POS_FIRST)
-#define SKEIN_T1_FLAG_FINAL     (((u64b_t)  1 ) << SKEIN_T1_POS_FINAL)
-#define SKEIN_T1_FLAG_BIT_PAD   (((u64b_t)  1 ) << SKEIN_T1_POS_BIT_PAD)
+#define SKEIN_T1_FLAG_FIRST (((u64b_t)1) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_FINAL (((u64b_t)1) << SKEIN_T1_POS_FINAL)
+#define SKEIN_T1_FLAG_BIT_PAD (((u64b_t)1) << SKEIN_T1_POS_BIT_PAD)
 
 /* tweak word T[1]: tree level bit field mask */
-#define SKEIN_T1_TREE_LVL_MASK  (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
-#define SKEIN_T1_TREE_LEVEL(n)  (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL)
+#define SKEIN_T1_TREE_LVL_MASK (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
+#define SKEIN_T1_TREE_LEVEL(n) (((u64b_t)(n)) << SKEIN_T1_POS_TREE_LVL)
 
 /* tweak word T[1]: block type field */
-#define SKEIN_BLK_TYPE_KEY      ( 0)                    /* key, for MAC and KDF */
-#define SKEIN_BLK_TYPE_CFG      ( 4)                    /* configuration block */
-#define SKEIN_BLK_TYPE_PERS     ( 8)                    /* personalization string */
-#define SKEIN_BLK_TYPE_PK       (12)                    /* public key (for digital signature hashing) */
-#define SKEIN_BLK_TYPE_KDF      (16)                    /* key identifier for KDF */
-#define SKEIN_BLK_TYPE_NONCE    (20)                    /* nonce for PRNG */
-#define SKEIN_BLK_TYPE_MSG      (48)                    /* message processing */
-#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */
-#define SKEIN_BLK_TYPE_MASK     (63)                    /* bit field mask */
-
-#define SKEIN_T1_BLK_TYPE(T)   (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
-#define SKEIN_T1_BLK_TYPE_KEY   SKEIN_T1_BLK_TYPE(KEY)  /* key, for MAC and KDF */
-#define SKEIN_T1_BLK_TYPE_CFG   SKEIN_T1_BLK_TYPE(CFG)  /* configuration block */
-#define SKEIN_T1_BLK_TYPE_PERS  SKEIN_T1_BLK_TYPE(PERS) /* personalization string */
-#define SKEIN_T1_BLK_TYPE_PK    SKEIN_T1_BLK_TYPE(PK)   /* public key (for digital signature hashing) */
-#define SKEIN_T1_BLK_TYPE_KDF   SKEIN_T1_BLK_TYPE(KDF)  /* key identifier for KDF */
-#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */
-#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing */
-#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
-#define SKEIN_T1_BLK_TYPE_MASK  SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */
-
-#define SKEIN_T1_BLK_TYPE_CFG_FINAL       (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
-#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
-
-#define SKEIN_VERSION           (1)
-
-#ifndef SKEIN_ID_STRING_LE      /* allow compile-time personalization */
-#define SKEIN_ID_STRING_LE      (0x33414853)            /* "SHA3" (little-endian)*/
-#endif
-
-#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((u64b_t) (hi32)) << 32))
-#define SKEIN_SCHEMA_VER        SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)
-#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
-
-#define SKEIN_CFG_STR_LEN       (4*8)
+#define SKEIN_BLK_TYPE_KEY (0)	/* key, for MAC and KDF */
+#define SKEIN_BLK_TYPE_CFG (4)	/* configuration block */
+#define SKEIN_BLK_TYPE_PERS (8)   /* personalization string */
+#define SKEIN_BLK_TYPE_PK (12)	/* public key (for digital signature hashing) */
+#define SKEIN_BLK_TYPE_KDF (16)   /* key identifier for KDF */
+#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */
+#define SKEIN_BLK_TYPE_MSG (48)   /* message processing */
+#define SKEIN_BLK_TYPE_OUT (63)   /* output stage */
+#define SKEIN_BLK_TYPE_MASK (63)  /* bit field mask */
+
+#define SKEIN_T1_BLK_TYPE(T) (((u64b_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY)	 /* key, for MAC and KDF */
+#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG)	 /* configuration block */
+#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS)   /* personalization string */
+#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK)		 /* public key (for digital signature hashing) */
+#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF)	 /* key identifier for KDF */
+#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE) /* nonce for PRNG */
+#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG)	 /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT)	 /* output stage */
+#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK)   /* field bit mask */
+
+#define SKEIN_T1_BLK_TYPE_CFG_FINAL (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+
+#define SKEIN_VERSION (1)
+
+#ifndef SKEIN_ID_STRING_LE				/* allow compile-time personalization */
+#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/
+#endif
+
+#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((u64b_t)(hi32)) << 32))
+#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION, SKEIN_ID_STRING_LE)
+#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22)
+
+#define SKEIN_CFG_STR_LEN (4 * 8)
 
 /* bit field definitions in config block treeInfo word */
-#define SKEIN_CFG_TREE_LEAF_SIZE_POS  ( 0)
-#define SKEIN_CFG_TREE_NODE_SIZE_POS  ( 8)
-#define SKEIN_CFG_TREE_MAX_LEVEL_POS  (16)
+#define SKEIN_CFG_TREE_LEAF_SIZE_POS (0)
+#define SKEIN_CFG_TREE_NODE_SIZE_POS (8)
+#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16)
 
-#define SKEIN_CFG_TREE_LEAF_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
-#define SKEIN_CFG_TREE_NODE_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
-#define SKEIN_CFG_TREE_MAX_LEVEL_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
+#define SKEIN_CFG_TREE_LEAF_SIZE_MSK (((u64b_t)0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
+#define SKEIN_CFG_TREE_NODE_SIZE_MSK (((u64b_t)0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
+#define SKEIN_CFG_TREE_MAX_LEVEL_MSK (((u64b_t)0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
 
-#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl)                   \
-  ( (((u64b_t)(leaf  )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) |    \
-  (((u64b_t)(node  )) << SKEIN_CFG_TREE_NODE_SIZE_POS) |    \
-  (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) )
+#define SKEIN_CFG_TREE_INFO(leaf, node, maxLvl)              \
+	((((u64b_t)(leaf)) << SKEIN_CFG_TREE_LEAF_SIZE_POS) |    \
+		(((u64b_t)(node)) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \
+		(((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS))
 
-#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */
+#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0, 0, 0) /* use as treeInfo in InitExt() call for sequential processing */
 
 /*
 **   Skein macros for getting/setting tweak words, etc.
 **   These are useful for partial input bytes, hash tree init/update, etc.
 **/
-#define Skein_Get_Tweak(ctxPtr,TWK_NUM)         ((ctxPtr)->h.T[TWK_NUM])
-#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
+#define Skein_Get_Tweak(ctxPtr, TWK_NUM) ((ctxPtr)->h.T[TWK_NUM])
+#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \
+	{                                          \
+		(ctxPtr)->h.T[TWK_NUM] = (tVal);       \
+	}
 
-#define Skein_Get_T0(ctxPtr)    Skein_Get_Tweak(ctxPtr,0)
-#define Skein_Get_T1(ctxPtr)    Skein_Get_Tweak(ctxPtr,1)
-#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
-#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
+#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr, 0)
+#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr, 1)
+#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0)
+#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1)
 
 /* set both tweak words at once */
-#define Skein_Set_T0_T1(ctxPtr,T0,T1)           \
-{                                           \
-  Skein_Set_T0(ctxPtr,(T0));                  \
-  Skein_Set_T1(ctxPtr,(T1));                  \
-}
+#define Skein_Set_T0_T1(ctxPtr, T0, T1) \
+	{                                   \
+		Skein_Set_T0(ctxPtr, (T0));     \
+		Skein_Set_T1(ctxPtr, (T1));     \
+	}
 
-#define Skein_Set_Type(ctxPtr,BLK_TYPE)         \
-  Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE)
+#define Skein_Set_Type(ctxPtr, BLK_TYPE) \
+	Skein_Set_T1(ctxPtr, SKEIN_T1_BLK_TYPE_##BLK_TYPE)
 
 /* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */
-#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)   \
-{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
+#define Skein_Start_New_Type(ctxPtr, BLK_TYPE)                                          \
+	{                                                                                   \
+		Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); \
+		(ctxPtr)->h.bCnt = 0;                                                           \
+	}
 
-#define Skein_Clear_First_Flag(hdr)      { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST;       }
-#define Skein_Set_Bit_Pad_Flag(hdr)      { (hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;     }
+#define Skein_Clear_First_Flag(hdr)         \
+	{                                       \
+		(hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; \
+	}
+#define Skein_Set_Bit_Pad_Flag(hdr)          \
+	{                                        \
+		(hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \
+	}
 
-#define Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);}
+#define Skein_Set_Tree_Level(hdr, height)          \
+	{                                              \
+		(hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height); \
+	}
 
 /*****************************************************************
 ** "Internal" Skein definitions for debugging and error checking
 ******************************************************************/
-#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr)
-#define Skein_Show_Round(bits,ctx,r,X)
-#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr)
-#define Skein_Show_Final(bits,ctx,cnt,outPtr)
-#define Skein_Show_Key(bits,ctx,key,keyBytes)
-
-
-#ifndef SKEIN_ERR_CHECK        /* run-time checks (e.g., bad params, uninitialized context)? */
-#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */
+#define Skein_Show_Block(bits, ctx, X, blkPtr, wPtr, ksEvenPtr, ksOddPtr)
+#define Skein_Show_Round(bits, ctx, r, X)
+#define Skein_Show_R_Ptr(bits, ctx, r, X_ptr)
+#define Skein_Show_Final(bits, ctx, cnt, outPtr)
+#define Skein_Show_Key(bits, ctx, key, keyBytes)
+
+#ifndef SKEIN_ERR_CHECK			 /* run-time checks (e.g., bad params, uninitialized context)? */
+#define Skein_Assert(x, retCode) /* default: ignore all Asserts, for performance */
 #define Skein_assert(x)
-#elif   defined(SKEIN_ASSERT)
+#elif defined(SKEIN_ASSERT)
 #include <assert.h>
-#define Skein_Assert(x,retCode) assert(x)
-#define Skein_assert(x)         assert(x)
+#define Skein_Assert(x, retCode) assert(x)
+#define Skein_assert(x) assert(x)
 #else
 #include <assert.h>
-#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /*  caller  error */
-#define Skein_assert(x)         assert(x)                     /* internal error */
+#define Skein_Assert(x, retCode) \
+	{                            \
+		if(!(x))                 \
+			return retCode;      \
+	}							  /*  caller  error */
+#define Skein_assert(x) assert(x) /* internal error */
 #endif
 
 /*****************************************************************
@@ -271,48 +289,135 @@ static int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
 ******************************************************************/
 enum
 {
-  /* Skein_256 round rotation constants */
-  R_256_0_0=14, R_256_0_1=16,
-  R_256_1_0=52, R_256_1_1=57,
-  R_256_2_0=23, R_256_2_1=40,
-  R_256_3_0= 5, R_256_3_1=37,
-  R_256_4_0=25, R_256_4_1=33,
-  R_256_5_0=46, R_256_5_1=12,
-  R_256_6_0=58, R_256_6_1=22,
-  R_256_7_0=32, R_256_7_1=32,
-
-  /* Skein_512 round rotation constants */
-  R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
-  R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
-  R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
-  R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
-  R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
-  R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
-  R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
-  R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
-
-  /* Skein1024 round rotation constants */
-  R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37,
-  R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52,
-  R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17,
-  R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25,
-  R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30,
-  R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41,
-  R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25,
-  R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20
+	/* Skein_256 round rotation constants */
+	R_256_0_0 = 14,
+	R_256_0_1 = 16,
+	R_256_1_0 = 52,
+	R_256_1_1 = 57,
+	R_256_2_0 = 23,
+	R_256_2_1 = 40,
+	R_256_3_0 = 5,
+	R_256_3_1 = 37,
+	R_256_4_0 = 25,
+	R_256_4_1 = 33,
+	R_256_5_0 = 46,
+	R_256_5_1 = 12,
+	R_256_6_0 = 58,
+	R_256_6_1 = 22,
+	R_256_7_0 = 32,
+	R_256_7_1 = 32,
+
+	/* Skein_512 round rotation constants */
+	R_512_0_0 = 46,
+	R_512_0_1 = 36,
+	R_512_0_2 = 19,
+	R_512_0_3 = 37,
+	R_512_1_0 = 33,
+	R_512_1_1 = 27,
+	R_512_1_2 = 14,
+	R_512_1_3 = 42,
+	R_512_2_0 = 17,
+	R_512_2_1 = 49,
+	R_512_2_2 = 36,
+	R_512_2_3 = 39,
+	R_512_3_0 = 44,
+	R_512_3_1 = 9,
+	R_512_3_2 = 54,
+	R_512_3_3 = 56,
+	R_512_4_0 = 39,
+	R_512_4_1 = 30,
+	R_512_4_2 = 34,
+	R_512_4_3 = 24,
+	R_512_5_0 = 13,
+	R_512_5_1 = 50,
+	R_512_5_2 = 10,
+	R_512_5_3 = 17,
+	R_512_6_0 = 25,
+	R_512_6_1 = 29,
+	R_512_6_2 = 39,
+	R_512_6_3 = 43,
+	R_512_7_0 = 8,
+	R_512_7_1 = 35,
+	R_512_7_2 = 56,
+	R_512_7_3 = 22,
+
+	/* Skein1024 round rotation constants */
+	R1024_0_0 = 24,
+	R1024_0_1 = 13,
+	R1024_0_2 = 8,
+	R1024_0_3 = 47,
+	R1024_0_4 = 8,
+	R1024_0_5 = 17,
+	R1024_0_6 = 22,
+	R1024_0_7 = 37,
+	R1024_1_0 = 38,
+	R1024_1_1 = 19,
+	R1024_1_2 = 10,
+	R1024_1_3 = 55,
+	R1024_1_4 = 49,
+	R1024_1_5 = 18,
+	R1024_1_6 = 23,
+	R1024_1_7 = 52,
+	R1024_2_0 = 33,
+	R1024_2_1 = 4,
+	R1024_2_2 = 51,
+	R1024_2_3 = 13,
+	R1024_2_4 = 34,
+	R1024_2_5 = 41,
+	R1024_2_6 = 59,
+	R1024_2_7 = 17,
+	R1024_3_0 = 5,
+	R1024_3_1 = 20,
+	R1024_3_2 = 48,
+	R1024_3_3 = 41,
+	R1024_3_4 = 47,
+	R1024_3_5 = 28,
+	R1024_3_6 = 16,
+	R1024_3_7 = 25,
+	R1024_4_0 = 41,
+	R1024_4_1 = 9,
+	R1024_4_2 = 37,
+	R1024_4_3 = 31,
+	R1024_4_4 = 12,
+	R1024_4_5 = 47,
+	R1024_4_6 = 44,
+	R1024_4_7 = 30,
+	R1024_5_0 = 16,
+	R1024_5_1 = 34,
+	R1024_5_2 = 56,
+	R1024_5_3 = 51,
+	R1024_5_4 = 4,
+	R1024_5_5 = 53,
+	R1024_5_6 = 42,
+	R1024_5_7 = 41,
+	R1024_6_0 = 31,
+	R1024_6_1 = 44,
+	R1024_6_2 = 47,
+	R1024_6_3 = 46,
+	R1024_6_4 = 19,
+	R1024_6_5 = 42,
+	R1024_6_6 = 44,
+	R1024_6_7 = 25,
+	R1024_7_0 = 9,
+	R1024_7_1 = 48,
+	R1024_7_2 = 35,
+	R1024_7_3 = 52,
+	R1024_7_4 = 23,
+	R1024_7_5 = 31,
+	R1024_7_6 = 37,
+	R1024_7_7 = 20
 };
 
 #ifndef SKEIN_ROUNDS
-#define SKEIN_256_ROUNDS_TOTAL (72)          /* number of rounds for the different block sizes */
+#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */
 #define SKEIN_512_ROUNDS_TOTAL (72)
 #define SKEIN1024_ROUNDS_TOTAL (80)
-#else                                        /* allow command-line define in range 8*(5..14)   */
-#define SKEIN_256_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/100) + 5) % 10) + 5))
-#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5))
-#define SKEIN1024_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS    ) + 5) % 10) + 5))
+#else /* allow command-line define in range 8*(5..14)   */
+#define SKEIN_256_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS / 100) + 5) % 10) + 5))
+#define SKEIN_512_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS / 10) + 5) % 10) + 5))
+#define SKEIN1024_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS) + 5) % 10) + 5))
 #endif
 
-
 /*
 ***************** Pre-computed Skein IVs *******************
 **
@@ -332,239 +437,233 @@ enum
 /* blkSize =  256 bits. hashSize =  128 bits */
 const u64b_t SKEIN_256_IV_128[] =
 	{
-	MK_64(0xE1111906,0x964D7260),
-	MK_64(0x883DAAA7,0x7C8D811C),
-	MK_64(0x10080DF4,0x91960F7A),
-	MK_64(0xCCF7DDE5,0xB45BC1C2)
-	};
+		MK_64(0xE1111906, 0x964D7260),
+		MK_64(0x883DAAA7, 0x7C8D811C),
+		MK_64(0x10080DF4, 0x91960F7A),
+		MK_64(0xCCF7DDE5, 0xB45BC1C2)};
 
 /* blkSize =  256 bits. hashSize =  160 bits */
 const u64b_t SKEIN_256_IV_160[] =
 	{
-	MK_64(0x14202314,0x72825E98),
-	MK_64(0x2AC4E9A2,0x5A77E590),
-	MK_64(0xD47A5856,0x8838D63E),
-	MK_64(0x2DD2E496,0x8586AB7D)
-	};
+		MK_64(0x14202314, 0x72825E98),
+		MK_64(0x2AC4E9A2, 0x5A77E590),
+		MK_64(0xD47A5856, 0x8838D63E),
+		MK_64(0x2DD2E496, 0x8586AB7D)};
 
 /* blkSize =  256 bits. hashSize =  224 bits */
 const u64b_t SKEIN_256_IV_224[] =
 	{
-	MK_64(0xC6098A8C,0x9AE5EA0B),
-	MK_64(0x876D5686,0x08C5191C),
-	MK_64(0x99CB88D7,0xD7F53884),
-	MK_64(0x384BDDB1,0xAEDDB5DE)
-	};
+		MK_64(0xC6098A8C, 0x9AE5EA0B),
+		MK_64(0x876D5686, 0x08C5191C),
+		MK_64(0x99CB88D7, 0xD7F53884),
+		MK_64(0x384BDDB1, 0xAEDDB5DE)};
 
 /* blkSize =  256 bits. hashSize =  256 bits */
 const u64b_t SKEIN_256_IV_256[] =
 	{
-	MK_64(0xFC9DA860,0xD048B449),
-	MK_64(0x2FCA6647,0x9FA7D833),
-	MK_64(0xB33BC389,0x6656840F),
-	MK_64(0x6A54E920,0xFDE8DA69)
-	};
+		MK_64(0xFC9DA860, 0xD048B449),
+		MK_64(0x2FCA6647, 0x9FA7D833),
+		MK_64(0xB33BC389, 0x6656840F),
+		MK_64(0x6A54E920, 0xFDE8DA69)};
 
 /* blkSize =  512 bits. hashSize =  128 bits */
 const u64b_t SKEIN_512_IV_128[] =
 	{
-	MK_64(0xA8BC7BF3,0x6FBF9F52),
-	MK_64(0x1E9872CE,0xBD1AF0AA),
-	MK_64(0x309B1790,0xB32190D3),
-	MK_64(0xBCFBB854,0x3F94805C),
-	MK_64(0x0DA61BCD,0x6E31B11B),
-	MK_64(0x1A18EBEA,0xD46A32E3),
-	MK_64(0xA2CC5B18,0xCE84AA82),
-	MK_64(0x6982AB28,0x9D46982D)
-	};
+		MK_64(0xA8BC7BF3, 0x6FBF9F52),
+		MK_64(0x1E9872CE, 0xBD1AF0AA),
+		MK_64(0x309B1790, 0xB32190D3),
+		MK_64(0xBCFBB854, 0x3F94805C),
+		MK_64(0x0DA61BCD, 0x6E31B11B),
+		MK_64(0x1A18EBEA, 0xD46A32E3),
+		MK_64(0xA2CC5B18, 0xCE84AA82),
+		MK_64(0x6982AB28, 0x9D46982D)};
 
 /* blkSize =  512 bits. hashSize =  160 bits */
 const u64b_t SKEIN_512_IV_160[] =
 	{
-	MK_64(0x28B81A2A,0xE013BD91),
-	MK_64(0xC2F11668,0xB5BDF78F),
-	MK_64(0x1760D8F3,0xF6A56F12),
-	MK_64(0x4FB74758,0x8239904F),
-	MK_64(0x21EDE07F,0x7EAF5056),
-	MK_64(0xD908922E,0x63ED70B8),
-	MK_64(0xB8EC76FF,0xECCB52FA),
-	MK_64(0x01A47BB8,0xA3F27A6E)
-	};
+		MK_64(0x28B81A2A, 0xE013BD91),
+		MK_64(0xC2F11668, 0xB5BDF78F),
+		MK_64(0x1760D8F3, 0xF6A56F12),
+		MK_64(0x4FB74758, 0x8239904F),
+		MK_64(0x21EDE07F, 0x7EAF5056),
+		MK_64(0xD908922E, 0x63ED70B8),
+		MK_64(0xB8EC76FF, 0xECCB52FA),
+		MK_64(0x01A47BB8, 0xA3F27A6E)};
 
 /* blkSize =  512 bits. hashSize =  224 bits */
 const u64b_t SKEIN_512_IV_224[] =
 	{
-	MK_64(0xCCD06162,0x48677224),
-	MK_64(0xCBA65CF3,0xA92339EF),
-	MK_64(0x8CCD69D6,0x52FF4B64),
-	MK_64(0x398AED7B,0x3AB890B4),
-	MK_64(0x0F59D1B1,0x457D2BD0),
-	MK_64(0x6776FE65,0x75D4EB3D),
-	MK_64(0x99FBC70E,0x997413E9),
-	MK_64(0x9E2CFCCF,0xE1C41EF7)
-	};
+		MK_64(0xCCD06162, 0x48677224),
+		MK_64(0xCBA65CF3, 0xA92339EF),
+		MK_64(0x8CCD69D6, 0x52FF4B64),
+		MK_64(0x398AED7B, 0x3AB890B4),
+		MK_64(0x0F59D1B1, 0x457D2BD0),
+		MK_64(0x6776FE65, 0x75D4EB3D),
+		MK_64(0x99FBC70E, 0x997413E9),
+		MK_64(0x9E2CFCCF, 0xE1C41EF7)};
 
 /* blkSize =  512 bits. hashSize =  256 bits */
 const u64b_t SKEIN_512_IV_256[] =
 	{
-	MK_64(0xCCD044A1,0x2FDB3E13),
-	MK_64(0xE8359030,0x1A79A9EB),
-	MK_64(0x55AEA061,0x4F816E6F),
-	MK_64(0x2A2767A4,0xAE9B94DB),
-	MK_64(0xEC06025E,0x74DD7683),
-	MK_64(0xE7A436CD,0xC4746251),
-	MK_64(0xC36FBAF9,0x393AD185),
-	MK_64(0x3EEDBA18,0x33EDFC13)
-	};
+		MK_64(0xCCD044A1, 0x2FDB3E13),
+		MK_64(0xE8359030, 0x1A79A9EB),
+		MK_64(0x55AEA061, 0x4F816E6F),
+		MK_64(0x2A2767A4, 0xAE9B94DB),
+		MK_64(0xEC06025E, 0x74DD7683),
+		MK_64(0xE7A436CD, 0xC4746251),
+		MK_64(0xC36FBAF9, 0x393AD185),
+		MK_64(0x3EEDBA18, 0x33EDFC13)};
 
 /* blkSize =  512 bits. hashSize =  384 bits */
 const u64b_t SKEIN_512_IV_384[] =
 	{
-	MK_64(0xA3F6C6BF,0x3A75EF5F),
-	MK_64(0xB0FEF9CC,0xFD84FAA4),
-	MK_64(0x9D77DD66,0x3D770CFE),
-	MK_64(0xD798CBF3,0xB468FDDA),
-	MK_64(0x1BC4A666,0x8A0E4465),
-	MK_64(0x7ED7D434,0xE5807407),
-	MK_64(0x548FC1AC,0xD4EC44D6),
-	MK_64(0x266E1754,0x6AA18FF8)
-	};
+		MK_64(0xA3F6C6BF, 0x3A75EF5F),
+		MK_64(0xB0FEF9CC, 0xFD84FAA4),
+		MK_64(0x9D77DD66, 0x3D770CFE),
+		MK_64(0xD798CBF3, 0xB468FDDA),
+		MK_64(0x1BC4A666, 0x8A0E4465),
+		MK_64(0x7ED7D434, 0xE5807407),
+		MK_64(0x548FC1AC, 0xD4EC44D6),
+		MK_64(0x266E1754, 0x6AA18FF8)};
 
 /* blkSize =  512 bits. hashSize =  512 bits */
 const u64b_t SKEIN_512_IV_512[] =
 	{
-	MK_64(0x4903ADFF,0x749C51CE),
-	MK_64(0x0D95DE39,0x9746DF03),
-	MK_64(0x8FD19341,0x27C79BCE),
-	MK_64(0x9A255629,0xFF352CB1),
-	MK_64(0x5DB62599,0xDF6CA7B0),
-	MK_64(0xEABE394C,0xA9D5C3F4),
-	MK_64(0x991112C7,0x1A75B523),
-	MK_64(0xAE18A40B,0x660FCC33)
-	};
+		MK_64(0x4903ADFF, 0x749C51CE),
+		MK_64(0x0D95DE39, 0x9746DF03),
+		MK_64(0x8FD19341, 0x27C79BCE),
+		MK_64(0x9A255629, 0xFF352CB1),
+		MK_64(0x5DB62599, 0xDF6CA7B0),
+		MK_64(0xEABE394C, 0xA9D5C3F4),
+		MK_64(0x991112C7, 0x1A75B523),
+		MK_64(0xAE18A40B, 0x660FCC33)};
 
 /* blkSize = 1024 bits. hashSize =  384 bits */
 const u64b_t SKEIN1024_IV_384[] =
 	{
-	MK_64(0x5102B6B8,0xC1894A35),
-	MK_64(0xFEEBC9E3,0xFE8AF11A),
-	MK_64(0x0C807F06,0xE32BED71),
-	MK_64(0x60C13A52,0xB41A91F6),
-	MK_64(0x9716D35D,0xD4917C38),
-	MK_64(0xE780DF12,0x6FD31D3A),
-	MK_64(0x797846B6,0xC898303A),
-	MK_64(0xB172C2A8,0xB3572A3B),
-	MK_64(0xC9BC8203,0xA6104A6C),
-	MK_64(0x65909338,0xD75624F4),
-	MK_64(0x94BCC568,0x4B3F81A0),
-	MK_64(0x3EBBF51E,0x10ECFD46),
-	MK_64(0x2DF50F0B,0xEEB08542),
-	MK_64(0x3B5A6530,0x0DBC6516),
-	MK_64(0x484B9CD2,0x167BBCE1),
-	MK_64(0x2D136947,0xD4CBAFEA)
-	};
+		MK_64(0x5102B6B8, 0xC1894A35),
+		MK_64(0xFEEBC9E3, 0xFE8AF11A),
+		MK_64(0x0C807F06, 0xE32BED71),
+		MK_64(0x60C13A52, 0xB41A91F6),
+		MK_64(0x9716D35D, 0xD4917C38),
+		MK_64(0xE780DF12, 0x6FD31D3A),
+		MK_64(0x797846B6, 0xC898303A),
+		MK_64(0xB172C2A8, 0xB3572A3B),
+		MK_64(0xC9BC8203, 0xA6104A6C),
+		MK_64(0x65909338, 0xD75624F4),
+		MK_64(0x94BCC568, 0x4B3F81A0),
+		MK_64(0x3EBBF51E, 0x10ECFD46),
+		MK_64(0x2DF50F0B, 0xEEB08542),
+		MK_64(0x3B5A6530, 0x0DBC6516),
+		MK_64(0x484B9CD2, 0x167BBCE1),
+		MK_64(0x2D136947, 0xD4CBAFEA)};
 
 /* blkSize = 1024 bits. hashSize =  512 bits */
 const u64b_t SKEIN1024_IV_512[] =
 	{
-	MK_64(0xCAEC0E5D,0x7C1B1B18),
-	MK_64(0xA01B0E04,0x5F03E802),
-	MK_64(0x33840451,0xED912885),
-	MK_64(0x374AFB04,0xEAEC2E1C),
-	MK_64(0xDF25A0E2,0x813581F7),
-	MK_64(0xE4004093,0x8B12F9D2),
-	MK_64(0xA662D539,0xC2ED39B6),
-	MK_64(0xFA8B85CF,0x45D8C75A),
-	MK_64(0x8316ED8E,0x29EDE796),
-	MK_64(0x053289C0,0x2E9F91B8),
-	MK_64(0xC3F8EF1D,0x6D518B73),
-	MK_64(0xBDCEC3C4,0xD5EF332E),
-	MK_64(0x549A7E52,0x22974487),
-	MK_64(0x67070872,0x5B749816),
-	MK_64(0xB9CD28FB,0xF0581BD1),
-	MK_64(0x0E2940B8,0x15804974)
-	};
+		MK_64(0xCAEC0E5D, 0x7C1B1B18),
+		MK_64(0xA01B0E04, 0x5F03E802),
+		MK_64(0x33840451, 0xED912885),
+		MK_64(0x374AFB04, 0xEAEC2E1C),
+		MK_64(0xDF25A0E2, 0x813581F7),
+		MK_64(0xE4004093, 0x8B12F9D2),
+		MK_64(0xA662D539, 0xC2ED39B6),
+		MK_64(0xFA8B85CF, 0x45D8C75A),
+		MK_64(0x8316ED8E, 0x29EDE796),
+		MK_64(0x053289C0, 0x2E9F91B8),
+		MK_64(0xC3F8EF1D, 0x6D518B73),
+		MK_64(0xBDCEC3C4, 0xD5EF332E),
+		MK_64(0x549A7E52, 0x22974487),
+		MK_64(0x67070872, 0x5B749816),
+		MK_64(0xB9CD28FB, 0xF0581BD1),
+		MK_64(0x0E2940B8, 0x15804974)};
 
 /* blkSize = 1024 bits. hashSize = 1024 bits */
 const u64b_t SKEIN1024_IV_1024[] =
 	{
-	MK_64(0xD593DA07,0x41E72355),
-	MK_64(0x15B5E511,0xAC73E00C),
-	MK_64(0x5180E5AE,0xBAF2C4F0),
-	MK_64(0x03BD41D3,0xFCBCAFAF),
-	MK_64(0x1CAEC6FD,0x1983A898),
-	MK_64(0x6E510B8B,0xCDD0589F),
-	MK_64(0x77E2BDFD,0xC6394ADA),
-	MK_64(0xC11E1DB5,0x24DCB0A3),
-	MK_64(0xD6D14AF9,0xC6329AB5),
-	MK_64(0x6A9B0BFC,0x6EB67E0D),
-	MK_64(0x9243C60D,0xCCFF1332),
-	MK_64(0x1A1F1DDE,0x743F02D4),
-	MK_64(0x0996753C,0x10ED0BB8),
-	MK_64(0x6572DD22,0xF2B4969A),
-	MK_64(0x61FD3062,0xD00A579A),
-	MK_64(0x1DE0536E,0x8682E539)
-	};
-
+		MK_64(0xD593DA07, 0x41E72355),
+		MK_64(0x15B5E511, 0xAC73E00C),
+		MK_64(0x5180E5AE, 0xBAF2C4F0),
+		MK_64(0x03BD41D3, 0xFCBCAFAF),
+		MK_64(0x1CAEC6FD, 0x1983A898),
+		MK_64(0x6E510B8B, 0xCDD0589F),
+		MK_64(0x77E2BDFD, 0xC6394ADA),
+		MK_64(0xC11E1DB5, 0x24DCB0A3),
+		MK_64(0xD6D14AF9, 0xC6329AB5),
+		MK_64(0x6A9B0BFC, 0x6EB67E0D),
+		MK_64(0x9243C60D, 0xCCFF1332),
+		MK_64(0x1A1F1DDE, 0x743F02D4),
+		MK_64(0x0996753C, 0x10ED0BB8),
+		MK_64(0x6572DD22, 0xF2B4969A),
+		MK_64(0x61FD3062, 0xD00A579A),
+		MK_64(0x1DE0536E, 0x8682E539)};
 
 #ifndef SKEIN_USE_ASM
-#define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
+#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */
 #endif
 
 #ifndef SKEIN_LOOP
-#define SKEIN_LOOP 001                          /* default: unroll 256 and 512, but not 1024 */
+#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */
 #endif
 
-#define BLK_BITS        (WCNT*64)               /* some useful definitions for code here */
-#define KW_TWK_BASE     (0)
-#define KW_KEY_BASE     (3)
-#define ks              (kw + KW_KEY_BASE)
-#define ts              (kw + KW_TWK_BASE)
+#define BLK_BITS (WCNT * 64) /* some useful definitions for code here */
+#define KW_TWK_BASE (0)
+#define KW_KEY_BASE (3)
+#define ks (kw + KW_KEY_BASE)
+#define ts (kw + KW_TWK_BASE)
 
 #ifdef SKEIN_DEBUG
-#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; }
+#define DebugSaveTweak(ctx)  \
+	{                        \
+		ctx->h.T[0] = ts[0]; \
+		ctx->h.T[1] = ts[1]; \
+	}
 #else
 #define DebugSaveTweak(ctx)
 #endif
 
 /*****************************  Skein_256 ******************************/
 #if !(SKEIN_USE_ASM & 256)
-static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
-	{ /* do it in C */
+static void Skein_256_Process_Block(Skein_256_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd)
+{ /* do it in C */
 	enum
-		{
+	{
 		WCNT = SKEIN_256_STATE_WORDS
-		};
-#undef  RCNT
-#define RCNT  (SKEIN_256_ROUNDS_TOTAL/8)
+	};
+#undef RCNT
+#define RCNT (SKEIN_256_ROUNDS_TOTAL / 8)
 
-#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
-#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10)
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
 #else
 #define SKEIN_UNROLL_256 (0)
 #endif
 
 #if SKEIN_UNROLL_256
-#if (RCNT % SKEIN_UNROLL_256)
+#if(RCNT % SKEIN_UNROLL_256)
 #error "Invalid SKEIN_UNROLL_256"               /* sanity check on unroll count */
 #endif
-	size_t  r;
-	u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+	size_t r;
+	u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/
 #else
-	u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+	u64b_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
 #endif
-	u64b_t  X0,X1,X2,X3;                        /* local copy of context vars, for speed */
-	u64b_t  w [WCNT];                           /* local copy of input block */
+	u64b_t X0, X1, X2, X3; /* local copy of context vars, for speed */
+	u64b_t w[WCNT];		   /* local copy of input block */
 #ifdef SKEIN_DEBUG
-	const u64b_t *Xptr[4];                      /* use for debugging (help compiler put Xn in registers) */
-	Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
+	const u64b_t* Xptr[4]; /* use for debugging (help compiler put Xn in registers) */
+	Xptr[0] = &X0;
+	Xptr[1] = &X1;
+	Xptr[2] = &X2;
+	Xptr[3] = &X3;
 #endif
-	Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+	Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
 	ts[0] = ctx->h.T[0];
 	ts[1] = ctx->h.T[1];
-	do  {
+	do
+	{
 		/* this implementation only supports 2**64 input bytes (no carry out here) */
-		ts[0] += byteCntAdd;                    /* update processed length */
+		ts[0] += byteCntAdd; /* update processed length */
 
 		/* precompute the key schedule for this block */
 		ks[0] = ctx->X[0];
@@ -575,114 +674,118 @@ static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,s
 
 		ts[2] = ts[0] ^ ts[1];
 
-		Skein_Get64_LSB_First(w,blkPtr,WCNT);   /* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */
 		DebugSaveTweak(ctx);
-		Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 
-		X0 = w[0] + ks[0];                      /* do the first full key injection */
+		X0 = w[0] + ks[0]; /* do the first full key injection */
 		X1 = w[1] + ks[1] + ts[0];
 		X2 = w[2] + ks[2] + ts[1];
 		X3 = w[3] + ks[3];
 
-		Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);    /* show starting state values */
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr); /* show starting state values */
 
 		blkPtr += SKEIN_256_BLOCK_BYTES;
 
 		/* run the rounds */
 
-#define Round256(p0,p1,p2,p3,ROT,rNum)                              \
-	X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
-	X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+#define Round256(p0, p1, p2, p3, ROT, rNum) \
+	X##p0 += X##p1;                         \
+	X##p1 = RotL_64(X##p1, ROT##_0);        \
+	X##p1 ^= X##p0;                         \
+	X##p2 += X##p3;                         \
+	X##p3 = RotL_64(X##p3, ROT##_1);        \
+	X##p3 ^= X##p2;
 
 #if SKEIN_UNROLL_256 == 0
-#define R256(p0,p1,p2,p3,ROT,rNum)           /* fully unrolled */   \
-	Round256(p0,p1,p2,p3,ROT,rNum)                                  \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
-
-#define I256(R)                                                     \
-	X0   += ks[((R)+1) % 5];    /* inject the key schedule value */ \
-	X1   += ks[((R)+2) % 5] + ts[((R)+1) % 3];                      \
-	X2   += ks[((R)+3) % 5] + ts[((R)+2) % 3];                      \
-	X3   += ks[((R)+4) % 5] +     (R)+1;                            \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-#else                                       /* looping version */
-#define R256(p0,p1,p2,p3,ROT,rNum)                                  \
-	Round256(p0,p1,p2,p3,ROT,rNum)                                  \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
-
-#define I256(R)                                                     \
-	X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
-	X1   += ks[r+(R)+1] + ts[r+(R)+0];                              \
-	X2   += ks[r+(R)+2] + ts[r+(R)+1];                              \
-	X3   += ks[r+(R)+3] +    r+(R)   ;                              \
-	ks[r + (R)+4    ]   = ks[r+(R)-1];     /* rotate key schedule */\
-	ts[r + (R)+2    ]   = ts[r+(R)-1];                              \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-
-	for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_256)  /* loop thru it */
+#define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \
+	Round256(p0, p1, p2, p3, ROT, rNum)                      \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define I256(R)                                                  \
+	X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \
+	X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3];                 \
+	X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3];                 \
+	X3 += ks[((R) + 4) % 5] + (R) + 1;                           \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R256(p0, p1, p2, p3, ROT, rNum) \
+	Round256(p0, p1, p2, p3, ROT, rNum) \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define I256(R)                                                \
+	X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+	X1 += ks[r + (R) + 1] + ts[r + (R) + 0];                   \
+	X2 += ks[r + (R) + 2] + ts[r + (R) + 1];                   \
+	X3 += ks[r + (R) + 3] + r + (R);                           \
+	ks[r + (R) + 4] = ks[r + (R)-1]; /* rotate key schedule */ \
+	ts[r + (R) + 2] = ts[r + (R)-1];                           \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		for(r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256) /* loop thru it */
 #endif
 		{
-#define R256_8_rounds(R)                  \
-		R256(0,1,2,3,R_256_0,8*(R) + 1);  \
-		R256(0,3,2,1,R_256_1,8*(R) + 2);  \
-		R256(0,1,2,3,R_256_2,8*(R) + 3);  \
-		R256(0,3,2,1,R_256_3,8*(R) + 4);  \
-		I256(2*(R));                      \
-		R256(0,1,2,3,R_256_4,8*(R) + 5);  \
-		R256(0,3,2,1,R_256_5,8*(R) + 6);  \
-		R256(0,1,2,3,R_256_6,8*(R) + 7);  \
-		R256(0,3,2,1,R_256_7,8*(R) + 8);  \
-		I256(2*(R)+1);
-
-		R256_8_rounds( 0);
-
-#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
-
-  #if   R256_Unroll_R( 1)
-		R256_8_rounds( 1);
-  #endif
-  #if   R256_Unroll_R( 2)
-		R256_8_rounds( 2);
-  #endif
-  #if   R256_Unroll_R( 3)
-		R256_8_rounds( 3);
-  #endif
-  #if   R256_Unroll_R( 4)
-		R256_8_rounds( 4);
-  #endif
-  #if   R256_Unroll_R( 5)
-		R256_8_rounds( 5);
-  #endif
-  #if   R256_Unroll_R( 6)
-		R256_8_rounds( 6);
-  #endif
-  #if   R256_Unroll_R( 7)
-		R256_8_rounds( 7);
-  #endif
-  #if   R256_Unroll_R( 8)
-		R256_8_rounds( 8);
-  #endif
-  #if   R256_Unroll_R( 9)
-		R256_8_rounds( 9);
-  #endif
-  #if   R256_Unroll_R(10)
-		R256_8_rounds(10);
-  #endif
-  #if   R256_Unroll_R(11)
-		R256_8_rounds(11);
-  #endif
-  #if   R256_Unroll_R(12)
-		R256_8_rounds(12);
-  #endif
-  #if   R256_Unroll_R(13)
-		R256_8_rounds(13);
-  #endif
-  #if   R256_Unroll_R(14)
-		R256_8_rounds(14);
-  #endif
-  #if  (SKEIN_UNROLL_256 > 14)
-#error  "need more unrolling in Skein_256_Process_Block"
-  #endif
+#define R256_8_rounds(R)                    \
+	R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \
+	R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \
+	R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \
+	R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \
+	I256(2 * (R));                          \
+	R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \
+	R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \
+	R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \
+	R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \
+	I256(2 * (R) + 1);
+
+			R256_8_rounds(0);
+
+#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
+
+#if R256_Unroll_R(1)
+			R256_8_rounds(1);
+#endif
+#if R256_Unroll_R(2)
+			R256_8_rounds(2);
+#endif
+#if R256_Unroll_R(3)
+			R256_8_rounds(3);
+#endif
+#if R256_Unroll_R(4)
+			R256_8_rounds(4);
+#endif
+#if R256_Unroll_R(5)
+			R256_8_rounds(5);
+#endif
+#if R256_Unroll_R(6)
+			R256_8_rounds(6);
+#endif
+#if R256_Unroll_R(7)
+			R256_8_rounds(7);
+#endif
+#if R256_Unroll_R(8)
+			R256_8_rounds(8);
+#endif
+#if R256_Unroll_R(9)
+			R256_8_rounds(9);
+#endif
+#if R256_Unroll_R(10)
+			R256_8_rounds(10);
+#endif
+#if R256_Unroll_R(11)
+			R256_8_rounds(11);
+#endif
+#if R256_Unroll_R(12)
+			R256_8_rounds(12);
+#endif
+#if R256_Unroll_R(13)
+			R256_8_rounds(13);
+#endif
+#if R256_Unroll_R(14)
+			R256_8_rounds(14);
+#endif
+#if(SKEIN_UNROLL_256 > 14)
+#error "need more unrolling in Skein_256_Process_Block"
+#endif
 		}
 		/* do the final "feedforward" xor, update context chaining vars */
 		ctx->X[0] = X0 ^ w[0];
@@ -690,68 +793,74 @@ static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,s
 		ctx->X[2] = X2 ^ w[2];
 		ctx->X[3] = X3 ^ w[3];
 
-		Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 
 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
-		}
-	while (--blkCnt);
+	} while(--blkCnt);
 	ctx->h.T[0] = ts[0];
 	ctx->h.T[1] = ts[1];
-	}
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein_256_Process_Block_CodeSize(void)
-	{
-	return ((u08b_t *) Skein_256_Process_Block_CodeSize) -
-		   ((u08b_t *) Skein_256_Process_Block);
-	}
+{
+	return ((u08b_t*)Skein_256_Process_Block_CodeSize) -
+		   ((u08b_t*)Skein_256_Process_Block);
+}
 static uint_t Skein_256_Unroll_Cnt(void)
-	{
+{
 	return SKEIN_UNROLL_256;
-	}
+}
 #endif
 #endif
 
 /*****************************  Skein_512 ******************************/
 #if !(SKEIN_USE_ASM & 512)
-static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
-	{ /* do it in C */
+static void Skein_512_Process_Block(Skein_512_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd)
+{ /* do it in C */
 	enum
-		{
+	{
 		WCNT = SKEIN_512_STATE_WORDS
-		};
-#undef  RCNT
-#define RCNT  (SKEIN_512_ROUNDS_TOTAL/8)
+	};
+#undef RCNT
+#define RCNT (SKEIN_512_ROUNDS_TOTAL / 8)
 
-#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
-#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10)
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
 #else
 #define SKEIN_UNROLL_512 (0)
 #endif
 
 #if SKEIN_UNROLL_512
-#if (RCNT % SKEIN_UNROLL_512)
+#if(RCNT % SKEIN_UNROLL_512)
 #error "Invalid SKEIN_UNROLL_512"               /* sanity check on unroll count */
 #endif
-	size_t  r;
-	u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+	size_t r;
+	u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/
 #else
-	u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+	u64b_t kw[WCNT + 4];									/* key schedule words : chaining vars + tweak */
 #endif
-	u64b_t  X0,X1,X2,X3,X4,X5,X6,X7;            /* local copy of vars, for speed */
-	u64b_t  w [WCNT];                           /* local copy of input block */
+	u64b_t X0, X1, X2, X3, X4, X5, X6, X7; /* local copy of vars, for speed */
+	u64b_t w[WCNT];						   /* local copy of input block */
 #ifdef SKEIN_DEBUG
-	const u64b_t *Xptr[8];                      /* use for debugging (help compiler put Xn in registers) */
-	Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
-	Xptr[4] = &X4;  Xptr[5] = &X5;  Xptr[6] = &X6;  Xptr[7] = &X7;
+	const u64b_t* Xptr[8]; /* use for debugging (help compiler put Xn in registers) */
+	Xptr[0] = &X0;
+	Xptr[1] = &X1;
+	Xptr[2] = &X2;
+	Xptr[3] = &X3;
+	Xptr[4] = &X4;
+	Xptr[5] = &X5;
+	Xptr[6] = &X6;
+	Xptr[7] = &X7;
 #endif
 
-	Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+	Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
 	ts[0] = ctx->h.T[0];
 	ts[1] = ctx->h.T[1];
-	do  {
+	do
+	{
 		/* this implementation only supports 2**64 input bytes (no carry out here) */
-		ts[0] += byteCntAdd;                    /* update processed length */
+		ts[0] += byteCntAdd; /* update processed length */
 
 		/* precompute the key schedule for this block */
 		ks[0] = ctx->X[0];
@@ -767,126 +876,134 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s
 
 		ts[2] = ts[0] ^ ts[1];
 
-		Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */
 		DebugSaveTweak(ctx);
-		Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 
-		X0   = w[0] + ks[0];                    /* do the first full key injection */
-		X1   = w[1] + ks[1];
-		X2   = w[2] + ks[2];
-		X3   = w[3] + ks[3];
-		X4   = w[4] + ks[4];
-		X5   = w[5] + ks[5] + ts[0];
-		X6   = w[6] + ks[6] + ts[1];
-		X7   = w[7] + ks[7];
+		X0 = w[0] + ks[0]; /* do the first full key injection */
+		X1 = w[1] + ks[1];
+		X2 = w[2] + ks[2];
+		X3 = w[3] + ks[3];
+		X4 = w[4] + ks[4];
+		X5 = w[5] + ks[5] + ts[0];
+		X6 = w[6] + ks[6] + ts[1];
+		X7 = w[7] + ks[7];
 
 		blkPtr += SKEIN_512_BLOCK_BYTES;
 
-		Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr);
 		/* run the rounds */
-#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                  \
-	X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
-	X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
-	X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
-	X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
+#define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+	X##p0 += X##p1;                                         \
+	X##p1 = RotL_64(X##p1, ROT##_0);                        \
+	X##p1 ^= X##p0;                                         \
+	X##p2 += X##p3;                                         \
+	X##p3 = RotL_64(X##p3, ROT##_1);                        \
+	X##p3 ^= X##p2;                                         \
+	X##p4 += X##p5;                                         \
+	X##p5 = RotL_64(X##p5, ROT##_2);                        \
+	X##p5 ^= X##p4;                                         \
+	X##p6 += X##p7;                                         \
+	X##p7 = RotL_64(X##p7, ROT##_3);                        \
+	X##p7 ^= X##p6;
 
 #if SKEIN_UNROLL_512 == 0
-#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      /* unrolled */  \
-	Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
-
-#define I512(R)                                                     \
-	X0   += ks[((R)+1) % 9];   /* inject the key schedule value */  \
-	X1   += ks[((R)+2) % 9];                                        \
-	X2   += ks[((R)+3) % 9];                                        \
-	X3   += ks[((R)+4) % 9];                                        \
-	X4   += ks[((R)+5) % 9];                                        \
-	X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3];                      \
-	X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3];                      \
-	X7   += ks[((R)+8) % 9] +     (R)+1;                            \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-#else                                       /* looping version */
-#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
-	Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
-
-#define I512(R)                                                     \
-	X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
-	X1   += ks[r+(R)+1];                                            \
-	X2   += ks[r+(R)+2];                                            \
-	X3   += ks[r+(R)+3];                                            \
-	X4   += ks[r+(R)+4];                                            \
-	X5   += ks[r+(R)+5] + ts[r+(R)+0];                              \
-	X6   += ks[r+(R)+6] + ts[r+(R)+1];                              \
-	X7   += ks[r+(R)+7] +    r+(R)   ;                              \
-	ks[r +       (R)+8] = ks[r+(R)-1];  /* rotate key schedule */   \
-	ts[r +       (R)+2] = ts[r+(R)-1];                              \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-
-	for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512)   /* loop thru it */
-#endif                         /* end of looped code definitions */
+#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \
+	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)                \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define I512(R)                                                  \
+	X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */ \
+	X1 += ks[((R) + 2) % 9];                                     \
+	X2 += ks[((R) + 3) % 9];                                     \
+	X3 += ks[((R) + 4) % 9];                                     \
+	X4 += ks[((R) + 5) % 9];                                     \
+	X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3];                 \
+	X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3];                 \
+	X7 += ks[((R) + 8) % 9] + (R) + 1;                           \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define I512(R)                                                \
+	X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+	X1 += ks[r + (R) + 1];                                     \
+	X2 += ks[r + (R) + 2];                                     \
+	X3 += ks[r + (R) + 3];                                     \
+	X4 += ks[r + (R) + 4];                                     \
+	X5 += ks[r + (R) + 5] + ts[r + (R) + 0];                   \
+	X6 += ks[r + (R) + 6] + ts[r + (R) + 1];                   \
+	X7 += ks[r + (R) + 7] + r + (R);                           \
+	ks[r + (R) + 8] = ks[r + (R)-1]; /* rotate key schedule */ \
+	ts[r + (R) + 2] = ts[r + (R)-1];                           \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		for(r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512) /* loop thru it */
+#endif /* end of looped code definitions */
 		{
-#define R512_8_rounds(R)  /* do 8 full rounds */  \
-		R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
-		R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
-		R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
-		R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
-		I512(2*(R));                              \
-		R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
-		R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
-		R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
-		R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
-		I512(2*(R)+1);        /* and key injection */
-
-		R512_8_rounds( 0);
-
-#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))
-
-  #if   R512_Unroll_R( 1)
-		R512_8_rounds( 1);
-  #endif
-  #if   R512_Unroll_R( 2)
-		R512_8_rounds( 2);
-  #endif
-  #if   R512_Unroll_R( 3)
-		R512_8_rounds( 3);
-  #endif
-  #if   R512_Unroll_R( 4)
-		R512_8_rounds( 4);
-  #endif
-  #if   R512_Unroll_R( 5)
-		R512_8_rounds( 5);
-  #endif
-  #if   R512_Unroll_R( 6)
-		R512_8_rounds( 6);
-  #endif
-  #if   R512_Unroll_R( 7)
-		R512_8_rounds( 7);
-  #endif
-  #if   R512_Unroll_R( 8)
-		R512_8_rounds( 8);
-  #endif
-  #if   R512_Unroll_R( 9)
-		R512_8_rounds( 9);
-  #endif
-  #if   R512_Unroll_R(10)
-		R512_8_rounds(10);
-  #endif
-  #if   R512_Unroll_R(11)
-		R512_8_rounds(11);
-  #endif
-  #if   R512_Unroll_R(12)
-		R512_8_rounds(12);
-  #endif
-  #if   R512_Unroll_R(13)
-		R512_8_rounds(13);
-  #endif
-  #if   R512_Unroll_R(14)
-		R512_8_rounds(14);
-  #endif
-  #if  (SKEIN_UNROLL_512 > 14)
-#error  "need more unrolling in Skein_512_Process_Block"
-  #endif
+#define R512_8_rounds(R) /* do 8 full rounds */         \
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \
+	I512(2 * (R));                                      \
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \
+	I512(2 * (R) + 1); /* and key injection */
+
+			R512_8_rounds(0);
+
+#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))
+
+#if R512_Unroll_R(1)
+			R512_8_rounds(1);
+#endif
+#if R512_Unroll_R(2)
+			R512_8_rounds(2);
+#endif
+#if R512_Unroll_R(3)
+			R512_8_rounds(3);
+#endif
+#if R512_Unroll_R(4)
+			R512_8_rounds(4);
+#endif
+#if R512_Unroll_R(5)
+			R512_8_rounds(5);
+#endif
+#if R512_Unroll_R(6)
+			R512_8_rounds(6);
+#endif
+#if R512_Unroll_R(7)
+			R512_8_rounds(7);
+#endif
+#if R512_Unroll_R(8)
+			R512_8_rounds(8);
+#endif
+#if R512_Unroll_R(9)
+			R512_8_rounds(9);
+#endif
+#if R512_Unroll_R(10)
+			R512_8_rounds(10);
+#endif
+#if R512_Unroll_R(11)
+			R512_8_rounds(11);
+#endif
+#if R512_Unroll_R(12)
+			R512_8_rounds(12);
+#endif
+#if R512_Unroll_R(13)
+			R512_8_rounds(13);
+#endif
+#if R512_Unroll_R(14)
+			R512_8_rounds(14);
+#endif
+#if(SKEIN_UNROLL_512 > 14)
+#error "need more unrolling in Skein_512_Process_Block"
+#endif
 		}
 
 		/* do the final "feedforward" xor, update context chaining vars */
@@ -898,256 +1015,284 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s
 		ctx->X[5] = X5 ^ w[5];
 		ctx->X[6] = X6 ^ w[6];
 		ctx->X[7] = X7 ^ w[7];
-		Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 
 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
-		}
-	while (--blkCnt);
+	} while(--blkCnt);
 	ctx->h.T[0] = ts[0];
 	ctx->h.T[1] = ts[1];
-	}
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein_512_Process_Block_CodeSize(void)
-	{
-	return ((u08b_t *) Skein_512_Process_Block_CodeSize) -
-		   ((u08b_t *) Skein_512_Process_Block);
-	}
+{
+	return ((u08b_t*)Skein_512_Process_Block_CodeSize) -
+		   ((u08b_t*)Skein_512_Process_Block);
+}
 static uint_t Skein_512_Unroll_Cnt(void)
-	{
+{
 	return SKEIN_UNROLL_512;
-	}
+}
 #endif
 #endif
 
 /*****************************  Skein1024 ******************************/
 #if !(SKEIN_USE_ASM & 1024)
-static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
-	{ /* do it in C, always looping (unrolled is bigger AND slower!) */
+static void Skein1024_Process_Block(Skein1024_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd)
+{ /* do it in C, always looping (unrolled is bigger AND slower!) */
 	enum
-		{
+	{
 		WCNT = SKEIN1024_STATE_WORDS
-		};
-#undef  RCNT
-#define RCNT  (SKEIN1024_ROUNDS_TOTAL/8)
+	};
+#undef RCNT
+#define RCNT (SKEIN1024_ROUNDS_TOTAL / 8)
 
-#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
-#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_1024 ((SKEIN_LOOP) % 10)
 #else
 #define SKEIN_UNROLL_1024 (0)
 #endif
 
-#if (SKEIN_UNROLL_1024 != 0)
-#if (RCNT % SKEIN_UNROLL_1024)
+#if(SKEIN_UNROLL_1024 != 0)
+#if(RCNT % SKEIN_UNROLL_1024)
 #error "Invalid SKEIN_UNROLL_1024"              /* sanity check on unroll count */
 #endif
-	size_t  r;
-	u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+	size_t r;
+	u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/
 #else
-	u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+	u64b_t kw[WCNT + 4];									/* key schedule words : chaining vars + tweak */
 #endif
 
-	u64b_t  X00,X01,X02,X03,X04,X05,X06,X07,    /* local copy of vars, for speed */
-			X08,X09,X10,X11,X12,X13,X14,X15;
-	u64b_t  w [WCNT];                           /* local copy of input block */
+	u64b_t X00, X01, X02, X03, X04, X05, X06, X07, /* local copy of vars, for speed */
+		X08, X09, X10, X11, X12, X13, X14, X15;
+	u64b_t w[WCNT]; /* local copy of input block */
 #ifdef SKEIN_DEBUG
-	const u64b_t *Xptr[16];                     /* use for debugging (help compiler put Xn in registers) */
-	Xptr[ 0] = &X00;  Xptr[ 1] = &X01;  Xptr[ 2] = &X02;  Xptr[ 3] = &X03;
-	Xptr[ 4] = &X04;  Xptr[ 5] = &X05;  Xptr[ 6] = &X06;  Xptr[ 7] = &X07;
-	Xptr[ 8] = &X08;  Xptr[ 9] = &X09;  Xptr[10] = &X10;  Xptr[11] = &X11;
-	Xptr[12] = &X12;  Xptr[13] = &X13;  Xptr[14] = &X14;  Xptr[15] = &X15;
+	const u64b_t* Xptr[16]; /* use for debugging (help compiler put Xn in registers) */
+	Xptr[0] = &X00;
+	Xptr[1] = &X01;
+	Xptr[2] = &X02;
+	Xptr[3] = &X03;
+	Xptr[4] = &X04;
+	Xptr[5] = &X05;
+	Xptr[6] = &X06;
+	Xptr[7] = &X07;
+	Xptr[8] = &X08;
+	Xptr[9] = &X09;
+	Xptr[10] = &X10;
+	Xptr[11] = &X11;
+	Xptr[12] = &X12;
+	Xptr[13] = &X13;
+	Xptr[14] = &X14;
+	Xptr[15] = &X15;
 #endif
 
-	Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+	Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
 	ts[0] = ctx->h.T[0];
 	ts[1] = ctx->h.T[1];
-	do  {
+	do
+	{
 		/* this implementation only supports 2**64 input bytes (no carry out here) */
-		ts[0] += byteCntAdd;                    /* update processed length */
+		ts[0] += byteCntAdd; /* update processed length */
 
 		/* precompute the key schedule for this block */
-		ks[ 0] = ctx->X[ 0];
-		ks[ 1] = ctx->X[ 1];
-		ks[ 2] = ctx->X[ 2];
-		ks[ 3] = ctx->X[ 3];
-		ks[ 4] = ctx->X[ 4];
-		ks[ 5] = ctx->X[ 5];
-		ks[ 6] = ctx->X[ 6];
-		ks[ 7] = ctx->X[ 7];
-		ks[ 8] = ctx->X[ 8];
-		ks[ 9] = ctx->X[ 9];
+		ks[0] = ctx->X[0];
+		ks[1] = ctx->X[1];
+		ks[2] = ctx->X[2];
+		ks[3] = ctx->X[3];
+		ks[4] = ctx->X[4];
+		ks[5] = ctx->X[5];
+		ks[6] = ctx->X[6];
+		ks[7] = ctx->X[7];
+		ks[8] = ctx->X[8];
+		ks[9] = ctx->X[9];
 		ks[10] = ctx->X[10];
 		ks[11] = ctx->X[11];
 		ks[12] = ctx->X[12];
 		ks[13] = ctx->X[13];
 		ks[14] = ctx->X[14];
 		ks[15] = ctx->X[15];
-		ks[16] = ks[ 0] ^ ks[ 1] ^ ks[ 2] ^ ks[ 3] ^
-				 ks[ 4] ^ ks[ 5] ^ ks[ 6] ^ ks[ 7] ^
-				 ks[ 8] ^ ks[ 9] ^ ks[10] ^ ks[11] ^
+		ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+				 ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
+				 ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
 				 ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
 
-		ts[2]  = ts[0] ^ ts[1];
+		ts[2] = ts[0] ^ ts[1];
 
-		Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */
 		DebugSaveTweak(ctx);
-		Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
-
-		X00    = w[ 0] + ks[ 0];                 /* do the first full key injection */
-		X01    = w[ 1] + ks[ 1];
-		X02    = w[ 2] + ks[ 2];
-		X03    = w[ 3] + ks[ 3];
-		X04    = w[ 4] + ks[ 4];
-		X05    = w[ 5] + ks[ 5];
-		X06    = w[ 6] + ks[ 6];
-		X07    = w[ 7] + ks[ 7];
-		X08    = w[ 8] + ks[ 8];
-		X09    = w[ 9] + ks[ 9];
-		X10    = w[10] + ks[10];
-		X11    = w[11] + ks[11];
-		X12    = w[12] + ks[12];
-		X13    = w[13] + ks[13] + ts[0];
-		X14    = w[14] + ks[14] + ts[1];
-		X15    = w[15] + ks[15];
-
-		Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
-
-#define Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rNum) \
-	X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0;   \
-	X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2;   \
-	X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4;   \
-	X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6;   \
-	X##p8 += X##p9; X##p9 = RotL_64(X##p9,ROT##_4); X##p9 ^= X##p8;   \
-	X##pA += X##pB; X##pB = RotL_64(X##pB,ROT##_5); X##pB ^= X##pA;   \
-	X##pC += X##pD; X##pD = RotL_64(X##pD,ROT##_6); X##pD ^= X##pC;   \
-	X##pE += X##pF; X##pF = RotL_64(X##pF,ROT##_7); X##pF ^= X##pE;   \
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+		X00 = w[0] + ks[0]; /* do the first full key injection */
+		X01 = w[1] + ks[1];
+		X02 = w[2] + ks[2];
+		X03 = w[3] + ks[3];
+		X04 = w[4] + ks[4];
+		X05 = w[5] + ks[5];
+		X06 = w[6] + ks[6];
+		X07 = w[7] + ks[7];
+		X08 = w[8] + ks[8];
+		X09 = w[9] + ks[9];
+		X10 = w[10] + ks[10];
+		X11 = w[11] + ks[11];
+		X12 = w[12] + ks[12];
+		X13 = w[13] + ks[13] + ts[0];
+		X14 = w[14] + ks[14] + ts[1];
+		X15 = w[15] + ks[15];
+
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr);
+
+#define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rNum) \
+	X##p0 += X##p1;                                                                          \
+	X##p1 = RotL_64(X##p1, ROT##_0);                                                         \
+	X##p1 ^= X##p0;                                                                          \
+	X##p2 += X##p3;                                                                          \
+	X##p3 = RotL_64(X##p3, ROT##_1);                                                         \
+	X##p3 ^= X##p2;                                                                          \
+	X##p4 += X##p5;                                                                          \
+	X##p5 = RotL_64(X##p5, ROT##_2);                                                         \
+	X##p5 ^= X##p4;                                                                          \
+	X##p6 += X##p7;                                                                          \
+	X##p7 = RotL_64(X##p7, ROT##_3);                                                         \
+	X##p7 ^= X##p6;                                                                          \
+	X##p8 += X##p9;                                                                          \
+	X##p9 = RotL_64(X##p9, ROT##_4);                                                         \
+	X##p9 ^= X##p8;                                                                          \
+	X##pA += X##pB;                                                                          \
+	X##pB = RotL_64(X##pB, ROT##_5);                                                         \
+	X##pB ^= X##pA;                                                                          \
+	X##pC += X##pD;                                                                          \
+	X##pD = RotL_64(X##pD, ROT##_6);                                                         \
+	X##pD ^= X##pC;                                                                          \
+	X##pE += X##pF;                                                                          \
+	X##pF = RotL_64(X##pF, ROT##_7);                                                         \
+	X##pF ^= X##pE;
 
 #if SKEIN_UNROLL_1024 == 0
-#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
-	Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rn,Xptr);
-
-#define I1024(R)                                                      \
-	X00   += ks[((R)+ 1) % 17]; /* inject the key schedule value */   \
-	X01   += ks[((R)+ 2) % 17];                                       \
-	X02   += ks[((R)+ 3) % 17];                                       \
-	X03   += ks[((R)+ 4) % 17];                                       \
-	X04   += ks[((R)+ 5) % 17];                                       \
-	X05   += ks[((R)+ 6) % 17];                                       \
-	X06   += ks[((R)+ 7) % 17];                                       \
-	X07   += ks[((R)+ 8) % 17];                                       \
-	X08   += ks[((R)+ 9) % 17];                                       \
-	X09   += ks[((R)+10) % 17];                                       \
-	X10   += ks[((R)+11) % 17];                                       \
-	X11   += ks[((R)+12) % 17];                                       \
-	X12   += ks[((R)+13) % 17];                                       \
-	X13   += ks[((R)+14) % 17] + ts[((R)+1) % 3];                     \
-	X14   += ks[((R)+15) % 17] + ts[((R)+2) % 3];                     \
-	X15   += ks[((R)+16) % 17] +     (R)+1;                           \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-#else                                       /* looping version */
-#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
-	Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rn,Xptr);
-
-#define I1024(R)                                                      \
-	X00   += ks[r+(R)+ 0];    /* inject the key schedule value */     \
-	X01   += ks[r+(R)+ 1];                                            \
-	X02   += ks[r+(R)+ 2];                                            \
-	X03   += ks[r+(R)+ 3];                                            \
-	X04   += ks[r+(R)+ 4];                                            \
-	X05   += ks[r+(R)+ 5];                                            \
-	X06   += ks[r+(R)+ 6];                                            \
-	X07   += ks[r+(R)+ 7];                                            \
-	X08   += ks[r+(R)+ 8];                                            \
-	X09   += ks[r+(R)+ 9];                                            \
-	X10   += ks[r+(R)+10];                                            \
-	X11   += ks[r+(R)+11];                                            \
-	X12   += ks[r+(R)+12];                                            \
-	X13   += ks[r+(R)+13] + ts[r+(R)+0];                              \
-	X14   += ks[r+(R)+14] + ts[r+(R)+1];                              \
-	X15   += ks[r+(R)+15] +    r+(R)   ;                              \
-	ks[r  +       (R)+16] = ks[r+(R)-1];  /* rotate key schedule */   \
-	ts[r  +       (R)+ 2] = ts[r+(R)-1];                              \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-
-	for (r=1;r <= 2*RCNT;r+=2*SKEIN_UNROLL_1024)    /* loop thru it */
+#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
+	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
+
+#define I1024(R)                                                   \
+	X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */ \
+	X01 += ks[((R) + 2) % 17];                                     \
+	X02 += ks[((R) + 3) % 17];                                     \
+	X03 += ks[((R) + 4) % 17];                                     \
+	X04 += ks[((R) + 5) % 17];                                     \
+	X05 += ks[((R) + 6) % 17];                                     \
+	X06 += ks[((R) + 7) % 17];                                     \
+	X07 += ks[((R) + 8) % 17];                                     \
+	X08 += ks[((R) + 9) % 17];                                     \
+	X09 += ks[((R) + 10) % 17];                                    \
+	X10 += ks[((R) + 11) % 17];                                    \
+	X11 += ks[((R) + 12) % 17];                                    \
+	X12 += ks[((R) + 13) % 17];                                    \
+	X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3];                \
+	X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3];                \
+	X15 += ks[((R) + 16) % 17] + (R) + 1;                          \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
+	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
+
+#define I1024(R)                                                \
+	X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+	X01 += ks[r + (R) + 1];                                     \
+	X02 += ks[r + (R) + 2];                                     \
+	X03 += ks[r + (R) + 3];                                     \
+	X04 += ks[r + (R) + 4];                                     \
+	X05 += ks[r + (R) + 5];                                     \
+	X06 += ks[r + (R) + 6];                                     \
+	X07 += ks[r + (R) + 7];                                     \
+	X08 += ks[r + (R) + 8];                                     \
+	X09 += ks[r + (R) + 9];                                     \
+	X10 += ks[r + (R) + 10];                                    \
+	X11 += ks[r + (R) + 11];                                    \
+	X12 += ks[r + (R) + 12];                                    \
+	X13 += ks[r + (R) + 13] + ts[r + (R) + 0];                  \
+	X14 += ks[r + (R) + 14] + ts[r + (R) + 1];                  \
+	X15 += ks[r + (R) + 15] + r + (R);                          \
+	ks[r + (R) + 16] = ks[r + (R)-1]; /* rotate key schedule */ \
+	ts[r + (R) + 2] = ts[r + (R)-1];                            \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		for(r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024) /* loop thru it */
 #endif
 		{
-#define R1024_8_rounds(R)    /* do 8 full rounds */                               \
-		R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_0,8*(R) + 1); \
-		R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_1,8*(R) + 2); \
-		R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_2,8*(R) + 3); \
-		R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_3,8*(R) + 4); \
-		I1024(2*(R));                                                             \
-		R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_4,8*(R) + 5); \
-		R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_5,8*(R) + 6); \
-		R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_6,8*(R) + 7); \
-		R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_7,8*(R) + 8); \
-		I1024(2*(R)+1);
-
-		R1024_8_rounds( 0);
-
-#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN)))
-
-  #if   R1024_Unroll_R( 1)
-		R1024_8_rounds( 1);
-  #endif
-  #if   R1024_Unroll_R( 2)
-		R1024_8_rounds( 2);
-  #endif
-  #if   R1024_Unroll_R( 3)
-		R1024_8_rounds( 3);
-  #endif
-  #if   R1024_Unroll_R( 4)
-		R1024_8_rounds( 4);
-  #endif
-  #if   R1024_Unroll_R( 5)
-		R1024_8_rounds( 5);
-  #endif
-  #if   R1024_Unroll_R( 6)
-		R1024_8_rounds( 6);
-  #endif
-  #if   R1024_Unroll_R( 7)
-		R1024_8_rounds( 7);
-  #endif
-  #if   R1024_Unroll_R( 8)
-		R1024_8_rounds( 8);
-  #endif
-  #if   R1024_Unroll_R( 9)
-		R1024_8_rounds( 9);
-  #endif
-  #if   R1024_Unroll_R(10)
-		R1024_8_rounds(10);
-  #endif
-  #if   R1024_Unroll_R(11)
-		R1024_8_rounds(11);
-  #endif
-  #if   R1024_Unroll_R(12)
-		R1024_8_rounds(12);
-  #endif
-  #if   R1024_Unroll_R(13)
-		R1024_8_rounds(13);
-  #endif
-  #if   R1024_Unroll_R(14)
-		R1024_8_rounds(14);
-  #endif
-  #if  (SKEIN_UNROLL_1024 > 14)
-#error  "need more unrolling in Skein_1024_Process_Block"
-  #endif
+#define R1024_8_rounds(R) /* do 8 full rounds */                                                 \
+	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_0, 8 * (R) + 1); \
+	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_1, 8 * (R) + 2); \
+	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 10, 09, R1024_2, 8 * (R) + 3); \
+	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_3, 8 * (R) + 4); \
+	I1024(2 * (R));                                                                              \
+	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_4, 8 * (R) + 5); \
+	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_5, 8 * (R) + 6); \
+	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 10, 09, R1024_6, 8 * (R) + 7); \
+	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_7, 8 * (R) + 8); \
+	I1024(2 * (R) + 1);
+
+			R1024_8_rounds(0);
+
+#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_1024 > (NN)))
+
+#if R1024_Unroll_R(1)
+			R1024_8_rounds(1);
+#endif
+#if R1024_Unroll_R(2)
+			R1024_8_rounds(2);
+#endif
+#if R1024_Unroll_R(3)
+			R1024_8_rounds(3);
+#endif
+#if R1024_Unroll_R(4)
+			R1024_8_rounds(4);
+#endif
+#if R1024_Unroll_R(5)
+			R1024_8_rounds(5);
+#endif
+#if R1024_Unroll_R(6)
+			R1024_8_rounds(6);
+#endif
+#if R1024_Unroll_R(7)
+			R1024_8_rounds(7);
+#endif
+#if R1024_Unroll_R(8)
+			R1024_8_rounds(8);
+#endif
+#if R1024_Unroll_R(9)
+			R1024_8_rounds(9);
+#endif
+#if R1024_Unroll_R(10)
+			R1024_8_rounds(10);
+#endif
+#if R1024_Unroll_R(11)
+			R1024_8_rounds(11);
+#endif
+#if R1024_Unroll_R(12)
+			R1024_8_rounds(12);
+#endif
+#if R1024_Unroll_R(13)
+			R1024_8_rounds(13);
+#endif
+#if R1024_Unroll_R(14)
+			R1024_8_rounds(14);
+#endif
+#if(SKEIN_UNROLL_1024 > 14)
+#error "need more unrolling in Skein_1024_Process_Block"
+#endif
 		}
 		/* do the final "feedforward" xor, update context chaining vars */
 
-		ctx->X[ 0] = X00 ^ w[ 0];
-		ctx->X[ 1] = X01 ^ w[ 1];
-		ctx->X[ 2] = X02 ^ w[ 2];
-		ctx->X[ 3] = X03 ^ w[ 3];
-		ctx->X[ 4] = X04 ^ w[ 4];
-		ctx->X[ 5] = X05 ^ w[ 5];
-		ctx->X[ 6] = X06 ^ w[ 6];
-		ctx->X[ 7] = X07 ^ w[ 7];
-		ctx->X[ 8] = X08 ^ w[ 8];
-		ctx->X[ 9] = X09 ^ w[ 9];
+		ctx->X[0] = X00 ^ w[0];
+		ctx->X[1] = X01 ^ w[1];
+		ctx->X[2] = X02 ^ w[2];
+		ctx->X[3] = X03 ^ w[3];
+		ctx->X[4] = X04 ^ w[4];
+		ctx->X[5] = X05 ^ w[5];
+		ctx->X[6] = X06 ^ w[6];
+		ctx->X[7] = X07 ^ w[7];
+		ctx->X[8] = X08 ^ w[8];
+		ctx->X[9] = X09 ^ w[9];
 		ctx->X[10] = X10 ^ w[10];
 		ctx->X[11] = X11 ^ w[11];
 		ctx->X[12] = X12 ^ w[12];
@@ -1155,30 +1300,28 @@ static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,s
 		ctx->X[14] = X14 ^ w[14];
 		ctx->X[15] = X15 ^ w[15];
 
-		Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 
 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 		blkPtr += SKEIN1024_BLOCK_BYTES;
-		}
-	while (--blkCnt);
+	} while(--blkCnt);
 	ctx->h.T[0] = ts[0];
 	ctx->h.T[1] = ts[1];
-	}
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein1024_Process_Block_CodeSize(void)
-	{
-	return ((u08b_t *) Skein1024_Process_Block_CodeSize) -
-		   ((u08b_t *) Skein1024_Process_Block);
-	}
+{ /* byte distance from Skein1024_Process_Block to this function; presumably relies on the two being laid out back to back */
+	return ((u08b_t*)Skein1024_Process_Block_CodeSize) -
+		   ((u08b_t*)Skein1024_Process_Block);
+}
 static uint_t Skein1024_Unroll_Cnt(void)
-	{
+{ /* reports the compile-time SKEIN_UNROLL_1024 setting */
 	return SKEIN_UNROLL_1024;
-	}
+}
 #endif
 #endif
 
-
 #if 0
 /*****************************************************************/
 /*     256-bit Skein                                             */
@@ -1289,93 +1432,93 @@ static int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* process the input bytes */
-static int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
-	{
+static int Skein_256_Update(Skein_256_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt)
+{
 	size_t n;
 
-	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
 	/* process full blocks, if any */
-	if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES)
+	if(msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES)
+	{
+		if(ctx->h.bCnt) /* finish up any buffered message data */
 		{
-		if (ctx->h.bCnt)                              /* finish up any buffered message data */
+			n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */
+			if(n)
 			{
-			n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
-			if (n)
-				{
-				Skein_assert(n < msgByteCnt);         /* check on our logic here */
-				memcpy(&ctx->b[ctx->h.bCnt],msg,n);
-				msgByteCnt  -= n;
-				msg         += n;
+				Skein_assert(n < msgByteCnt); /* outer test already implies msgByteCnt > n */
+				memcpy(&ctx->b[ctx->h.bCnt], msg, n);
+				msgByteCnt -= n;
+				msg += n;
 				ctx->h.bCnt += n;
-				}
+			}
 			Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES);
-			Skein_256_Process_Block(ctx,ctx->b,1,SKEIN_256_BLOCK_BYTES);
+			Skein_256_Process_Block(ctx, ctx->b, 1, SKEIN_256_BLOCK_BYTES);
 			ctx->h.bCnt = 0;
-			}
+		}
 		/* now process any remaining full blocks, directly from input message data */
-		if (msgByteCnt > SKEIN_256_BLOCK_BYTES)
-			{
-			n = (msgByteCnt-1) / SKEIN_256_BLOCK_BYTES;   /* number of full blocks to process */
-			Skein_256_Process_Block(ctx,msg,n,SKEIN_256_BLOCK_BYTES);
+		if(msgByteCnt > SKEIN_256_BLOCK_BYTES)
+		{
+			n = (msgByteCnt - 1) / SKEIN_256_BLOCK_BYTES; /* full blocks to process now; the -1 leaves the last block (even if full) to be buffered below */
+			Skein_256_Process_Block(ctx, msg, n, SKEIN_256_BLOCK_BYTES);
 			msgByteCnt -= n * SKEIN_256_BLOCK_BYTES;
-			msg        += n * SKEIN_256_BLOCK_BYTES;
-			}
-		Skein_assert(ctx->h.bCnt == 0);
+			msg += n * SKEIN_256_BLOCK_BYTES;
 		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
 
 	/* copy any remaining source message data bytes into b[] */
-	if (msgByteCnt)
-		{
+	if(msgByteCnt)
+	{
 		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES);
-		memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+		memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt);
 		ctx->h.bCnt += msgByteCnt;
-		}
+	}
 
 	return SKEIN_SUCCESS;
-	}
+}
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize the hash computation and output the result */
-static int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
-	{
-	size_t i,n,byteCnt;
+static int Skein_256_Final(Skein_256_Ctxt_t* ctx, u08b_t* hashVal)
+{
+	size_t i, n, byteCnt;
 	u64b_t X[SKEIN_256_STATE_WORDS];
-	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
-	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
-	if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)            /* zero pad b[] if necessary */
-		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;		/* tag as the final block */
+	if(ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */
+		memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
 
-	Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+	Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); /* process the final block */
 
 	/* now output the result */
-	byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes, rounded up from bits */
 
 	/* run Threefish in "counter mode" to generate output */
-	memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
-	memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
-	for (i=0;i < byteCnt;i += SKEIN_256_BLOCK_BYTES)
-		{
-		((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
-		Skein_Start_New_Type(ctx,OUT_FINAL);
-		Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
-		n = byteCnt - i;   /* number of output bytes left to go */
-		if (n >= SKEIN_256_BLOCK_BYTES)
-			n  = SKEIN_256_BLOCK_BYTES;
-		Skein_Put64_LSB_First(hashVal+i,ctx->X,n);   /* "output" the ctr mode bytes */
-		Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES);
-		memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
-		}
-	return SKEIN_SUCCESS;
+	memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+	memcpy(X, ctx->X, sizeof(X));	  /* keep a local copy of counter mode "key" */
+	for(i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) /* i is the output block index, as in the 512/1024 variants */
+	{
+		((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* counter block = block index (0,1,2,...), not a byte offset */
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein_256_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */
+		n = byteCnt - i * SKEIN_256_BLOCK_BYTES;				 /* number of output bytes left to go */
+		if(n >= SKEIN_256_BLOCK_BYTES)
+			n = SKEIN_256_BLOCK_BYTES; /* emit at most one block per iteration */
+		Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES, ctx->X, n); /* "output" the ctr mode bytes */
+		Skein_Show_Final(256, &ctx->h, n, hashVal + i * SKEIN_256_BLOCK_BYTES);
+		memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */
 	}
+	return SKEIN_SUCCESS;
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein_256_API_CodeSize(void)
-	{
-	return ((u08b_t *) Skein_256_API_CodeSize) -
-		   ((u08b_t *) Skein_256_Init);
-	}
+{ /* byte distance from Skein_256_Init to this function; presumably relies on the API functions being laid out in source order */
+	return ((u08b_t*)Skein_256_API_CodeSize) -
+		   ((u08b_t*)Skein_256_Init);
+}
 #endif
 
 /*****************************************************************/
@@ -1384,47 +1527,54 @@ static size_t Skein_256_API_CodeSize(void)
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* init the context for a straight hashing operation  */
-static int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
-	{
-	union
-		{
-		u08b_t  b[SKEIN_512_STATE_BYTES];
-		u64b_t  w[SKEIN_512_STATE_WORDS];
-		} cfg;                              /* config block */
+static int Skein_512_Init(Skein_512_Ctxt_t* ctx, size_t hashBitLen)
+{
+	union {
+		u08b_t b[SKEIN_512_STATE_BYTES];
+		u64b_t w[SKEIN_512_STATE_WORDS];
+	} cfg; /* config block, filled as 64-bit words and processed as bytes */
 
-	Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
-	ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
 
-	switch (hashBitLen)
-		{             /* use pre-computed values, where available */
+	switch(hashBitLen)
+	{ /* use pre-computed IVs for 512/384/256/224 unless SKEIN_NO_PRECOMP is defined */
 #ifndef SKEIN_NO_PRECOMP
-		case  512: memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X));  break;
-		case  384: memcpy(ctx->X,SKEIN_512_IV_384,sizeof(ctx->X));  break;
-		case  256: memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X));  break;
-		case  224: memcpy(ctx->X,SKEIN_512_IV_224,sizeof(ctx->X));  break;
+	case 512:
+		memcpy(ctx->X, SKEIN_512_IV_512, sizeof(ctx->X));
+		break;
+	case 384:
+		memcpy(ctx->X, SKEIN_512_IV_384, sizeof(ctx->X));
+		break;
+	case 256:
+		memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X));
+		break;
+	case 224:
+		memcpy(ctx->X, SKEIN_512_IV_224, sizeof(ctx->X));
+		break;
 #endif
-		default:
-			/* here if there is no precomputed IV value available */
-			/* build/process the config block, type == CONFIG (could be precomputed) */
-			Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
-
-			cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* set the schema, version */
-			cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
-			cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
-			memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
-
-			/* compute the initial chaining values from config block */
-			memset(ctx->X,0,sizeof(ctx->X));            /* zero the chaining variables */
-			Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
-			break;
-		}
+	default:
+		/* here if there is no precomputed IV value available */
+		/* build/process the config block, type == CONFIG (could be precomputed) */
+		Skein_Start_New_Type(ctx, CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */
+
+		cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */
+		cfg.w[1] = Skein_Swap64(hashBitLen);	   /* hash result length in bits */
+		cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+		memset(&cfg.w[3], 0, sizeof(cfg) - 3 * sizeof(cfg.w[0])); /* zero every word after the three set above */
+
+		/* compute the initial chaining values from config block */
+		memset(ctx->X, 0, sizeof(ctx->X)); /* zero the chaining variables */
+		Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+		break;
+	}
 
 	/* The chaining vars ctx->X are now initialized for the given hashBitLen. */
 	/* Set up to process the data message portion of the hash (default) */
-	Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type */
+	Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */
 
 	return SKEIN_SUCCESS;
-	}
+}
 
 #if 0
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -1489,93 +1639,93 @@ static int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* process the input bytes */
-static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
-	{
+static int Skein_512_Update(Skein_512_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt)
+{
 	size_t n;
 
-	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
 	/* process full blocks, if any */
-	if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+	if(msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+	{
+		if(ctx->h.bCnt) /* finish up any buffered message data */
 		{
-		if (ctx->h.bCnt)                              /* finish up any buffered message data */
+			n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */
+			if(n)
 			{
-			n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
-			if (n)
-				{
-				Skein_assert(n < msgByteCnt);         /* check on our logic here */
-				memcpy(&ctx->b[ctx->h.bCnt],msg,n);
-				msgByteCnt  -= n;
-				msg         += n;
+				Skein_assert(n < msgByteCnt); /* outer test already implies msgByteCnt > n */
+				memcpy(&ctx->b[ctx->h.bCnt], msg, n);
+				msgByteCnt -= n;
+				msg += n;
 				ctx->h.bCnt += n;
-				}
+			}
 			Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
-			Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+			Skein_512_Process_Block(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES);
 			ctx->h.bCnt = 0;
-			}
+		}
 		/* now process any remaining full blocks, directly from input message data */
-		if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
-			{
-			n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;   /* number of full blocks to process */
-			Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
+		if(msgByteCnt > SKEIN_512_BLOCK_BYTES)
+		{
+			n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES; /* full blocks to process now; the -1 leaves the last block (even if full) to be buffered below */
+			Skein_512_Process_Block(ctx, msg, n, SKEIN_512_BLOCK_BYTES);
 			msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
-			msg        += n * SKEIN_512_BLOCK_BYTES;
-			}
-		Skein_assert(ctx->h.bCnt == 0);
+			msg += n * SKEIN_512_BLOCK_BYTES;
 		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
 
 	/* copy any remaining source message data bytes into b[] */
-	if (msgByteCnt)
-		{
+	if(msgByteCnt)
+	{
 		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
-		memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+		memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt);
 		ctx->h.bCnt += msgByteCnt;
-		}
+	}
 
 	return SKEIN_SUCCESS;
-	}
+}
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize the hash computation and output the result */
-static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
-	{
-	size_t i,n,byteCnt;
+static int Skein_512_Final(Skein_512_Ctxt_t* ctx, u08b_t* hashVal)
+{
+	size_t i, n, byteCnt;
 	u64b_t X[SKEIN_512_STATE_WORDS];
-	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
-	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
-	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)            /* zero pad b[] if necessary */
-		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;		/* tag as the final block */
+	if(ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */
+		memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
 
-	Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+	Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); /* process the final block */
 
 	/* now output the result */
-	byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes, rounded up from bits */
 
 	/* run Threefish in "counter mode" to generate output */
-	memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
-	memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
-	for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)
-		{
-		((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
-		Skein_Start_New_Type(ctx,OUT_FINAL);
-		Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
-		n = byteCnt - i*SKEIN_512_BLOCK_BYTES;   /* number of output bytes left to go */
-		if (n >= SKEIN_512_BLOCK_BYTES)
-			n  = SKEIN_512_BLOCK_BYTES;
-		Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
-		Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES);
-		memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
-		}
-	return SKEIN_SUCCESS;
+	memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+	memcpy(X, ctx->X, sizeof(X));	  /* keep a local copy of counter mode "key" */
+	for(i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++)
+	{
+		((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* counter block = output block index i */
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein_512_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */
+		n = byteCnt - i * SKEIN_512_BLOCK_BYTES;				 /* number of output bytes left to go */
+		if(n >= SKEIN_512_BLOCK_BYTES)
+			n = SKEIN_512_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES, ctx->X, n); /* "output" the ctr mode bytes */
+		Skein_Show_Final(512, &ctx->h, n, hashVal + i * SKEIN_512_BLOCK_BYTES);
+		memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */
 	}
+	return SKEIN_SUCCESS;
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein_512_API_CodeSize(void)
-	{
-	return ((u08b_t *) Skein_512_API_CodeSize) -
-		   ((u08b_t *) Skein_512_Init);
-	}
+{ /* byte distance from Skein_512_Init to this function; presumably relies on the API functions being laid out in source order */
+	return ((u08b_t*)Skein_512_API_CodeSize) -
+		   ((u08b_t*)Skein_512_Init);
+}
 #endif
 
 /*****************************************************************/
@@ -1583,46 +1733,51 @@ static size_t Skein_512_API_CodeSize(void)
 /*****************************************************************/
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* init the context for a straight hashing operation  */
-static int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
-	{
-	union
-		{
-		u08b_t  b[SKEIN1024_STATE_BYTES];
-		u64b_t  w[SKEIN1024_STATE_WORDS];
-		} cfg;                              /* config block */
+static int Skein1024_Init(Skein1024_Ctxt_t* ctx, size_t hashBitLen)
+{
+	union {
+		u08b_t b[SKEIN1024_STATE_BYTES];
+		u64b_t w[SKEIN1024_STATE_WORDS];
+	} cfg; /* config block, filled as 64-bit words and processed as bytes */
 
-	Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
-	ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
 
-	switch (hashBitLen)
-		{              /* use pre-computed values, where available */
+	switch(hashBitLen)
+	{ /* use pre-computed IVs for 512/384/1024 unless SKEIN_NO_PRECOMP is defined */
 #ifndef SKEIN_NO_PRECOMP
-		case  512: memcpy(ctx->X,SKEIN1024_IV_512 ,sizeof(ctx->X)); break;
-		case  384: memcpy(ctx->X,SKEIN1024_IV_384 ,sizeof(ctx->X)); break;
-		case 1024: memcpy(ctx->X,SKEIN1024_IV_1024,sizeof(ctx->X)); break;
+	case 512:
+		memcpy(ctx->X, SKEIN1024_IV_512, sizeof(ctx->X));
+		break;
+	case 384:
+		memcpy(ctx->X, SKEIN1024_IV_384, sizeof(ctx->X));
+		break;
+	case 1024:
+		memcpy(ctx->X, SKEIN1024_IV_1024, sizeof(ctx->X));
+		break;
 #endif
-		default:
-			/* here if there is no precomputed IV value available */
-			/* build/process the config block, type == CONFIG (could be precomputed) */
-			Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
-
-			cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* set the schema, version */
-			cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
-			cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
-			memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
-
-			/* compute the initial chaining values from config block */
-			memset(ctx->X,0,sizeof(ctx->X));            /* zero the chaining variables */
-			Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
-			break;
-		}
+	default:
+		/* here if there is no precomputed IV value available */
+		/* build/process the config block, type == CONFIG (could be precomputed) */
+		Skein_Start_New_Type(ctx, CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */
+
+		cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */
+		cfg.w[1] = Skein_Swap64(hashBitLen);	   /* hash result length in bits */
+		cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+		memset(&cfg.w[3], 0, sizeof(cfg) - 3 * sizeof(cfg.w[0])); /* zero every word after the three set above */
+
+		/* compute the initial chaining values from config block */
+		memset(ctx->X, 0, sizeof(ctx->X)); /* zero the chaining variables */
+		Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+		break;
+	}
 
 	/* The chaining vars ctx->X are now initialized for the given hashBitLen. */
 	/* Set up to process the data message portion of the hash (default) */
-	Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type */
+	Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */
 
 	return SKEIN_SUCCESS;
-	}
+}
 
 #if 0
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -1687,93 +1842,93 @@ static int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* process the input bytes */
-static int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
-	{
+static int Skein1024_Update(Skein1024_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt)
+{
 	size_t n;
 
-	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
 	/* process full blocks, if any */
-	if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES)
+	if(msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES)
+	{
+		if(ctx->h.bCnt) /* finish up any buffered message data */
 		{
-		if (ctx->h.bCnt)                              /* finish up any buffered message data */
+			n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */
+			if(n)
 			{
-			n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
-			if (n)
-				{
-				Skein_assert(n < msgByteCnt);         /* check on our logic here */
-				memcpy(&ctx->b[ctx->h.bCnt],msg,n);
-				msgByteCnt  -= n;
-				msg         += n;
+				Skein_assert(n < msgByteCnt); /* outer test already implies msgByteCnt > n */
+				memcpy(&ctx->b[ctx->h.bCnt], msg, n);
+				msgByteCnt -= n;
+				msg += n;
 				ctx->h.bCnt += n;
-				}
+			}
 			Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES);
-			Skein1024_Process_Block(ctx,ctx->b,1,SKEIN1024_BLOCK_BYTES);
+			Skein1024_Process_Block(ctx, ctx->b, 1, SKEIN1024_BLOCK_BYTES);
 			ctx->h.bCnt = 0;
-			}
+		}
 		/* now process any remaining full blocks, directly from input message data */
-		if (msgByteCnt > SKEIN1024_BLOCK_BYTES)
-			{
-			n = (msgByteCnt-1) / SKEIN1024_BLOCK_BYTES;   /* number of full blocks to process */
-			Skein1024_Process_Block(ctx,msg,n,SKEIN1024_BLOCK_BYTES);
+		if(msgByteCnt > SKEIN1024_BLOCK_BYTES)
+		{
+			n = (msgByteCnt - 1) / SKEIN1024_BLOCK_BYTES; /* full blocks to process now; the -1 leaves the last block (even if full) to be buffered below */
+			Skein1024_Process_Block(ctx, msg, n, SKEIN1024_BLOCK_BYTES);
 			msgByteCnt -= n * SKEIN1024_BLOCK_BYTES;
-			msg        += n * SKEIN1024_BLOCK_BYTES;
-			}
-		Skein_assert(ctx->h.bCnt == 0);
+			msg += n * SKEIN1024_BLOCK_BYTES;
 		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
 
 	/* copy any remaining source message data bytes into b[] */
-	if (msgByteCnt)
-		{
+	if(msgByteCnt)
+	{
 		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES);
-		memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+		memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt);
 		ctx->h.bCnt += msgByteCnt;
-		}
+	}
 
 	return SKEIN_SUCCESS;
-	}
+}
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize the hash computation and output the result */
-static int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
-	{
-	size_t i,n,byteCnt;
+static int Skein1024_Final(Skein1024_Ctxt_t* ctx, u08b_t* hashVal)
+{
+	size_t i, n, byteCnt;
 	u64b_t X[SKEIN1024_STATE_WORDS];
-	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
-	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
-	if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)            /* zero pad b[] if necessary */
-		memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;		/* tag as the final block */
+	if(ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */
+		memset(&ctx->b[ctx->h.bCnt], 0, SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
 
-	Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+	Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); /* process the final block */
 
 	/* now output the result */
-	byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes, rounded up from bits */
 
 	/* run Threefish in "counter mode" to generate output */
-	memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
-	memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
-	for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++)
-		{
-		((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
-		Skein_Start_New_Type(ctx,OUT_FINAL);
-		Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
-		n = byteCnt - i*SKEIN1024_BLOCK_BYTES;   /* number of output bytes left to go */
-		if (n >= SKEIN1024_BLOCK_BYTES)
-			n  = SKEIN1024_BLOCK_BYTES;
-		Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
-		Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES);
-		memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
-		}
-	return SKEIN_SUCCESS;
+	memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+	memcpy(X, ctx->X, sizeof(X));	  /* keep a local copy of counter mode "key" */
+	for(i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++)
+	{
+		((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* counter block = output block index i */
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein1024_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */
+		n = byteCnt - i * SKEIN1024_BLOCK_BYTES;				 /* number of output bytes left to go */
+		if(n >= SKEIN1024_BLOCK_BYTES)
+			n = SKEIN1024_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES, ctx->X, n); /* "output" the ctr mode bytes */
+		Skein_Show_Final(1024, &ctx->h, n, hashVal + i * SKEIN1024_BLOCK_BYTES);
+		memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */
 	}
+	return SKEIN_SUCCESS;
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein1024_API_CodeSize(void)
-	{
-	return ((u08b_t *) Skein1024_API_CodeSize) -
-		   ((u08b_t *) Skein1024_Init);
-	}
+{ /* byte distance from Skein1024_Init to this function; presumably relies on the API functions being laid out in source order */
+	return ((u08b_t*)Skein1024_API_CodeSize) -
+		   ((u08b_t*)Skein1024_Init);
+}
 #endif
 
 /**************** Functions to support MAC/tree hashing ***************/
@@ -1828,7 +1983,6 @@ static int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
 	return SKEIN_SUCCESS;
 	}
 
-
 #if SKEIN_TREE_HASH
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* just do the OUTPUT stage                                       */
@@ -1921,116 +2075,126 @@ static int Skein1024_Output(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
 
 typedef struct
 {
-  uint_t  statebits;                      /* 256, 512, or 1024 */
-  union
-  {
-	Skein_Ctxt_Hdr_t h;                 /* common header "overlay" */
-	Skein_256_Ctxt_t ctx_256;
-	Skein_512_Ctxt_t ctx_512;
-	Skein1024_Ctxt_t ctx1024;
-  } u;
-}
-hashState;
+	uint_t statebits; /* 256, 512, or 1024 */
+	union {
+		Skein_Ctxt_Hdr_t h; /* common header "overlay" */
+		Skein_256_Ctxt_t ctx_256;
+		Skein_512_Ctxt_t ctx_512;
+		Skein1024_Ctxt_t ctx1024;
+	} u;
+} hashState;
 
 /* "incremental" hashing API */
-static SkeinHashReturn Init  (hashState *state, int hashbitlen);
-static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen);
-static SkeinHashReturn Final (hashState *state,       SkeinBitSequence *hashval);
+static SkeinHashReturn Init(hashState* state, int hashbitlen);
+static SkeinHashReturn Update(hashState* state, const SkeinBitSequence* data, SkeinDataLength databitlen);
+static SkeinHashReturn Final(hashState* state, SkeinBitSequence* hashval);
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* select the context size and init the context */
-static SkeinHashReturn Init(hashState *state, int hashbitlen)
+static SkeinHashReturn Init(hashState* state, int hashbitlen)
 {
 #if SKEIN_256_NIST_MAX_HASH_BITS
-  if (hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS)
-  {
-	Skein_Assert(hashbitlen > 0,BAD_HASHLEN);
-	state->statebits = 64*SKEIN_256_STATE_WORDS;
-	return Skein_256_Init(&state->u.ctx_256,(size_t) hashbitlen);
-  }
-#endif
-  if (hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS)
-  {
-	state->statebits = 64*SKEIN_512_STATE_WORDS;
-	return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen);
-  }
-  else
-  {
-	state->statebits = 64*SKEIN1024_STATE_WORDS;
-	return Skein1024_Init(&state->u.ctx1024,(size_t) hashbitlen);
-  }
+	if(hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS)
+	{
+		Skein_Assert(hashbitlen > 0, BAD_HASHLEN);
+		state->statebits = 64 * SKEIN_256_STATE_WORDS;
+		return Skein_256_Init(&state->u.ctx_256, (size_t)hashbitlen);
+	}
+#endif
+	if(hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS)
+	{
+		state->statebits = 64 * SKEIN_512_STATE_WORDS;
+		return Skein_512_Init(&state->u.ctx_512, (size_t)hashbitlen);
+	}
+	else
+	{
+		state->statebits = 64 * SKEIN1024_STATE_WORDS;
+		return Skein1024_Init(&state->u.ctx1024, (size_t)hashbitlen);
+	}
 }
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* process data to be hashed */
-static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen)
+static SkeinHashReturn Update(hashState* state, const SkeinBitSequence* data, SkeinDataLength databitlen)
 {
-  /* only the final Update() call is allowed do partial bytes, else assert an error */
-  Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, SKEIN_FAIL);
+	/* only the final Update() call is allowed to do partial bytes, else assert an error */
+	Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, SKEIN_FAIL);
 
-  Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,SKEIN_FAIL);
-  if ((databitlen & 7) == 0)  /* partial bytes? */
-  {
-	switch ((state->statebits >> 8) & 3)
+	Skein_Assert(state->statebits % 256 == 0 && (state->statebits - 256) < 1024, SKEIN_FAIL);
+	if((databitlen & 7) == 0) /* partial bytes? */
 	{
-	case 2:  return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3);
-	case 1:  return Skein_256_Update(&state->u.ctx_256,data,databitlen >> 3);
-	case 0:  return Skein1024_Update(&state->u.ctx1024,data,databitlen >> 3);
-	default: return SKEIN_FAIL;
+		switch((state->statebits >> 8) & 3)
+		{
+		case 2:
+			return Skein_512_Update(&state->u.ctx_512, data, databitlen >> 3);
+		case 1:
+			return Skein_256_Update(&state->u.ctx_256, data, databitlen >> 3);
+		case 0:
+			return Skein1024_Update(&state->u.ctx1024, data, databitlen >> 3);
+		default:
+			return SKEIN_FAIL;
+		}
 	}
-  }
-  else
-  {   /* handle partial final byte */
-	size_t bCnt = (databitlen >> 3) + 1;                  /* number of bytes to handle (nonzero here!) */
-	u08b_t b,mask;
+	else
+	{										 /* handle partial final byte */
+		size_t bCnt = (databitlen >> 3) + 1; /* number of bytes to handle (nonzero here!) */
+		u08b_t b, mask;
 
-	mask = (u08b_t) (1u << (7 - (databitlen & 7)));       /* partial byte bit mask */
-	b    = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask);   /* apply bit padding on final byte */
+		mask = (u08b_t)(1u << (7 - (databitlen & 7)));		/* partial byte bit mask */
+		b = (u08b_t)((data[bCnt - 1] & (0 - mask)) | mask); /* apply bit padding on final byte */
 
-	switch ((state->statebits >> 8) & 3)
-	{
-	case 2:  Skein_512_Update(&state->u.ctx_512,data,bCnt-1); /* process all but the final byte    */
-	  Skein_512_Update(&state->u.ctx_512,&b  ,  1   ); /* process the (masked) partial byte */
-	  break;
-	case 1:  Skein_256_Update(&state->u.ctx_256,data,bCnt-1); /* process all but the final byte    */
-	  Skein_256_Update(&state->u.ctx_256,&b  ,  1   ); /* process the (masked) partial byte */
-	  break;
-	case 0:  Skein1024_Update(&state->u.ctx1024,data,bCnt-1); /* process all but the final byte    */
-	  Skein1024_Update(&state->u.ctx1024,&b  ,  1   ); /* process the (masked) partial byte */
-	  break;
-	default: return SKEIN_FAIL;
-	}
-	Skein_Set_Bit_Pad_Flag(state->u.h);                    /* set tweak flag for the final call */
+		switch((state->statebits >> 8) & 3)
+		{
+		case 2:
+			Skein_512_Update(&state->u.ctx_512, data, bCnt - 1); /* process all but the final byte    */
+			Skein_512_Update(&state->u.ctx_512, &b, 1);			 /* process the (masked) partial byte */
+			break;
+		case 1:
+			Skein_256_Update(&state->u.ctx_256, data, bCnt - 1); /* process all but the final byte    */
+			Skein_256_Update(&state->u.ctx_256, &b, 1);			 /* process the (masked) partial byte */
+			break;
+		case 0:
+			Skein1024_Update(&state->u.ctx1024, data, bCnt - 1); /* process all but the final byte    */
+			Skein1024_Update(&state->u.ctx1024, &b, 1);			 /* process the (masked) partial byte */
+			break;
+		default:
+			return SKEIN_FAIL;
+		}
+		Skein_Set_Bit_Pad_Flag(state->u.h); /* set tweak flag for the final call */
 
-	return SKEIN_SUCCESS;
-  }
+		return SKEIN_SUCCESS;
+	}
 }
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize hash computation and output the result (hashbitlen bits) */
-static SkeinHashReturn Final(hashState *state, SkeinBitSequence *hashval)
+static SkeinHashReturn Final(hashState* state, SkeinBitSequence* hashval)
 {
-  Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
-  switch ((state->statebits >> 8) & 3)
-  {
-  case 2:  return Skein_512_Final(&state->u.ctx_512,hashval);
-  case 1:  return Skein_256_Final(&state->u.ctx_256,hashval);
-  case 0:  return Skein1024_Final(&state->u.ctx1024,hashval);
-  default: return SKEIN_FAIL;
-  }
+	Skein_Assert(state->statebits % 256 == 0 && (state->statebits - 256) < 1024, FAIL);
+	switch((state->statebits >> 8) & 3)
+	{
+	case 2:
+		return Skein_512_Final(&state->u.ctx_512, hashval);
+	case 1:
+		return Skein_256_Final(&state->u.ctx_256, hashval);
+	case 0:
+		return Skein1024_Final(&state->u.ctx1024, hashval);
+	default:
+		return SKEIN_FAIL;
+	}
 }
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* all-in-one hash function */
-SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence *data, /* all-in-one call */
-				SkeinDataLength databitlen,SkeinBitSequence *hashval)
+SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence* data, /* all-in-one call */
+	SkeinDataLength databitlen, SkeinBitSequence* hashval)
 {
-  hashState  state;
-  SkeinHashReturn r = Init(&state,hashbitlen);
-  if (r == SKEIN_SUCCESS)
-  { /* these calls do not fail when called properly */
-	r = Update(&state,data,databitlen);
-	Final(&state,hashval);
-  }
-  return r;
+	hashState state;
+	SkeinHashReturn r = Init(&state, hashbitlen);
+	if(r == SKEIN_SUCCESS)
+	{ /* these calls do not fail when called properly */
+		r = Update(&state, data, databitlen);
+		Final(&state, hashval);
+	}
+	return r;
 }
diff --git a/xmrstak/backend/cpu/crypto/c_skein.h b/xmrstak/backend/cpu/crypto/c_skein.h
index 1aa11dea3..52f359e82 100644
--- a/xmrstak/backend/cpu/crypto/c_skein.h
+++ b/xmrstak/backend/cpu/crypto/c_skein.h
@@ -1,5 +1,5 @@
 #ifndef _SKEIN_H_
-#define _SKEIN_H_     1
+#define _SKEIN_H_ 1
 /**************************************************************************
 **
 ** Interface declarations and internal definitions for Skein hashing.
@@ -27,21 +27,20 @@
 **                                1: return SKEIN_FAIL to flag errors
 **
 ***************************************************************************/
-#include "skein_port.h"                      /* get platform-specific definitions */
+#include "skein_port.h" /* get platform-specific definitions */
 
 typedef enum
 {
-  SKEIN_SUCCESS         =      0,          /* return codes from Skein calls */
-  SKEIN_FAIL            =      1,
-  SKEIN_BAD_HASHLEN     =      2
-}
-SkeinHashReturn;
+	SKEIN_SUCCESS = 0, /* return codes from Skein calls */
+	SKEIN_FAIL = 1,
+	SKEIN_BAD_HASHLEN = 2
+} SkeinHashReturn;
 
-typedef uint32_t SkeinDataLength;                /* bit count  type */
-typedef u08b_t   SkeinBitSequence;               /* bit stream type */
+typedef uint32_t SkeinDataLength; /* bit count  type */
+typedef u08b_t SkeinBitSequence;  /* bit stream type */
 
 /* "all-in-one" call */
-SkeinHashReturn skein_hash(int hashbitlen,   const SkeinBitSequence *data,
-                           SkeinDataLength databitlen, SkeinBitSequence *hashval);
+SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence* data,
+	SkeinDataLength databitlen, SkeinBitSequence* hashval);
 
-#endif  /* ifndef _SKEIN_H_ */
+#endif /* ifndef _SKEIN_H_ */
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu.hpp b/xmrstak/backend/cpu/crypto/cn_gpu.hpp
index 5844d3814..2d333d118 100644
--- a/xmrstak/backend/cpu/crypto/cn_gpu.hpp
+++ b/xmrstak/backend/cpu/crypto/cn_gpu.hpp
@@ -4,8 +4,8 @@
 #include <stdint.h>
 
 #if defined(_WIN32) || defined(_WIN64)
-#include <malloc.h>
 #include <intrin.h>
+#include <malloc.h>
 #define HAS_WIN_INTRIN_API
 #endif
 
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
index 79b38373a..efded74c8 100644
--- a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
+++ b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
@@ -1,12 +1,12 @@
-#include "cn_gpu.hpp"
 #include "../../cryptonight.hpp"
+#include "cn_gpu.hpp"
 
-#pragma GCC target ("avx2")
+#pragma GCC target("avx2")
 #ifndef _mm256_bslli_epi128
-	#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
+#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
 #endif
 #ifndef _mm256_bsrli_epi128
-	#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
+#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
 #endif
 
 inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01)
@@ -67,7 +67,7 @@ inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2,
 // 112×4 = 448
 template <bool add>
 inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3,
-							  float lcnt, float hcnt, const __m256& rnd_c, __m256& sum)
+	float lcnt, float hcnt, const __m256& rnd_c, __m256& sum)
 {
 	__m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1);
 	__m256 r = _mm256_setzero_ps();
@@ -92,7 +92,7 @@ inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256&
 
 template <size_t rot>
 inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3,
-								float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out)
+	float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out)
 {
 	__m256i r = double_comupte<rot % 2 != 0>(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum);
 	if(rot != 0)
@@ -101,9 +101,7 @@ inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256
 	out = _mm256_xor_si256(out, r);
 }
 
-
-inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m256i*>(lpad + (idx & mask) + n*16); }
-
+inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m256i*>(lpad + (idx & mask) + n * 16); }
 
 void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo)
 {
@@ -155,7 +153,7 @@ void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& al
 		sum1 = _mm256_add_ps(suma, sumb);
 
 		out2 = _mm256_xor_si256(out2, out);
-		out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2,out2,0x41), out2);
+		out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2, out2, 0x41), out2);
 		suma = _mm256_permute2f128_ps(sum0, sum1, 0x30);
 		sumb = _mm256_permute2f128_ps(sum0, sum1, 0x21);
 		sum0 = _mm256_add_ps(suma, sumb);
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
index c8627d8b8..d65d9651e 100644
--- a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
+++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
@@ -1,7 +1,7 @@
-#include "cn_gpu.hpp"
 #include "../../cryptonight.hpp"
+#include "cn_gpu.hpp"
 
-#pragma GCC target ("sse2")
+#pragma GCC target("sse2")
 
 inline void prep_dv(__m128i* idx, __m128i& v, __m128& n)
 {
@@ -21,13 +21,13 @@ inline void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c,
 {
 	n1 = _mm_add_ps(n1, c);
 	__m128 nn = _mm_mul_ps(n0, c);
-	nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));
+	nn = _mm_mul_ps(n1, _mm_mul_ps(nn, nn));
 	nn = fma_break(nn);
 	n = _mm_add_ps(n, nn);
 
 	n3 = _mm_sub_ps(n3, c);
 	__m128 dd = _mm_mul_ps(n2, c);
-	dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));
+	dd = _mm_mul_ps(n3, _mm_mul_ps(dd, dd));
 	dd = fma_break(dd);
 	d = _mm_add_ps(d, dd);
 
@@ -57,12 +57,12 @@ inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd
 	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
 	d = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFF7FFFFF)), d);
 	d = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), d);
-	r =_mm_add_ps(r, _mm_div_ps(n,d));
+	r = _mm_add_ps(r, _mm_div_ps(n, d));
 }
 
 // 112×4 = 448
-template<bool add>
-inline __m128i single_comupte(__m128 n0, __m128 n1,  __m128 n2,  __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
+template <bool add>
+inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
 {
 	__m128 c = _mm_set1_ps(cnt);
 	__m128 r = _mm_setzero_ps();
@@ -85,8 +85,8 @@ inline __m128i single_comupte(__m128 n0, __m128 n1,  __m128 n2,  __m128 n3, floa
 	return _mm_cvttps_epi32(r);
 }
 
-template<size_t rot>
-inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2,  __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
+template <size_t rot>
+inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
 {
 	__m128i r = single_comupte<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);
 	if(rot != 0)
@@ -94,7 +94,7 @@ inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2,  __m128 n3, flo
 	out = _mm_xor_si128(out, r);
 }
 
-inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m128i*>(lpad + (idx & mask) + n*16); }
+inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m128i*>(lpad + (idx & mask) + n * 16); }
 
 void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo)
 {
diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h
index 488805ec0..2a91269f8 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight.h
@@ -1,6 +1,6 @@
 #pragma once
-#include <stddef.h>
 #include <inttypes.h>
+#include <stddef.h>
 
 #include "variant4_random_math.h"
 
@@ -12,8 +12,8 @@
 
 struct cryptonight_ctx;
 
-typedef void  (*cn_mainloop_fun)(cryptonight_ctx *ctx);
-typedef void  (*cn_double_mainloop_fun)(cryptonight_ctx*, cryptonight_ctx*);
+typedef void (*cn_mainloop_fun)(cryptonight_ctx* ctx);
+typedef void (*cn_double_mainloop_fun)(cryptonight_ctx*, cryptonight_ctx*);
 typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&);
 
 void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size);
@@ -36,8 +36,7 @@ struct cryptonight_ctx
 	int asm_version = 0;
 	xmrstak_algo last_algo = invalid_algo;
 
-	union
-	{
+	union {
 		extra_ctx_r cn_r_ctx;
 	};
 
@@ -51,5 +50,3 @@ struct alloc_msg
 size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
 cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
 void cryptonight_free_ctx(cryptonight_ctx* ctx);
-
-
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index d7316b25e..6c9e3390c 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -15,22 +15,24 @@
   */
 #pragma once
 
-#include "cryptonight.h"
-#include "xmrstak/backend/cryptonight.hpp"
 #include "../../miner_work.hpp"
 #include "cn_gpu.hpp"
+#include "cryptonight.h"
+#include "xmrstak/backend/cryptonight.hpp"
+#include <cfenv>
 #include <memory.h>
 #include <stdio.h>
-#include <cfenv>
 #include <utility>
 
 #ifdef _WIN64
-#	include <winsock2.h>
-#	include <windows.h>
-#	include <ntsecapi.h>
-#	include <tchar.h>
+#include <winsock2.h>
+// this comment disables clang-format include reordering
+#include <ntsecapi.h>
+#include <tchar.h>
+// this comment disables clang-format include reordering for windows.h
+#include <windows.h>
 #else
-#	include <sys/mman.h>
+#include <sys/mman.h>
 #endif
 
 #ifdef __GNUC__
@@ -54,9 +56,9 @@ static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi)
 
 extern "C"
 {
-	void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen);
+	void keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen);
 	void keccakf(uint64_t st[25], int rounds);
-	extern void(*const extra_hashes[4])(const void *, uint32_t, char *);
+	extern void (*const extra_hashes[4])(const void*, uint32_t, char*);
 }
 
 // This will shift and xor tmp1 into itself as 4 32-bit vals such as
@@ -73,7 +75,7 @@ static inline __m128i sl_xor(__m128i tmp1)
 	return tmp1;
 }
 
-template<uint8_t rcon>
+template <uint8_t rcon>
 static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2)
 {
 	__m128i xout1 = _mm_aeskeygenassist_si128(*xout2, rcon);
@@ -98,14 +100,14 @@ static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t r
 	*xout2 = _mm_xor_si128(*xout2, xout1);
 }
 
-template<bool SOFT_AES>
+template <bool SOFT_AES>
 static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3,
 	__m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
 {
 	__m128i xout0, xout2;
 
 	xout0 = _mm_load_si128(memory);
-	xout2 = _mm_load_si128(memory+1);
+	xout2 = _mm_load_si128(memory + 1);
 	*k0 = xout0;
 	*k1 = xout2;
 
@@ -175,7 +177,7 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3
 	x7 = _mm_xor_si128(x7, tmp0);
 }
 
-template<bool SOFT_AES, bool PREFETCH, xmrstak_algo_id ALGO>
+template <bool SOFT_AES, bool PREFETCH, xmrstak_algo_id ALGO>
 void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo)
 {
 	constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast;
@@ -197,7 +199,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 
 	if(HEAVY_MIX)
 	{
-		for(size_t i=0; i < 16; i++)
+		for(size_t i = 0; i < 16; i++)
 		{
 			if(SOFT_AES)
 			{
@@ -230,7 +232,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 	}
 
 	const size_t MEM = algo.Mem();
-	for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+	for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 	{
 		if(SOFT_AES)
 		{
@@ -277,29 +279,29 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 	}
 }
 
-template<bool PREFETCH, xmrstak_algo_id ALGO>
+template <bool PREFETCH, xmrstak_algo_id ALGO>
 void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output, const xmrstak_algo& algo)
 {
 	constexpr size_t hash_size = 200; // 25x8 bytes
 	alignas(128) uint64_t hash[25];
 	const size_t mem = algo.Mem();
 
-	for (uint64_t i = 0; i < mem / 512; i++)
+	for(uint64_t i = 0; i < mem / 512; i++)
 	{
 		memcpy(hash, input, hash_size);
 		hash[0] ^= i;
 
 		keccakf(hash, 24);
 		memcpy(output, hash, 160);
-		output+=160;
+		output += 160;
 
 		keccakf(hash, 24);
 		memcpy(output, hash, 176);
-		output+=176;
+		output += 176;
 
 		keccakf(hash, 24);
 		memcpy(output, hash, 176);
-		output+=176;
+		output += 176;
 
 		if(PREFETCH)
 		{
@@ -311,11 +313,11 @@ void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output, const xmrs
 	}
 }
 
-template<bool SOFT_AES, bool PREFETCH, xmrstak_algo_id ALGO>
+template <bool SOFT_AES, bool PREFETCH, xmrstak_algo_id ALGO>
 void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo)
 {
 	constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
-		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu;
+							   ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu;
 
 	// This is more than we have registers, compiler will assign 2 keys on the stack
 	__m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
@@ -333,7 +335,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 	xout7 = _mm_load_si128(output + 11);
 
 	const size_t MEM = algo.Mem();
-	for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+	for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 	{
 		if(PREFETCH)
 			_mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA);
@@ -384,7 +386,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 
 	if(HEAVY_MIX)
 	{
-		for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+		for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 		{
 			if(PREFETCH)
 				_mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA);
@@ -433,7 +435,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 				mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 		}
 
-		for(size_t i=0; i < 16; i++)
+		for(size_t i = 0; i < 16; i++)
 		{
 			if(SOFT_AES)
 			{
@@ -494,7 +496,8 @@ inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0)
 #else
 	// GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence
 	// Fallback to simpler code
-	if (x2 < n0) ++r;
+	if(x2 < n0)
+		++r;
 #endif
 	return r;
 }
@@ -505,7 +508,7 @@ inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key)
 	alignas(16) uint32_t x[4];
 	_mm_store_si128((__m128i*)k, key);
 	_mm_store_si128((__m128i*)x, _mm_xor_si128(val, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))); // x = ~val
-	#define BYTE(p, i) ((unsigned char*)&p)[i]
+#define BYTE(p, i) ((unsigned char*)&p)[i]
 	k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)];
 	x[0] ^= k[0];
 	k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)];
@@ -513,11 +516,11 @@ inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key)
 	k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)];
 	x[2] ^= k[2];
 	k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)];
-	#undef BYTE
+#undef BYTE
 	return _mm_load_si128((__m128i*)k);
 }
 
-template<xmrstak_algo_id ALGO>
+template <xmrstak_algo_id ALGO>
 inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 {
 	mem_out[0] = _mm_cvtsi128_si64(tmp);
@@ -541,7 +544,6 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 
 		mem_out[1] = vh;
 	}
-
 }
 
 /** optimal type for sqrt
@@ -550,18 +552,18 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
  *
  * @tparam N number of hashes per thread
  */
-template<size_t N>
+template <size_t N>
 struct GetOptimalSqrtType
 {
 	using type = __m128i;
 };
 
-template<>
+template <>
 struct GetOptimalSqrtType<1u>
 {
 	using type = uint64_t;
 };
-template<size_t N>
+template <size_t N>
 using GetOptimalSqrtType_t = typename GetOptimalSqrtType<N>::type;
 
 /** assign a value and convert if necessary
@@ -625,273 +627,275 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
 	cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc));
 }
 
-#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) \
-	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx)                              \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */              \
 	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \
-	{ \
-		const uint64_t idx1 = idx0 & MASK; \
-		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
-		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
-		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
-		if (ALGO == cryptonight_r) \
-			cx = _mm_xor_si128(_mm_xor_si128(cx, chunk3), _mm_xor_si128(chunk1, chunk2)); \
-	} \
-	if(ALGO == cryptonight_v8_reversewaltz) \
-	{ \
-		const uint64_t idx1 = idx0 & MASK; \
-		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
-		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
-		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	{ /* 0x10 <- old 0x30 + bx1, 0x20 <- old 0x10 + bx0, 0x30 <- old 0x20 + ax0 */          \
+		const uint64_t idx1 = idx0 & MASK;                                                  \
+		const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]);                  \
+		const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]);                  \
+		const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]);                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1));            \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0));            \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0));            \
+		if(ALGO == cryptonight_r)                                                           \
+			cx = _mm_xor_si128(_mm_xor_si128(cx, chunk3), _mm_xor_si128(chunk1, chunk2));   \
+	}                                                                                       \
+	if(ALGO == cryptonight_v8_reversewaltz)                                                 \
+	{ /* 0x10 <- old 0x10 + bx1, 0x20 <- old 0x30 + bx0, 0x30 <- old 0x20 + ax0 */          \
+		const uint64_t idx1 = idx0 & MASK;                                                  \
+		const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]);                  \
+		const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]);                  \
+		const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]);                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1));            \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0));            \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0));            \
 	}
 
-#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \
-	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r_wow) \
-	{ \
-		const uint64_t idx1 = idx0 & MASK; \
-		const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
-		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
-		hi ^= ((uint64_t*)&chunk2)[0]; \
-		lo ^= ((uint64_t*)&chunk2)[1]; \
-		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
-	} \
-	if(ALGO == cryptonight_v8_reversewaltz) \
-	{ \
-		const uint64_t idx1 = idx0 & MASK; \
-		const __m128i chunk3 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
-		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
-		hi ^= ((uint64_t*)&chunk2)[0]; \
-		lo ^= ((uint64_t*)&chunk2)[1]; \
-		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi)                                                \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */                                    \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r_wow)                                                \
+	{ /* 0x10 chunk is XORed with (lo, hi) before the shuffle; hi and lo are then XORed with the 0x20 chunk */    \
+		const uint64_t idx1 = idx0 & MASK;                                                                        \
+		const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
+		const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]);                                        \
+		hi ^= ((uint64_t*)&chunk2)[0];                                                                            \
+		lo ^= ((uint64_t*)&chunk2)[1];                                                                            \
+		const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]);                                        \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1));                                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0));                                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0));                                  \
+	}                                                                                                             \
+	if(ALGO == cryptonight_v8_reversewaltz)                                                                       \
+	{ /* same XOR step; chunk1 and chunk3 are loaded from swapped offsets */                                      \
+		const uint64_t idx1 = idx0 & MASK;                                                                        \
+		const __m128i chunk3 = _mm_xor_si128(_mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
+		const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]);                                        \
+		hi ^= ((uint64_t*)&chunk2)[0];                                                                            \
+		lo ^= ((uint64_t*)&chunk2)[1];                                                                            \
+		const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]);                                        \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1));                                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0));                                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0));                                  \
 	}
 
-#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \
-	{ \
-		uint64_t sqrt_result_tmp; \
-		assign(sqrt_result_tmp, sqrt_result); \
-		/* Use division and square root results from the _previous_ iteration to hide the latency */ \
-		const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \
-		cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \
-		const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \
-		/* Most and least significant bits in the divisor are set to 1 \
-		 * to make sure we don't divide by a small or even number, \
-		 * so there are no shortcuts for such cases \
-		 * \
-		 * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \
-		 * We drop the highest bit to fit both quotient and remainder in 32 bits \
-		 */  \
-		/* Compiler will optimize it to a single div instruction */ \
-		const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
-		const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32); \
-		division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \
+#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl)                                            \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)                                     \
+	{ /* updates cl, division_result_xmm and sqrt_result in place */                                             \
+		uint64_t sqrt_result_tmp;                                                                                \
+		assign(sqrt_result_tmp, sqrt_result);                                                                    \
+		/* Use division and square root results from the _previous_ iteration to hide the latency */             \
+		const uint64_t cx_64 = _mm_cvtsi128_si64(cx);                                                            \
+		cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32);           \
+		const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL;                                      \
+		/* Most and least significant bits in the divisor are set to 1                                           \
+		 * to make sure we don't divide by a small or even number,                                               \
+		 * so there are no shortcuts for such cases                                                              \
+		 *                                                                                                       \
+		 * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4                             \
+		 * We drop the highest bit to fit both quotient and remainder in 32 bits                                 \
+		 */                                                                                                      \
+		/* Compiler will optimize it to a single div instruction */                                              \
+		const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8));                                          \
+		const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32);                   \
+		division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result));                          \
 		/* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \
-		assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \
+		assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result));                             \
 	}
 
-#define CN_R_RANDOM_MATH(n, al, ah, cl, bx0, bx1, cn_r_data) \
-	if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \
-	{ \
+#define CN_R_RANDOM_MATH(n, al, ah, cl, bx0, bx1, cn_r_data)                                   \
+	if(ALGO == cryptonight_r || ALGO == cryptonight_r_wow)                                     \
+	{ /* fold cn_r_data[0..3] into cl, refresh [4..8], then run v4_random_math */              \
 		cl ^= (cn_r_data[0] + cn_r_data[1]) | ((uint64_t)(cn_r_data[2] + cn_r_data[3]) << 32); \
-		cn_r_data[4] = static_cast<uint32_t>(al); \
-		cn_r_data[5] = static_cast<uint32_t>(ah); \
-		cn_r_data[6] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx0)); \
-		cn_r_data[7] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx1)); \
-		cn_r_data[8] = static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \
-		v4_random_math(ctx[n]->cn_r_ctx.code, cn_r_data); \
-	} \
-	if (ALGO == cryptonight_r) \
-	{ \
-		al ^= cn_r_data[2] | ((uint64_t)(cn_r_data[3]) << 32); \
-		ah ^= cn_r_data[0] | ((uint64_t)(cn_r_data[1]) << 32); \
+		cn_r_data[4] = static_cast<uint32_t>(al);                                              \
+		cn_r_data[5] = static_cast<uint32_t>(ah);                                              \
+		cn_r_data[6] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx0));                          \
+		cn_r_data[7] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx1));                          \
+		cn_r_data[8] = static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8)));       \
+		v4_random_math(ctx[n]->cn_r_ctx.code, cn_r_data);                                      \
+	}                                                                                          \
+	if(ALGO == cryptonight_r)                                                                  \
+	{ /* cryptonight_r only: XOR cn_r_data[0..3] back into al / ah */                          \
+		al ^= cn_r_data[2] | ((uint64_t)(cn_r_data[3]) << 32);                                 \
+		ah ^= cn_r_data[0] | ((uint64_t)(cn_r_data[1]) << 32);                                 \
 	}
 
-#define CN_INIT_SINGLE \
+#define CN_INIT_SINGLE                                                                                                                                                                                 \
 	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \
-	{ \
-		memset(output, 0, 32 * N); \
-		return; \
+	{ /* CN_INIT reads 8 input bytes starting at offset 35 for these variants, so inputs shorter than 43 bytes get an all-zero result */                                                               \
+		memset(output, 0, 32 * N);                                                                                                                                                                     \
+		return;                                                                                                                                                                                        \
 	}
 
-#define CN_INIT(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data) \
-	keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \
-	uint64_t monero_const; \
+#define CN_INIT(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data)                                                                   \
+	keccak((const uint8_t*)input + len * n, len, ctx[n]->hash_state, 200);                                                                                                               \
+	uint64_t monero_const;                                                                                                                                                               \
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
-	{ \
-		monero_const =  *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + len * n + 35); \
-		monero_const ^=  *(reinterpret_cast<const uint64_t*>(ctx[n]->hash_state) + 24); \
-	} \
-	/* Optim - 99% time boundary */ \
-	cn_explode_scratchpad<SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state, algo); \
-	\
-	__m128i ax0; \
-	uint64_t idx0; \
-	__m128i bx0; \
-	uint8_t* l0 = ctx[n]->long_state; \
-	/* BEGIN cryptonight_monero_v8 variables */ \
-	__m128i bx1; \
-	__m128i division_result_xmm; \
-	__m128 conc_var; \
-	if(ALGO == cryptonight_conceal) \
-	{\
-		set_float_rounding_mode_nearest(); \
-		conc_var = _mm_setzero_ps(); \
-	}\
-	GetOptimalSqrtType_t<N> sqrt_result; \
-	uint32_t cn_r_data[9]; \
-	/* END cryptonight_monero_v8 variables */ \
-	{ \
-		uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \
-		idx0 = h0[0] ^ h0[4]; \
-		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \
-		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \
-		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \
-		{ \
-			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
-			division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
-			assign(sqrt_result, h0[13]); \
-			set_float_rounding_mode(); \
-		} \
-		if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \
-		{ \
-			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
-			cn_r_data[0] = (uint32_t)(h0[12]); \
-			cn_r_data[1] = (uint32_t)(h0[12] >> 32); \
-			cn_r_data[2] = (uint32_t)(h0[13]); \
-			cn_r_data[3] = (uint32_t)(h0[13] >> 32); \
-		} \
-	} \
-	__m128i *ptr0
+	{ /* tweak constant: 8 bytes at offset 35 of the n-th input, XORed with 64-bit word 24 of hash_state */                                                                              \
+		monero_const = *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + len * n + 35);                                                                       \
+		monero_const ^= *(reinterpret_cast<const uint64_t*>(ctx[n]->hash_state) + 24);                                                                                                   \
+	}                                                                                                                                                                                    \
+	/* Optim - 99% time boundary */                                                                                                                                                      \
+	cn_explode_scratchpad<SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state, algo);                                                                   \
+                                                                                                                                                                                         \
+	__m128i ax0;                                                                                                                                                                         \
+	uint64_t idx0;                                                                                                                                                                       \
+	__m128i bx0;                                                                                                                                                                         \
+	uint8_t* l0 = ctx[n]->long_state;                                                                                                                                                    \
+	/* BEGIN cryptonight_monero_v8 variables */                                                                                                                                          \
+	__m128i bx1;                                                                                                                                                                         \
+	__m128i division_result_xmm;                                                                                                                                                         \
+	__m128 conc_var;                                                                                                                                                                     \
+	if(ALGO == cryptonight_conceal)                                                                                                                                                      \
+	{                                                                                                                                                                                    \
+		set_float_rounding_mode_nearest();                                                                                                                                               \
+		conc_var = _mm_setzero_ps();                                                                                                                                                     \
+	}                                                                                                                                                                                    \
+	GetOptimalSqrtType_t<N> sqrt_result;                                                                                                                                                 \
+	uint32_t cn_r_data[9];                                                                                                                                                               \
+	/* END cryptonight_monero_v8 variables */                                                                                                                                            \
+	{ /* seed idx0, ax0 and bx0 (plus bx1 and the v8 / cn_r state where used) from hash_state */                                                                                         \
+		uint64_t* h0 = (uint64_t*)ctx[n]->hash_state;                                                                                                                                    \
+		idx0 = h0[0] ^ h0[4];                                                                                                                                                            \
+		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0);                                                                                                                                       \
+		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);                                                                                                                              \
+		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)                                                                                                         \
+		{                                                                                                                                                                                \
+			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);                                                                                                                        \
+			division_result_xmm = _mm_cvtsi64_si128(h0[12]);                                                                                                                             \
+			assign(sqrt_result, h0[13]);                                                                                                                                                 \
+			set_float_rounding_mode();                                                                                                                                                   \
+		}                                                                                                                                                                                \
+		if(ALGO == cryptonight_r || ALGO == cryptonight_r_wow)                                                                                                                           \
+		{                                                                                                                                                                                \
+			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);                                                                                                                        \
+			cn_r_data[0] = (uint32_t)(h0[12]);                                                                                                                                           \
+			cn_r_data[1] = (uint32_t)(h0[12] >> 32);                                                                                                                                     \
+			cn_r_data[2] = (uint32_t)(h0[13]);                                                                                                                                           \
+			cn_r_data[3] = (uint32_t)(h0[13] >> 32);                                                                                                                                     \
+		}                                                                                                                                                                                \
+	}                                                                                                                                                                                    \
+	__m128i* ptr0
 
 #define CN_STEP1(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1) \
-	__m128i cx; \
-	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
-	cx = _mm_load_si128(ptr0); \
-	if (ALGO == cryptonight_conceal) \
-		cryptonight_conceal_tweak(cx, conc_var); \
-	if (ALGO == cryptonight_bittube2) \
-	{ \
-		cx = aes_round_bittube2(cx, ax0); \
-	} \
-	else \
-	{ \
-		if(SOFT_AES) \
-			cx = soft_aesenc(cx, ax0); \
-		else \
-			cx = _mm_aesenc_si128(cx, ax0); \
-	} \
+	__m128i cx; /* 16 bytes loaded from l0[idx0 & MASK] */                     \
+	ptr0 = (__m128i*)&l0[idx0 & MASK];                                         \
+	cx = _mm_load_si128(ptr0);                                                 \
+	if(ALGO == cryptonight_conceal)                                            \
+		cryptonight_conceal_tweak(cx, conc_var);                               \
+	if(ALGO == cryptonight_bittube2)                                           \
+	{                                                                          \
+		cx = aes_round_bittube2(cx, ax0);                                      \
+	}                                                                          \
+	else                                                                       \
+	{ /* AES round keyed by ax0: software or intrinsic */                      \
+		if(SOFT_AES)                                                           \
+			cx = soft_aesenc(cx, ax0);                                         \
+		else                                                                   \
+			cx = _mm_aesenc_si128(cx, ax0);                                    \
+	}                                                                          \
 	CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx)
 
-#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
+#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx)                                                                                                                          \
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
-		cryptonight_monero_tweak<ALGO>((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \
-	else \
-		_mm_store_si128((__m128i *)ptr0, _mm_xor_si128(bx0, cx)); \
-	idx0 = _mm_cvtsi128_si64(cx); \
-	\
-	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
-	if(PREFETCH) \
-		_mm_prefetch((const char*)ptr0, _MM_HINT_T0); \
-	if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_r && ALGO != cryptonight_r_wow && ALGO != cryptonight_v8_reversewaltz) \
-		bx0 = cx
+		cryptonight_monero_tweak<ALGO>((uint64_t*)ptr0, _mm_xor_si128(bx0, cx));                                                                                                         \
+	else                                                                                                                                                                                 \
+		_mm_store_si128((__m128i*)ptr0, _mm_xor_si128(bx0, cx));                                                                                                                         \
+	idx0 = _mm_cvtsi128_si64(cx); /* next index: low 64 bits of cx */                                                                                                                    \
+                                                                                                                                                                                         \
+	ptr0 = (__m128i*)&l0[idx0 & MASK];                                                                                                                                                   \
+	if(PREFETCH)                                                                                                                                                                         \
+		_mm_prefetch((const char*)ptr0, _MM_HINT_T0);                                                                                                                                    \
+	if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_r && ALGO != cryptonight_r_wow && ALGO != cryptonight_v8_reversewaltz) /* other variants update bx0 in CN_STEP3 */           \
+		bx0 = cx
 
 #define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data) \
-	uint64_t lo, cl, ch; \
-	uint64_t al0 = _mm_cvtsi128_si64(ax0); \
-	uint64_t ah0 = ((uint64_t*)&ax0)[1]; \
-	cl = ((uint64_t*)ptr0)[0]; \
-	ch = ((uint64_t*)ptr0)[1]; \
-	CN_R_RANDOM_MATH(n, al0, ah0, cl, bx0, bx1, cn_r_data); \
-	CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \
-	{ \
-		uint64_t hi; \
-		lo = _umul128(idx0, cl, &hi); \
-		if(ALGO == cryptonight_r) \
-		{ \
-			CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx); \
-		} \
-		else \
-		{ \
-			CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \
-		} \
-		ah0 += lo; \
-		al0 += hi; \
-	} \
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow || ALGO == cryptonight_v8_reversewaltz) \
-	{ \
-		bx1 = bx0; \
-		bx0 = cx; \
-	} \
-	((uint64_t*)ptr0)[0] = al0; \
-	if(PREFETCH) \
-		_mm_prefetch((const char*)ptr0, _MM_HINT_T0)
-
-#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \
-	if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
-	{ \
-		if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \
-			((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \
-		else \
-			((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \
-	} \
-	else \
-		((uint64_t*)ptr0)[1] = ah0; \
-	al0 ^= cl; \
-	ah0 ^= ch; \
-	ax0 = _mm_set_epi64x(ah0, al0); \
+	uint64_t lo, cl, ch;                                                                                                                \
+	uint64_t al0 = _mm_cvtsi128_si64(ax0);                                                                                              \
+	uint64_t ah0 = ((uint64_t*)&ax0)[1];                                                                                                \
+	cl = ((uint64_t*)ptr0)[0];                                                                                                          \
+	ch = ((uint64_t*)ptr0)[1];                                                                                                          \
+	CN_R_RANDOM_MATH(n, al0, ah0, cl, bx0, bx1, cn_r_data);                                                                             \
+	CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl);                                                                      \
+	{                                                                                                                                   \
+		uint64_t hi;                                                                                                                    \
+		lo = _umul128(idx0, cl, &hi);                                                                                                   \
+		if(ALGO == cryptonight_r)                                                                                                       \
+		{                                                                                                                               \
+			CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx);                                                                     \
+		}                                                                                                                               \
+		else                                                                                                                            \
+		{                                                                                                                               \
+			CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi);                                                                 \
+		}                                                                                                                               \
+		ah0 += lo;                                                                                                                      \
+		al0 += hi;                                                                                                                      \
+	}                                                                                                                                   \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow || ALGO == cryptonight_v8_reversewaltz)      \
+	{                                                                                                                                   \
+		bx1 = bx0;                                                                                                                      \
+		bx0 = cx;                                                                                                                       \
+	}                                                                                                                                   \
+	((uint64_t*)ptr0)[0] = al0;                                                                                                         \
+	if(PREFETCH)                                                                                                                        \
+	_mm_prefetch((const char*)ptr0, _MM_HINT_T0)
+
+#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0)                                                                                                        \
+	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+	{                                                                                                                                                                                    \
+		if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)                                                                                                                     \
+			((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0];                                                                                                            \
+		else                                                                                                                                                                             \
+			((uint64_t*)ptr0)[1] = ah0 ^ monero_const;                                                                                                                                   \
+	}                                                                                                                                                                                    \
+	else                                                                                                                                                                                 \
+		((uint64_t*)ptr0)[1] = ah0;                                                                                                                                                      \
+	al0 ^= cl;                                                                                                                                                                           \
+	ah0 ^= ch;                                                                                                                                                                           \
+	ax0 = _mm_set_epi64x(ah0, al0);                                                                                                                                                      \
 	idx0 = al0;
 
-#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \
-	{ \
-		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
-		int64_t u  = ((int64_t*)ptr0)[0]; \
-		int32_t d  = ((int32_t*)ptr0)[2]; \
-		int64_t q = u / (d | 0x5); \
-		\
-		((int64_t*)ptr0)[0] = u ^ q; \
-		idx0 = d ^ q; \
-	} \
+#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0)             \
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)       \
+	{                                                                   \
+		ptr0 = (__m128i*)&l0[idx0 & MASK];                              \
+		int64_t u = ((int64_t*)ptr0)[0];                                \
+		int32_t d = ((int32_t*)ptr0)[2];                                \
+		int64_t q = u / (d | 0x5);                                      \
+                                                                        \
+		((int64_t*)ptr0)[0] = u ^ q;                                    \
+		idx0 = d ^ q;                                                   \
+	}                                                                   \
 	else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast) \
-	{ \
-		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
-		int64_t u  = ((int64_t*)ptr0)[0]; \
-		int32_t d  = ((int32_t*)ptr0)[2]; \
-		int64_t q = u / (d | 0x5); \
-		\
-		((int64_t*)ptr0)[0] = u ^ q; \
-		idx0 = (~d) ^ q; \
+	{                                                                   \
+		ptr0 = (__m128i*)&l0[idx0 & MASK];                              \
+		int64_t u = ((int64_t*)ptr0)[0];                                \
+		int32_t d = ((int32_t*)ptr0)[2];                                \
+		int64_t q = u / (d | 0x5);                                      \
+                                                                        \
+		((int64_t*)ptr0)[0] = u ^ q;                                    \
+		idx0 = (~d) ^ q;                                                \
 	}
 
-#define CN_FINALIZE(n) \
-	/* Optim - 90% time boundary */ \
+#define CN_FINALIZE(n)                                                                                                 \
+	/* Optim - 90% time boundary */                                                                                    \
 	cn_implode_scratchpad<SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state, algo); \
-	/* Optim - 99% time boundary */ \
-	keccakf((uint64_t*)ctx[n]->hash_state, 24); \
+	/* Optim - 99% time boundary */                                                                                    \
+	keccakf((uint64_t*)ctx[n]->hash_state, 24);                                                                        \
 	extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n)
 
 //! defer the evaluation of an macro
 #ifndef _MSC_VER
-#	define CN_DEFER(...) __VA_ARGS__
+#define CN_DEFER(...) __VA_ARGS__
 #else
-#	define CN_EMPTY(...)
-#	define CN_DEFER(...) __VA_ARGS__ CN_EMPTY()
+#define CN_EMPTY(...)
+#define CN_DEFER(...) __VA_ARGS__ CN_EMPTY()
 #endif
 
 //! execute the macro f with the passed arguments
-#define CN_EXEC(f,...) CN_DEFER(f)(__VA_ARGS__)
+#define CN_EXEC(f, ...) \
+	CN_DEFER(f)         \
+	(__VA_ARGS__)
 
 /** add append n to all arguments and keeps n as first argument
  *
@@ -904,22 +908,22 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
  * @endcode
  */
 #define CN_ENUM_0(n, ...) n
-#define CN_ENUM_1(n, x1) n, x1 ## n
-#define CN_ENUM_2(n, x1, x2) n, x1 ## n, x2 ## n
-#define CN_ENUM_3(n, x1, x2, x3) n, x1 ## n, x2 ## n, x3 ## n
-#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n
-#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n
-#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n
-#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n
-#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n
-#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n
-#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n
-#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n
-#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n
-#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n
-#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n
-#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n
-#define CN_ENUM_16(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n, x16 ## n
+#define CN_ENUM_1(n, x1) n, x1##n
+#define CN_ENUM_2(n, x1, x2) n, x1##n, x2##n
+#define CN_ENUM_3(n, x1, x2, x3) n, x1##n, x2##n, x3##n
+#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1##n, x2##n, x3##n, x4##n
+#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1##n, x2##n, x3##n, x4##n, x5##n
+#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n
+#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n
+#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n
+#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n
+#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n
+#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n
+#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n
+#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n
+#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n
+#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n, x15##n
+#define CN_ENUM_16(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n, x15##n, x16##n
 
 /** repeat a macro call multiple times
  *
@@ -933,21 +937,35 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
  * f(0, foo0, bar); f(1, foo1, bar1)
  * @endcode
  */
-#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__))
-#define REPEAT_2(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__))
-#define REPEAT_3(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__))
-#define REPEAT_4(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__))
-#define REPEAT_5(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(4, __VA_ARGS__))
-
-template< size_t N>
+#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__))
+#define REPEAT_2(n, f, ...)                  \
+	CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__))
+#define REPEAT_3(n, f, ...)                  \
+	CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__))
+#define REPEAT_4(n, f, ...)                  \
+	CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(3, __VA_ARGS__))
+#define REPEAT_5(n, f, ...)                  \
+	CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(3, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(4, __VA_ARGS__))
+
+template <size_t N>
 struct Cryptonight_hash;
 
-template< >
+template <>
 struct Cryptonight_hash<1>
 {
 	static constexpr size_t N = 1;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const uint32_t MASK = algo.Mask();
@@ -971,12 +989,12 @@ struct Cryptonight_hash<1>
 	}
 };
 
-template< >
+template <>
 struct Cryptonight_hash<2>
 {
 	static constexpr size_t N = 2;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const uint32_t MASK = algo.Mask();
@@ -1000,12 +1018,12 @@ struct Cryptonight_hash<2>
 	}
 };
 
-template< >
+template <>
 struct Cryptonight_hash<3>
 {
 	static constexpr size_t N = 3;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const uint32_t MASK = algo.Mask();
@@ -1029,12 +1047,12 @@ struct Cryptonight_hash<3>
 	}
 };
 
-template< >
+template <>
 struct Cryptonight_hash<4>
 {
 	static constexpr size_t N = 4;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const uint32_t MASK = algo.Mask();
@@ -1058,12 +1076,12 @@ struct Cryptonight_hash<4>
 	}
 };
 
-template< >
+template <>
 struct Cryptonight_hash<5>
 {
 	static constexpr size_t N = 5;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const uint32_t MASK = algo.Mask();
@@ -1087,26 +1105,25 @@ struct Cryptonight_hash<5>
 	}
 };
 
-extern "C" void  cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
-extern "C" void  cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
 extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
 
-
-template< size_t N, size_t asm_version>
+template <size_t N, size_t asm_version>
 struct Cryptonight_hash_asm
 {
-	template<xmrstak_algo_id ALGO>
+	template <xmrstak_algo_id ALGO>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		for(size_t i = 0; i < N; ++i)
 		{
-			keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+			keccak((const uint8_t*)input + len * i, len, ctx[i]->hash_state, 200);
 			cn_explode_scratchpad<false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo);
 		}
 		if(ALGO == cryptonight_r)
 		{
 			// API ATTRIBUTE is only required for cryptonight_r
-			typedef void ABI_ATTRIBUTE (*cn_r_mainloop_fun)(cryptonight_ctx *ctx);
+			typedef void ABI_ATTRIBUTE (*cn_r_mainloop_fun)(cryptonight_ctx * ctx);
 			for(size_t i = 0; i < N; ++i)
 				reinterpret_cast<cn_r_mainloop_fun>(ctx[0]->loop_fn)(ctx[i]); // use always loop_fn from ctx[0]!!
 		}
@@ -1126,19 +1143,19 @@ struct Cryptonight_hash_asm
 };
 
 // double hash with specialized asm only for intel
-template< >
+template <>
 struct Cryptonight_hash_asm<2, 0>
 {
 	static constexpr size_t N = 2;
 
-	template<xmrstak_algo_id ALGO>
+	template <xmrstak_algo_id ALGO>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const size_t MEM = algo.Mem();
 
 		for(size_t i = 0; i < N; ++i)
 		{
-			keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+			keccak((const uint8_t*)input + len * i, len, ctx[i]->hash_state, 200);
 			/* Optim - 99% time boundary */
 			cn_explode_scratchpad<false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo);
 		}
@@ -1167,89 +1184,90 @@ struct Cryptonight_hash_asm<2, 0>
 namespace
 {
 
-template<typename T, typename U>
+template <typename T, typename U>
 static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask)
 {
-    const uint8_t* p = reinterpret_cast<const uint8_t*>(src);
-
-    // Workaround for Visual Studio placing trampoline in debug builds.
-#   if defined(_MSC_VER)
-    if (p[0] == 0xE9) {
-        p += *(int32_t*)(p + 1) + 5;
-    }
-#   endif
-
-    size_t size = 0;
-    while (*(uint32_t*)(p + size) != 0xDEADC0DE) {
-        ++size;
-    }
-    size += sizeof(uint32_t);
-
-    memcpy((void*) dst, (const void*) src, size);
-
-    uint8_t* patched_data = reinterpret_cast<uint8_t*>(dst);
-    for (size_t i = 0; i + sizeof(uint32_t) <= size; ++i) {
-        switch (*(uint32_t*)(patched_data + i)) {
-        case CN_ITER:
-            *(uint32_t*)(patched_data + i) = iterations;
-            break;
-
-        case CN_MASK:
-            *(uint32_t*)(patched_data + i) = mask;
-            break;
-        }
-    }
-}
+	const uint8_t* p = reinterpret_cast<const uint8_t*>(src);
+
+	// Visual Studio debug builds may hand us a 5-byte "jmp rel32" (0xE9) trampoline: follow it to the real code.
+#if defined(_MSC_VER)
+	if(p[0] == 0xE9)
+	{
+		p += *(int32_t*)(p + 1) + 5;
+	}
+#endif
+	// Measure the code byte by byte, up to and including the 0xDEADC0DE end marker.
+	size_t size = 0;
+	while(*(uint32_t*)(p + size) != 0xDEADC0DE)
+	{
+		++size;
+	}
+	size += sizeof(uint32_t);
+	// Copy from p (the resolved code start), not src: src may still point at the trampoline stub.
+	memcpy((void*)dst, (const void*)p, size);
+	// Overwrite every 32-bit CN_ITER / CN_MASK placeholder in the copy with the requested values.
+	uint8_t* patched_data = reinterpret_cast<uint8_t*>(dst);
+	for(size_t i = 0; i + sizeof(uint32_t) <= size; ++i)
+	{
+		switch(*(uint32_t*)(patched_data + i))
+		{
+		case CN_ITER:
+			*(uint32_t*)(patched_data + i) = iterations;
+			break;
 
+		case CN_MASK:
+			*(uint32_t*)(patched_data + i) = mask;
+			break;
+		}
+	}
+}
 
 void* allocateExecutableMemory(size_t size)
 {
 
 #ifdef _WIN64
-return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+	return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+#else
+#if defined(__APPLE__)
+	return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0);
 #else
-#   if defined(__APPLE__)
-    return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0);
-#   else
-    return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-#   endif
+	return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+#endif
 #endif
 }
 
-
-void protectExecutableMemory(void *p, size_t size)
+void protectExecutableMemory(void* p, size_t size)
 {
 #ifdef _WIN64
-    DWORD oldProtect;
-    VirtualProtect(p, size, PAGE_EXECUTE_READ, &oldProtect);
+	DWORD oldProtect;
+	VirtualProtect(p, size, PAGE_EXECUTE_READ, &oldProtect);
 #else
-    mprotect(p, size, PROT_READ | PROT_EXEC);
+	mprotect(p, size, PROT_READ | PROT_EXEC);
 #endif
 }
 
-void unprotectExecutableMemory(void *p, size_t size)
+void unprotectExecutableMemory(void* p, size_t size)
 {
 #ifdef _WIN64
-    DWORD oldProtect;
-    VirtualProtect(p, size, PAGE_EXECUTE_READWRITE, &oldProtect);
+	DWORD oldProtect;
+	VirtualProtect(p, size, PAGE_EXECUTE_READWRITE, &oldProtect);
 #else
-    mprotect(p, size, PROT_WRITE | PROT_EXEC);
+	mprotect(p, size, PROT_WRITE | PROT_EXEC);
 #endif
 }
 
-
-void flushInstructionCache(void *p, size_t size)
+void flushInstructionCache(void* p, size_t size)
 {
 #ifdef _WIN64
-    ::FlushInstructionCache(GetCurrentProcess(), p, size);
+	::FlushInstructionCache(GetCurrentProcess(), p, size);
 #else
-#   ifndef __FreeBSD__
-    __builtin___clear_cache(reinterpret_cast<char*>(p), reinterpret_cast<char*>(p) + size);
-#   endif
+#ifndef __FreeBSD__
+	__builtin___clear_cache(reinterpret_cast<char*>(p), reinterpret_cast<char*>(p) + size);
+#endif
 #endif
 }
 
-template<size_t N>
+template <size_t N>
 void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 {
 	const uint32_t Iter = algo.Iter();
@@ -1270,7 +1288,8 @@ void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmr
 		if(N == 2)
 			src_code = reinterpret_cast<cn_mainloop_fun>(cryptonight_v8_double_mainloop_sandybridge_asm);
 		else
-			src_code = cryptonight_v8_mainloop_ivybridge_asm;;
+			src_code = cryptonight_v8_mainloop_ivybridge_asm;
+		;
 	}
 	// supports only 1 thread per hash
 	if(selected_asm == "amd_avx")
@@ -1295,19 +1314,17 @@ void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmr
 		flushInstructionCache(ctx[0]->fun_data, allocation_size);
 	}
 }
-} // namespace (anonymous)
-
-
+} // namespace
 
 struct Cryptonight_hash_gpu
 {
 	static constexpr size_t N = 1;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		set_float_rounding_mode_nearest();
-		keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+		keccak((const uint8_t*)input, len, ctx[0]->hash_state, 200);
 		cn_explode_scratchpad_gpu<PREFETCH, ALGO>(ctx[0]->hash_state, ctx[0]->long_state, algo);
 
 		if(cngpu_check_avx2())
@@ -1321,16 +1338,15 @@ struct Cryptonight_hash_gpu
 	}
 };
 
-template<size_t N>
+template <size_t N>
 struct Cryptonight_R_generator
 {
-	template<xmrstak_algo_id ALGO>
+	template <xmrstak_algo_id ALGO>
 	static void cn_on_new_job(const xmrstak::miner_work& work, cryptonight_ctx** ctx)
 	{
 		if(ctx[0]->cn_r_ctx.height == work.iBlockHeight &&
 			ctx[0]->last_algo == POW(cryptonight_r) &&
-			reinterpret_cast<void*>(ctx[0]->hash_fn) == ctx[0]->fun_data
-		)
+			reinterpret_cast<void*>(ctx[0]->hash_fn) == ctx[0]->fun_data)
 			return;
 
 		ctx[0]->last_algo = POW(cryptonight_r);
@@ -1346,7 +1362,7 @@ struct Cryptonight_R_generator
 				ctx[0]->hash_fn = Cryptonight_hash_asm<N, 1u>::template hash<cryptonight_r>;
 		}
 
-		for(size_t i=1; i < N; i++)
+		for(size_t i = 1; i < N; i++)
 		{
 			ctx[i]->cn_r_ctx = ctx[0]->cn_r_ctx;
 			ctx[i]->loop_fn = ctx[0]->loop_fn;
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
index a9d1c96fd..e35c7c7b8 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
+++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
@@ -23,19 +23,19 @@
 
 extern "C"
 {
-#include "c_groestl.h"
 #include "c_blake256.h"
+#include "c_groestl.h"
 #include "c_jh.h"
 #include "c_skein.h"
 }
-#include "xmrstak/backend/cryptonight.hpp"
 #include "cryptonight.h"
 #include "cryptonight_aesni.h"
-#include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/console.hpp"
+#include <algorithm>
 #include <stdio.h>
 #include <stdlib.h>
-#include <algorithm>
 
 #ifdef __GNUC__
 #include <mm_malloc.h>
@@ -49,30 +49,35 @@ extern "C"
 
 #ifdef _WIN32
 #include <windows.h>
+// keep this comment: it prevents clang-format from reordering these includes (windows.h must precede ntsecapi.h)
 #include <ntsecapi.h>
 #else
-#include <sys/mman.h>
 #include <errno.h>
 #include <string.h>
+#include <sys/mman.h>
 #endif // _WIN32
 
-void do_blake_hash(const void* input, uint32_t len, char* output) {
+void do_blake_hash(const void* input, uint32_t len, char* output)
+{
 	blake256_hash((uint8_t*)output, (const uint8_t*)input, len);
 }
 
-void do_groestl_hash(const void* input, uint32_t len, char* output) {
+void do_groestl_hash(const void* input, uint32_t len, char* output)
+{
 	groestl((const uint8_t*)input, len * 8, (uint8_t*)output);
 }
 
-void do_jh_hash(const void* input, uint32_t len, char* output) {
+void do_jh_hash(const void* input, uint32_t len, char* output)
+{
 	jh_hash(32 * 8, (const uint8_t*)input, 8 * len, (uint8_t*)output);
 }
 
-void do_skein_hash(const void* input, uint32_t len, char* output) {
+void do_skein_hash(const void* input, uint32_t len, char* output)
+{
 	skein_hash(8 * 32, (const uint8_t*)input, 8 * len, (uint8_t*)output);
 }
 
-void (* const extra_hashes[4])(const void *, uint32_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
+void (*const extra_hashes[4])(const void*, uint32_t, char*) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
 
 #ifdef _WIN32
 #include "xmrstak/misc/uac.hpp"
@@ -81,21 +86,21 @@ BOOL bRebootDesirable = FALSE; //If VirtualAlloc fails, suggest a reboot
 
 BOOL AddPrivilege(TCHAR* pszPrivilege)
 {
-	HANDLE           hToken;
+	HANDLE hToken;
 	TOKEN_PRIVILEGES tp;
-	BOOL             status;
+	BOOL status;
 
-	if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken))
+	if(!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken))
 		return FALSE;
 
-	if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid))
+	if(!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid))
 		return FALSE;
 
 	tp.PrivilegeCount = 1;
 	tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
 	status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
 
-	if (!status || (GetLastError() != ERROR_SUCCESS))
+	if(!status || (GetLastError() != ERROR_SUCCESS))
 		return FALSE;
 
 	CloseHandle(hToken);
@@ -107,19 +112,19 @@ BOOL AddLargePageRights()
 	HANDLE hToken;
 	PTOKEN_USER user = NULL;
 
-	if (OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken) == TRUE)
+	if(OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken) == TRUE)
 	{
 		TOKEN_ELEVATION Elevation;
 		DWORD cbSize = sizeof(TOKEN_ELEVATION);
 		BOOL bIsElevated = FALSE;
 
-		if (GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize))
+		if(GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize))
 			bIsElevated = Elevation.TokenIsElevated;
 
 		DWORD size = 0;
 		GetTokenInformation(hToken, TokenUser, NULL, 0, &size);
 
-		if (size > 0 && bIsElevated)
+		if(size > 0 && bIsElevated)
 		{
 			user = (PTOKEN_USER)LocalAlloc(LPTR, size);
 			GetTokenInformation(hToken, TokenUser, user, size, &size);
@@ -128,7 +133,7 @@ BOOL AddLargePageRights()
 		CloseHandle(hToken);
 	}
 
-	if (!user)
+	if(!user)
 		return FALSE;
 
 	LSA_HANDLE handle;
@@ -136,7 +141,7 @@ BOOL AddLargePageRights()
 	ZeroMemory(&attributes, sizeof(attributes));
 
 	BOOL result = FALSE;
-	if (LsaOpenPolicy(NULL, &attributes, POLICY_ALL_ACCESS, &handle) == 0)
+	if(LsaOpenPolicy(NULL, &attributes, POLICY_ALL_ACCESS, &handle) == 0)
 	{
 		LSA_UNICODE_STRING lockmem;
 		lockmem.Buffer = L"SeLockMemoryPrivilege";
@@ -146,11 +151,11 @@ BOOL AddLargePageRights()
 		PLSA_UNICODE_STRING rights = NULL;
 		ULONG cnt = 0;
 		BOOL bHasRights = FALSE;
-		if (LsaEnumerateAccountRights(handle, user->User.Sid, &rights, &cnt) == 0)
+		if(LsaEnumerateAccountRights(handle, user->User.Sid, &rights, &cnt) == 0)
 		{
-			for (size_t i = 0; i < cnt; i++)
+			for(size_t i = 0; i < cnt; i++)
 			{
-				if (rights[i].Length == lockmem.Length &&
+				if(rights[i].Length == lockmem.Length &&
 					memcmp(rights[i].Buffer, lockmem.Buffer, 42) == 0)
 				{
 					bHasRights = TRUE;
@@ -220,7 +225,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 		ptr->ctx_info[0] = 0;
 		ptr->ctx_info[1] = 0;
 		if(ptr->long_state == NULL)
-			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte",std::to_string(hashMemSize).c_str());
+			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte", std::to_string(hashMemSize).c_str());
 		return ptr;
 	}
 
@@ -250,7 +255,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 #else
 //http://man7.org/linux/man-pages/man2/mmap.2.html
 #if defined(__APPLE__)
-	ptr->long_state  = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
+	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
 #elif defined(__FreeBSD__)
 	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
@@ -261,7 +266,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 #else
 	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0);
-	if (ptr->long_state == MAP_FAILED)
+	if(ptr->long_state == MAP_FAILED)
 	{
 		// try without MAP_HUGETLB for crappy kernels
 		msg->warning = "mmap with HUGETLB failed, attempting without it (you should fix your kernel)";
@@ -270,7 +275,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 	}
 #endif
 
-	if (ptr->long_state == MAP_FAILED)
+	if(ptr->long_state == MAP_FAILED)
 	{
 		_mm_free(ptr);
 		msg->warning = "mmap failed, check attribute 'use_slow_memory' in 'config.txt'";
@@ -279,7 +284,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 
 	ptr->ctx_info[0] = 1;
 
-	if(madvise(ptr->long_state, hashMemSize, MADV_RANDOM|MADV_WILLNEED) != 0)
+	if(madvise(ptr->long_state, hashMemSize, MADV_RANDOM | MADV_WILLNEED) != 0)
 		msg->warning = "madvise failed";
 
 	ptr->ctx_info[1] = 0;
diff --git a/xmrstak/backend/cpu/crypto/groestl_tables.h b/xmrstak/backend/cpu/crypto/groestl_tables.h
index a23295c35..85dd25f3d 100644
--- a/xmrstak/backend/cpu/crypto/groestl_tables.h
+++ b/xmrstak/backend/cpu/crypto/groestl_tables.h
@@ -1,38 +1,6 @@
 #ifndef __tables_h
 #define __tables_h
 
-
-const uint32_t T[512] = {0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc
-, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5
-, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d
-, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded
-, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1
-, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441
-, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4
-, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba
-, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616
-, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2
-, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c
-, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de
-, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7
-, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e
-, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c
-, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7
-, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b
-, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4
-, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e
-, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a
-, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37
-, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86
-, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b
-, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028
-, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3
-, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94
-, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836
-, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0
-, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2
-, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e
-, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3
-, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e};
+const uint32_t T[512] = {0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 
0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 
0x486c90b4, 0xe4378fb8, 0xb8e46b37, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 
0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e};
 
 #endif /* __tables_h */
diff --git a/xmrstak/backend/cpu/crypto/hash.h b/xmrstak/backend/cpu/crypto/hash.h
index 2af330932..574581376 100644
--- a/xmrstak/backend/cpu/crypto/hash.h
+++ b/xmrstak/backend/cpu/crypto/hash.h
@@ -4,4 +4,9 @@
 
 typedef unsigned char BitSequence;
 typedef uint32_t DataLength;
-typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2} HashReturn;
+typedef enum
+{
+	SUCCESS = 0,
+	FAIL = 1,
+	BAD_HASHLEN = 2
+} HashReturn;
diff --git a/xmrstak/backend/cpu/crypto/int-util.h b/xmrstak/backend/cpu/crypto/int-util.h
index 8748976c1..393b4f3d2 100644
--- a/xmrstak/backend/cpu/crypto/int-util.h
+++ b/xmrstak/backend/cpu/crypto/int-util.h
@@ -12,43 +12,51 @@
 #if defined(_MSC_VER)
 #include <stdlib.h>
 
-static inline uint32_t rol32(uint32_t x, int r) {
+static inline uint32_t rol32(uint32_t x, int r)
+{
 	static_assert(sizeof(uint32_t) == sizeof(unsigned int), "this code assumes 32-bit integers");
 	return _rotl(x, r);
 }
 
-static inline uint64_t rol64(uint64_t x, int r) {
+static inline uint64_t rol64(uint64_t x, int r)
+{
 	return _rotl64(x, r);
 }
 
 #else
 
-static inline uint32_t rol32(uint32_t x, int r) {
+static inline uint32_t rol32(uint32_t x, int r)
+{
 	return (x << (r & 31)) | (x >> (-r & 31));
 }
 
-static inline uint64_t rol64(uint64_t x, int r) {
+static inline uint64_t rol64(uint64_t x, int r)
+{
 	return (x << (r & 63)) | (x >> (-r & 63));
 }
 
 #endif
 
-static inline uint64_t hi_dword(uint64_t val) {
+static inline uint64_t hi_dword(uint64_t val)
+{
 	return val >> 32;
 }
 
-static inline uint64_t lo_dword(uint64_t val) {
+static inline uint64_t lo_dword(uint64_t val)
+{
 	return val & 0xFFFFFFFF;
 }
 
-static inline uint64_t div_with_reminder(uint64_t dividend, uint32_t divisor, uint32_t* remainder) {
+static inline uint64_t div_with_reminder(uint64_t dividend, uint32_t divisor, uint32_t* remainder)
+{
 	dividend |= ((uint64_t)*remainder) << 32;
 	*remainder = dividend % divisor;
 	return dividend / divisor;
 }
 
 // Long division with 2^32 base
-static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uint32_t divisor, uint64_t* quotient_hi, uint64_t* quotient_lo) {
+static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uint32_t divisor, uint64_t* quotient_hi, uint64_t* quotient_lo)
+{
 	uint64_t dividend_dwords[4];
 	uint32_t remainder = 0;
 
@@ -65,30 +73,35 @@ static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uin
 	return remainder;
 }
 
-#define IDENT32(x) ((uint32_t) (x))
-#define IDENT64(x) ((uint64_t) (x))
+#define IDENT32(x) ((uint32_t)(x))
+#define IDENT64(x) ((uint64_t)(x))
 
-#define SWAP32(x) ((((uint32_t) (x) & 0x000000ff) << 24) | \
-  (((uint32_t) (x) & 0x0000ff00) <<  8) | \
-  (((uint32_t) (x) & 0x00ff0000) >>  8) | \
-  (((uint32_t) (x) & 0xff000000) >> 24))
-#define SWAP64(x) ((((uint64_t) (x) & 0x00000000000000ff) << 56) | \
-  (((uint64_t) (x) & 0x000000000000ff00) << 40) | \
-  (((uint64_t) (x) & 0x0000000000ff0000) << 24) | \
-  (((uint64_t) (x) & 0x00000000ff000000) <<  8) | \
-  (((uint64_t) (x) & 0x000000ff00000000) >>  8) | \
-  (((uint64_t) (x) & 0x0000ff0000000000) >> 24) | \
-  (((uint64_t) (x) & 0x00ff000000000000) >> 40) | \
-  (((uint64_t) (x) & 0xff00000000000000) >> 56))
+#define SWAP32(x) ((((uint32_t)(x)&0x000000ff) << 24) | \
+				   (((uint32_t)(x)&0x0000ff00) << 8) |  \
+				   (((uint32_t)(x)&0x00ff0000) >> 8) |  \
+				   (((uint32_t)(x)&0xff000000) >> 24))
+#define SWAP64(x) ((((uint64_t)(x)&0x00000000000000ff) << 56) | \
+				   (((uint64_t)(x)&0x000000000000ff00) << 40) | \
+				   (((uint64_t)(x)&0x0000000000ff0000) << 24) | \
+				   (((uint64_t)(x)&0x00000000ff000000) << 8) |  \
+				   (((uint64_t)(x)&0x000000ff00000000) >> 8) |  \
+				   (((uint64_t)(x)&0x0000ff0000000000) >> 24) | \
+				   (((uint64_t)(x)&0x00ff000000000000) >> 40) | \
+				   (((uint64_t)(x)&0xff00000000000000) >> 56))
 
-static inline uint32_t ident32(uint32_t x) { return x; }
+static inline uint32_t ident32(uint32_t x)
+{
+	return x;
+}
 static inline uint64_t ident64(uint64_t x) { return x; }
 
-static inline uint32_t swap32(uint32_t x) {
+static inline uint32_t swap32(uint32_t x)
+{
 	x = ((x & 0x00ff00ff) << 8) | ((x & 0xff00ff00) >> 8);
 	return (x << 16) | (x >> 16);
 }
-static inline uint64_t swap64(uint64_t x) {
+static inline uint64_t swap64(uint64_t x)
+{
 	x = ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8);
 	x = ((x & 0x0000ffff0000ffff) << 16) | ((x & 0xffff0000ffff0000) >> 16);
 	return (x << 32) | (x >> 32);
@@ -99,39 +112,51 @@ static inline uint64_t swap64(uint64_t x) {
 #else
 #define UNUSED
 #endif
-static inline void mem_inplace_ident(void *mem UNUSED, size_t n UNUSED) { }
+static inline void mem_inplace_ident(void* mem UNUSED, size_t n UNUSED)
+{
+}
 #undef UNUSED
 
-static inline void mem_inplace_swap32(void *mem, size_t n) {
+static inline void mem_inplace_swap32(void* mem, size_t n)
+{
 	size_t i;
-	for (i = 0; i < n; i++) {
-		((uint32_t *)mem)[i] = swap32(((const uint32_t *)mem)[i]);
+	for(i = 0; i < n; i++)
+	{
+		((uint32_t*)mem)[i] = swap32(((const uint32_t*)mem)[i]);
 	}
 }
-static inline void mem_inplace_swap64(void *mem, size_t n) {
+static inline void mem_inplace_swap64(void* mem, size_t n)
+{
 	size_t i;
-	for (i = 0; i < n; i++) {
-		((uint64_t *)mem)[i] = swap64(((const uint64_t *)mem)[i]);
+	for(i = 0; i < n; i++)
+	{
+		((uint64_t*)mem)[i] = swap64(((const uint64_t*)mem)[i]);
 	}
 }
 
-static inline void memcpy_ident32(void *dst, const void *src, size_t n) {
+static inline void memcpy_ident32(void* dst, const void* src, size_t n)
+{
 	memcpy(dst, src, 4 * n);
 }
-static inline void memcpy_ident64(void *dst, const void *src, size_t n) {
+static inline void memcpy_ident64(void* dst, const void* src, size_t n)
+{
 	memcpy(dst, src, 8 * n);
 }
 
-static inline void memcpy_swap32(void *dst, const void *src, size_t n) {
+static inline void memcpy_swap32(void* dst, const void* src, size_t n)
+{
 	size_t i;
-	for (i = 0; i < n; i++) {
-		((uint32_t *)dst)[i] = swap32(((const uint32_t *)src)[i]);
+	for(i = 0; i < n; i++)
+	{
+		((uint32_t*)dst)[i] = swap32(((const uint32_t*)src)[i]);
 	}
 }
-static inline void memcpy_swap64(void *dst, const void *src, size_t n) {
+static inline void memcpy_swap64(void* dst, const void* src, size_t n)
+{
 	size_t i;
-	for (i = 0; i < n; i++) {
-		((uint64_t *)dst)[i] = swap64(((const uint64_t *)src)[i]);
+	for(i = 0; i < n; i++)
+	{
+		((uint64_t*)dst)[i] = swap64(((const uint64_t*)src)[i]);
 	}
 }
 
diff --git a/xmrstak/backend/cpu/crypto/skein_port.h b/xmrstak/backend/cpu/crypto/skein_port.h
index 99641bcdf..1648cdc7d 100644
--- a/xmrstak/backend/cpu/crypto/skein_port.h
+++ b/xmrstak/backend/cpu/crypto/skein_port.h
@@ -2,38 +2,38 @@
 #define _SKEIN_PORT_H_
 
 #include <limits.h>
-#include <stdint.h>
 #include <stddef.h>
+#include <stdint.h>
 
 #ifndef RETURN_VALUES
-#  define RETURN_VALUES
-#  if defined( DLL_EXPORT )
-#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
-#      define VOID_RETURN    __declspec( dllexport ) void __stdcall
-#      define INT_RETURN     __declspec( dllexport ) int  __stdcall
-#    elif defined( __GNUC__ )
-#      define VOID_RETURN    __declspec( __dllexport__ ) void
-#      define INT_RETURN     __declspec( __dllexport__ ) int
-#    else
-#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
-#    endif
-#  elif defined( DLL_IMPORT )
-#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
-#      define VOID_RETURN    __declspec( dllimport ) void __stdcall
-#      define INT_RETURN     __declspec( dllimport ) int  __stdcall
-#    elif defined( __GNUC__ )
-#      define VOID_RETURN    __declspec( __dllimport__ ) void
-#      define INT_RETURN     __declspec( __dllimport__ ) int
-#    else
-#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
-#    endif
-#  elif defined( __WATCOMC__ )
-#    define VOID_RETURN  void __cdecl
-#    define INT_RETURN   int  __cdecl
-#  else
-#    define VOID_RETURN  void
-#    define INT_RETURN   int
-#  endif
+#define RETURN_VALUES
+#if defined(DLL_EXPORT)
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#define VOID_RETURN __declspec(dllexport) void __stdcall
+#define INT_RETURN __declspec(dllexport) int __stdcall
+#elif defined(__GNUC__)
+#define VOID_RETURN __declspec(__dllexport__) void
+#define INT_RETURN __declspec(__dllexport__) int
+#else
+#error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#endif
+#elif defined(DLL_IMPORT)
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#define VOID_RETURN __declspec(dllimport) void __stdcall
+#define INT_RETURN __declspec(dllimport) int __stdcall
+#elif defined(__GNUC__)
+#define VOID_RETURN __declspec(__dllimport__) void
+#define INT_RETURN __declspec(__dllimport__) int
+#else
+#error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#endif
+#elif defined(__WATCOMC__)
+#define VOID_RETURN void __cdecl
+#define INT_RETURN int __cdecl
+#else
+#define VOID_RETURN void
+#define INT_RETURN int
+#endif
 #endif
 
 /*  These defines are used to declare buffers in a way that allows
@@ -52,17 +52,17 @@
 								variable of length 'size' bits
 */
 
-#define ui_type(size)               uint##size##_t
-#define dec_unit_type(size,x)       typedef ui_type(size) x
-#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[bsize / (size >> 3)]
-#define ptr_cast(x,size)            ((ui_type(size)*)(x))
+#define ui_type(size) uint##size##_t
+#define dec_unit_type(size, x) typedef ui_type(size) x
+#define dec_bufr_type(size, bsize, x) typedef ui_type(size) x[bsize / (size >> 3)]
+#define ptr_cast(x, size) ((ui_type(size)*)(x))
 
-typedef unsigned int    uint_t;             /* native unsigned integer */
-typedef uint8_t         u08b_t;             /*  8-bit unsigned integer */
-typedef uint64_t        u64b_t;             /* 64-bit unsigned integer */
+typedef unsigned int uint_t; /* native unsigned integer */
+typedef uint8_t u08b_t;		 /*  8-bit unsigned integer */
+typedef uint64_t u64b_t;	 /* 64-bit unsigned integer */
 
 #ifndef RotL_64
-#define RotL_64(x,N)    (((x) << (N)) | ((x) >> (64-(N))))
+#define RotL_64(x, N) (((x) << (N)) | ((x) >> (64 - (N))))
 #endif
 
 /*
@@ -91,26 +91,25 @@ typedef uint64_t        u64b_t;             /* 64-bit unsigned integer */
 /* special handler for IA64, which may be either endianness (?)  */
 /* here we assume little-endian, but this may need to be changed */
 #if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
-#  define PLATFORM_MUST_ALIGN (1)
+#define PLATFORM_MUST_ALIGN (1)
 #ifndef PLATFORM_BYTE_ORDER
-#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
 #endif
 #endif
 
-#ifndef   PLATFORM_MUST_ALIGN
-#  define PLATFORM_MUST_ALIGN (0)
+#ifndef PLATFORM_MUST_ALIGN
+#define PLATFORM_MUST_ALIGN (0)
 #endif
 
-
-#if   PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
-	/* here for big-endian CPUs */
-#define SKEIN_NEED_SWAP   (1)
+#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+/* here for big-endian CPUs */
+#define SKEIN_NEED_SWAP (1)
 #elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
-	/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
-#define SKEIN_NEED_SWAP   (0)
-#if   PLATFORM_MUST_ALIGN == 0              /* ok to use "fast" versions? */
-#define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt)
-#define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt))
+/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
+#define SKEIN_NEED_SWAP (0)
+#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */
+#define Skein_Put64_LSB_First(dst08, src64, bCnt) memcpy(dst08, src64, bCnt)
+#define Skein_Get64_LSB_First(dst64, src08, wCnt) memcpy(dst64, src08, 8 * (wCnt))
 #endif
 #else
 #error "Skein needs endianness setting!"
@@ -123,57 +122,55 @@ typedef uint64_t        u64b_t;             /* 64-bit unsigned integer */
  *      Provide any definitions still needed.
  ******************************************************************
  */
-#ifndef Skein_Swap64  /* swap for big-endian, nop for little-endian */
-#if     SKEIN_NEED_SWAP
-#define Skein_Swap64(w64)                       \
-  ( (( ((u64b_t)(w64))       & 0xFF) << 56) |   \
-	(((((u64b_t)(w64)) >> 8) & 0xFF) << 48) |   \
-	(((((u64b_t)(w64)) >>16) & 0xFF) << 40) |   \
-	(((((u64b_t)(w64)) >>24) & 0xFF) << 32) |   \
-	(((((u64b_t)(w64)) >>32) & 0xFF) << 24) |   \
-	(((((u64b_t)(w64)) >>40) & 0xFF) << 16) |   \
-	(((((u64b_t)(w64)) >>48) & 0xFF) <<  8) |   \
-	(((((u64b_t)(w64)) >>56) & 0xFF)      ) )
+#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */
+#if SKEIN_NEED_SWAP
+#define Skein_Swap64(w64)                          \
+	(((((u64b_t)(w64)) & 0xFF) << 56) |            \
+		(((((u64b_t)(w64)) >> 8) & 0xFF) << 48) |  \
+		(((((u64b_t)(w64)) >> 16) & 0xFF) << 40) | \
+		(((((u64b_t)(w64)) >> 24) & 0xFF) << 32) | \
+		(((((u64b_t)(w64)) >> 32) & 0xFF) << 24) | \
+		(((((u64b_t)(w64)) >> 40) & 0xFF) << 16) | \
+		(((((u64b_t)(w64)) >> 48) & 0xFF) << 8) |  \
+		(((((u64b_t)(w64)) >> 56) & 0xFF)))
 #else
-#define Skein_Swap64(w64)  (w64)
+#define Skein_Swap64(w64) (w64)
 #endif
-#endif  /* ifndef Skein_Swap64 */
-
+#endif /* ifndef Skein_Swap64 */
 
 #ifndef Skein_Put64_LSB_First
-void    Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt)
-#ifdef  SKEIN_PORT_CODE /* instantiate the function code here? */
-	{ /* this version is fully portable (big-endian or little-endian), but slow */
+void Skein_Put64_LSB_First(u08b_t* dst, const u64b_t* src, size_t bCnt)
+#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */
+{					   /* this version is fully portable (big-endian or little-endian), but slow */
 	size_t n;
 
-	for (n=0;n<bCnt;n++)
-		dst[n] = (u08b_t) (src[n>>3] >> (8*(n&7)));
-	}
+	for(n = 0; n < bCnt; n++)
+		dst[n] = (u08b_t)(src[n >> 3] >> (8 * (n & 7)));
+}
 #else
-	;    /* output only the function prototype */
+	; /* output only the function prototype */
 #endif
-#endif   /* ifndef Skein_Put64_LSB_First */
-
+#endif /* ifndef Skein_Put64_LSB_First */
 
 #ifndef Skein_Get64_LSB_First
-void    Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt)
-#ifdef  SKEIN_PORT_CODE /* instantiate the function code here? */
-	{ /* this version is fully portable (big-endian or little-endian), but slow */
+void Skein_Get64_LSB_First(u64b_t* dst, const u08b_t* src, size_t wCnt)
+#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */
+{					   /* this version is fully portable (big-endian or little-endian), but slow */
 	size_t n;
 
-	for (n=0;n<8*wCnt;n+=8)
-		dst[n/8] = (((u64b_t) src[n  ])      ) +
-				   (((u64b_t) src[n+1]) <<  8) +
-				   (((u64b_t) src[n+2]) << 16) +
-				   (((u64b_t) src[n+3]) << 24) +
-				   (((u64b_t) src[n+4]) << 32) +
-				   (((u64b_t) src[n+5]) << 40) +
-				   (((u64b_t) src[n+6]) << 48) +
-				   (((u64b_t) src[n+7]) << 56) ;
-	}
+	for(n = 0; n < 8 * wCnt; n += 8)
+		dst[n / 8] = (((u64b_t)src[n])) +
+					 (((u64b_t)src[n + 1]) << 8) +
+					 (((u64b_t)src[n + 2]) << 16) +
+					 (((u64b_t)src[n + 3]) << 24) +
+					 (((u64b_t)src[n + 4]) << 32) +
+					 (((u64b_t)src[n + 5]) << 40) +
+					 (((u64b_t)src[n + 6]) << 48) +
+					 (((u64b_t)src[n + 7]) << 56);
+}
 #else
-	;    /* output only the function prototype */
+	; /* output only the function prototype */
 #endif
-#endif   /* ifndef Skein_Get64_LSB_First */
+#endif /* ifndef Skein_Get64_LSB_First */
 
-#endif   /* ifndef _SKEIN_PORT_H_ */
+#endif /* ifndef _SKEIN_PORT_H_ */
diff --git a/xmrstak/backend/cpu/crypto/soft_aes.hpp b/xmrstak/backend/cpu/crypto/soft_aes.hpp
index 9b4ae0ab5..3ea75c5e6 100644
--- a/xmrstak/backend/cpu/crypto/soft_aes.hpp
+++ b/xmrstak/backend/cpu/crypto/soft_aes.hpp
@@ -34,56 +34,58 @@
 
 #include <inttypes.h>
 
-#define saes_data(w) {\
-	w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
-	w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
-	w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
-	w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
-	w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
-	w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
-	w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
-	w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
-	w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
-	w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
-	w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
-	w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
-	w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
-	w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
-	w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
-	w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
-	w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
-	w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
-	w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
-	w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
-	w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
-	w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
-	w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
-	w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
-	w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
-	w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
-	w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
-	w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
-	w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
-	w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
-	w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
-	w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
+#define saes_data(w) /* AES S-box bytes in index order, each passed through w() */  \
+	{                                                                               \
+		w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),     \
+			w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76), \
+			w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0), \
+			w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0), \
+			w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), \
+			w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), \
+			w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+			w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75), \
+			w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0), \
+			w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84), \
+			w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b), \
+			w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), \
+			w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), \
+			w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+			w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5), \
+			w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2), \
+			w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17), \
+			w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73), \
+			w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), \
+			w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), \
+			w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+			w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79), \
+			w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9), \
+			w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08), \
+			w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6), \
+			w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), \
+			w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), \
+			w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+			w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94), \
+			w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf), \
+			w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68), \
+			w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16)  \
+	}
 
-#define SAES_WPOLY           0x011b
+#define SAES_WPOLY 0x011b
 
 #define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
-	((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
+								  ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
 
-#define saes_f2(x)   ((x<<1) ^ (((x>>7) & 1) * SAES_WPOLY))
-#define saes_f3(x)   (saes_f2(x) ^ x)
-#define saes_h0(x)   (x)
+#define saes_f2(x) (((x) << 1) ^ ((((x) >> 7) & 1) * SAES_WPOLY)) /* x doubled, XORed with SAES_WPOLY when bit 7 of x is set; x parenthesized so compound arguments expand safely */
+#define saes_f3(x) (saes_f2(x) ^ (x)) /* saes_f2(x) XOR x */
+#define saes_h0(x) (x) /* identity: hands the table byte through unchanged */
 
-#define saes_u0(p)   saes_b2w(saes_f2(p),          p,          p, saes_f3(p))
-#define saes_u1(p)   saes_b2w(saes_f3(p), saes_f2(p),          p,          p)
-#define saes_u2(p)   saes_b2w(         p, saes_f3(p), saes_f2(p),          p)
-#define saes_u3(p)   saes_b2w(         p,          p, saes_f3(p), saes_f2(p))
+#define saes_u0(p) saes_b2w(saes_f2(p), p, p, saes_f3(p))
+#define saes_u1(p) saes_b2w(saes_f3(p), saes_f2(p), p, p)
+#define saes_u2(p) saes_b2w(p, saes_f3(p), saes_f2(p), p)
+#define saes_u3(p) saes_b2w(p, p, saes_f3(p), saes_f2(p))
 
-alignas(16) const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) };
-alignas(16) const uint8_t  saes_sbox[256] = saes_data(saes_h0);
+alignas(16) const uint32_t saes_table[4][256] = {saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3)};
+alignas(16) const uint8_t saes_sbox[256] = saes_data(saes_h0);
 
 static inline __m128i soft_aesenc(__m128i in, __m128i key)
 {
@@ -104,10 +106,10 @@ static inline __m128i soft_aesenc(__m128i in, __m128i key)
 
 static inline uint32_t sub_word(uint32_t key)
 {
-	return (saes_sbox[key >> 24 ] << 24)   |
-		(saes_sbox[(key >> 16) & 0xff] << 16 ) |
-		(saes_sbox[(key >> 8)  & 0xff] << 8  ) |
-		 saes_sbox[key & 0xff];
+	// Replace each of the four bytes of key by its saes_sbox entry, keeping the byte positions.
+	// Widen to uint32_t before shifting: a promoted int would push 0x80..0xff << 24 into the sign bit.
+	return ((uint32_t)saes_sbox[key >> 24] << 24) | ((uint32_t)saes_sbox[(key >> 16) & 0xff] << 16) |
+		   ((uint32_t)saes_sbox[(key >> 8) & 0xff] << 8) | saes_sbox[key & 0xff];
 }
 
 #ifdef __clang__
@@ -121,5 +123,5 @@ static inline __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
 {
 	uint32_t X1 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)));
 	uint32_t X3 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)));
-	return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3,_rotr(X1, 8) ^ rcon, X1);
+	return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3, _rotr(X1, 8) ^ rcon, X1);
 }
diff --git a/xmrstak/backend/cpu/crypto/variant4_random_math.h b/xmrstak/backend/cpu/crypto/variant4_random_math.h
index 50228adf2..9fe61db51 100644
--- a/xmrstak/backend/cpu/crypto/variant4_random_math.h
+++ b/xmrstak/backend/cpu/crypto/variant4_random_math.h
@@ -1,12 +1,12 @@
 #pragma once
 
-#include <string.h>
 #include "../../cryptonight.hpp"
 #include "xmrstak/misc/console.hpp"
+#include <string.h>
 
 extern "C"
 {
-    #include "c_blake256.h"
+#include "c_blake256.h"
 }
 
 enum V4_Settings
@@ -31,13 +31,13 @@ enum V4_Settings
 
 enum V4_InstructionList
 {
-	MUL,	// a*b
-	ADD,	// a+b + C, C is an unsigned 32-bit constant
-	SUB,	// a-b
-	ROR,	// rotate right "a" by "b & 31" bits
-	ROL,	// rotate left "a" by "b & 31" bits
-	XOR,	// a^b
-	RET,	// finish execution
+	MUL, // dst *= src
+	ADD, // dst += src + C, C is an unsigned 32-bit constant
+	SUB, // dst -= src
+	ROR, // rotate dst right by "src % REG_BITS" bits (same as "src & 31" for 32-bit registers)
+	ROL, // rotate dst left by "src % REG_BITS" bits (same as "src & 31" for 32-bit registers)
+	XOR, // dst ^= src
+	RET, // finish execution; its value doubles as V4_INSTRUCTION_COUNT, so RET itself is not counted
 	V4_INSTRUCTION_COUNT = RET,
 };
 
@@ -87,7 +87,7 @@ struct V4_Instruction
 // every switch-case will point to the same destination on every iteration of Cryptonight main loop
 //
 // This is about as fast as it can get without using low-level machine code generation
-template<typename v4_reg>
+template <typename v4_reg>
 static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
 {
 	enum
@@ -95,55 +95,55 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
 		REG_BITS = sizeof(v4_reg) * 8,
 	};
 
-#define V4_EXEC(i) \
-	{ \
-		const struct V4_Instruction* op = code + i; \
-		const v4_reg src = r[op->src_index]; \
-		v4_reg* dst = r + op->dst_index; \
-		switch (op->opcode) \
-		{ \
-		case MUL: \
-			*dst *= src; \
-			break; \
-		case ADD: \
-			*dst += src + op->C; \
-			break; \
-		case SUB: \
-			*dst -= src; \
-			break; \
-		case ROR: \
-			{ \
-				const uint32_t shift = src % REG_BITS; \
-				*dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
-			} \
-			break; \
-		case ROL: \
-			{ \
-				const uint32_t shift = src % REG_BITS; \
-				*dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
-			} \
-			break; \
-		case XOR: \
-			*dst ^= src; \
-			break; \
-		case RET: \
-			return; \
-		default: \
-			UNREACHABLE_CODE; \
-			break; \
-		} \
+#define V4_EXEC(i)                                                              \
+	{                                                                           \
+		const struct V4_Instruction* op = code + i;                             \
+		const v4_reg src = r[op->src_index];                                    \
+		v4_reg* dst = r + op->dst_index;                                        \
+		switch(op->opcode)                                                      \
+		{                                                                       \
+		case MUL:                                                               \
+			*dst *= src;                                                        \
+			break;                                                              \
+		case ADD:                                                               \
+			*dst += src + op->C;                                                \
+			break;                                                              \
+		case SUB:                                                               \
+			*dst -= src;                                                        \
+			break;                                                              \
+		case ROR:                                                               \
+		{                                                                       \
+			const uint32_t shift = src % REG_BITS;                              \
+			*dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
+		}                                                                       \
+		break;                                                                  \
+		case ROL:                                                               \
+		{                                                                       \
+			const uint32_t shift = src % REG_BITS;                              \
+			*dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
+		}                                                                       \
+		break;                                                                  \
+		case XOR:                                                               \
+			*dst ^= src;                                                        \
+			break;                                                              \
+		case RET:                                                               \
+			return;                                                             \
+		default:                                                                \
+			UNREACHABLE_CODE;                                                   \
+			break;                                                              \
+		}                                                                       \
 	}
 
 #define V4_EXEC_10(j) \
-	V4_EXEC(j + 0) \
-	V4_EXEC(j + 1) \
-	V4_EXEC(j + 2) \
-	V4_EXEC(j + 3) \
-	V4_EXEC(j + 4) \
-	V4_EXEC(j + 5) \
-	V4_EXEC(j + 6) \
-	V4_EXEC(j + 7) \
-	V4_EXEC(j + 8) \
+	V4_EXEC(j + 0)    \
+	V4_EXEC(j + 1)    \
+	V4_EXEC(j + 2)    \
+	V4_EXEC(j + 3)    \
+	V4_EXEC(j + 4)    \
+	V4_EXEC(j + 5)    \
+	V4_EXEC(j + 6)    \
+	V4_EXEC(j + 7)    \
+	V4_EXEC(j + 8)    \
 	V4_EXEC(j + 9)
 
 	// Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
@@ -161,13 +161,13 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
 	// 69      102
 
 	// Unroll 70 instructions here
-	V4_EXEC_10(0);		// instructions 0-9
-	V4_EXEC_10(10);		// instructions 10-19
-	V4_EXEC_10(20);		// instructions 20-29
-	V4_EXEC_10(30);		// instructions 30-39
-	V4_EXEC_10(40);		// instructions 40-49
-	V4_EXEC_10(50);		// instructions 50-59
-	V4_EXEC_10(60);		// instructions 60-69
+	V4_EXEC_10(0);  // instructions 0-9
+	V4_EXEC_10(10); // instructions 10-19
+	V4_EXEC_10(20); // instructions 20-29
+	V4_EXEC_10(30); // instructions 30-39
+	V4_EXEC_10(40); // instructions 40-49
+	V4_EXEC_10(50); // instructions 50-59
+	V4_EXEC_10(60); // instructions 60-69
 
 #undef V4_EXEC_10
 #undef V4_EXEC
@@ -176,7 +176,7 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
 // If we don't have enough data available, generate more
 static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size)
 {
-	if (*data_index + bytes_needed > data_size)
+	if(*data_index + bytes_needed > data_size)
 	{
 		blake256_hash((uint8_t*)data, (uint8_t*)data, data_size);
 		*data_index = 0;
@@ -188,7 +188,7 @@ static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed
 
 // Generates as many random math operations as possible with given latency and ALU restrictions
 // "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
-template<xmrstak_algo_id ALGO>
+template <xmrstak_algo_id ALGO>
 static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
 {
 	printer::inst()->print_msg(LDEBUG, "CryptonightR create random math for block %llu", height);
@@ -199,13 +199,13 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 	// Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors
 	// AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same
 	// Source: https://www.agner.org/optimize/instruction_tables.pdf
-	const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 };
+	const int op_latency[V4_INSTRUCTION_COUNT] = {3, 2, 1, 2, 2, 1};
 
 	// Instruction latencies for theoretical ASIC implementation
-	const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 };
+	const int asic_op_latency[V4_INSTRUCTION_COUNT] = {3, 1, 1, 1, 1, 1};
 
 	// Available ALUs for each instruction
-	const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT };
+	const int op_ALUs[V4_INSTRUCTION_COUNT] = {ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT};
 
 	int8_t data[32];
 	memset(data, 0, sizeof(data));
@@ -226,7 +226,8 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 	// There is a small chance (1.8%) that register R8 won't be used in the generated program
 	// So we keep track of it and try again if it's not used
 	bool r8_used;
-	do {
+	do
+	{
 		int latency[9];
 		int asic_latency[9];
 
@@ -237,7 +238,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 		//
 		// Registers R4-R8 are constant and are treated as having the same value because when we do
 		// the same operation twice with two constant source registers, it can be optimized into a single operation
-		uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
+		uint32_t inst_data[9] = {0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF};
 
 		bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT];
 		bool is_rotation[V4_INSTRUCTION_COUNT];
@@ -260,11 +261,11 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 
 		// Generate random code to achieve minimal required latency for our abstract CPU
 		// Try to get this latency for all 4 registers
-		while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
+		while(((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
 		{
 			// Fail-safe to guarantee loop termination
 			++total_iterations;
-			if (total_iterations > 256)
+			if(total_iterations > 256)
 				break;
 
 			check_data(&data_index, 1, data, sizeof(data));
@@ -277,12 +278,12 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			// ROR/ROL = opcode 5, shift direction is selected randomly
 			// XOR = opcodes 6-7
 			uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1);
-			if (opcode == 5)
+			if(opcode == 5)
 			{
 				check_data(&data_index, 1, data, sizeof(data));
 				opcode = (data[data_index++] >= 0) ? ROR : ROL;
 			}
-			else if (opcode >= 6)
+			else if(opcode >= 6)
 			{
 				opcode = XOR;
 			}
@@ -298,7 +299,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			int b = src_index;
 
 			// Don't do ADD/SUB/XOR with the same register
-			if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
+			if(((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
 			{
 				// a is always < 4, so we don't need to check bounds here
 				b = (ALGO == cryptonight_r_wow) ? (a + 4) : 8;
@@ -306,7 +307,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			}
 
 			// Don't do rotation with the same destination twice because it's equal to a single rotation
-			if (is_rotation[opcode] && rotated[a])
+			if(is_rotation[opcode] && rotated[a])
 			{
 				continue;
 			}
@@ -314,7 +315,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			// Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized:
 			// 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations
 			// 2xXOR(a, b) = NOP
-			if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
+			if((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
 			{
 				continue;
 			}
@@ -322,20 +323,20 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			// Find which ALU is available (and when) for this instruction
 			int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b];
 			int alu_index = -1;
-			while (next_latency < TOTAL_LATENCY)
+			while(next_latency < TOTAL_LATENCY)
 			{
-				for (int i = op_ALUs[opcode] - 1; i >= 0; --i)
+				for(int i = op_ALUs[opcode] - 1; i >= 0; --i)
 				{
-					if (!alu_busy[next_latency][i])
+					if(!alu_busy[next_latency][i])
 					{
 						// ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check
-						if ((opcode == ADD) && alu_busy[next_latency + 1][i])
+						if((opcode == ADD) && alu_busy[next_latency + 1][i])
 						{
 							continue;
 						}
 
 						// Rotation can only start when previous rotation is finished, so do an additional availability check
-						if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
+						if(is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
 						{
 							continue;
 						}
@@ -344,7 +345,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 						break;
 					}
 				}
-				if (alu_index >= 0)
+				if(alu_index >= 0)
 				{
 					break;
 				}
@@ -352,16 +353,16 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			}
 
 			// Don't generate instructions that leave some register unchanged for more than 7 cycles
-			if (next_latency > latency[a] + 7)
+			if(next_latency > latency[a] + 7)
 			{
 				continue;
 			}
 
 			next_latency += op_latency[opcode];
 
-			if (next_latency <= TOTAL_LATENCY)
+			if(next_latency <= TOTAL_LATENCY)
 			{
-				if (is_rotation[opcode])
+				if(is_rotation[opcode])
 				{
 					++rotate_count;
 				}
@@ -382,12 +383,12 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 				code[code_size].src_index = src_index;
 				code[code_size].C = 0;
 
-				if (src_index == 8)
+				if(src_index == 8)
 				{
 					r8_used = true;
 				}
 
-				if (opcode == ADD)
+				if(opcode == ADD)
 				{
 					// ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
 					alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true;
@@ -401,7 +402,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 				}
 
 				++code_size;
-				if (code_size >= NUM_INSTRUCTIONS_MIN)
+				if(code_size >= NUM_INSTRUCTIONS_MIN)
 				{
 					break;
 				}
@@ -416,17 +417,19 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 		// We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
 		// Get this latency for at least 1 of the 4 registers
 		const int prev_code_size = code_size;
-		while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
+		while((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
 		{
 			int min_idx = 0;
 			int max_idx = 0;
-			for (int i = 1; i < 4; ++i)
+			for(int i = 1; i < 4; ++i)
 			{
-				if (asic_latency[i] < asic_latency[min_idx]) min_idx = i;
-				if (asic_latency[i] > asic_latency[max_idx]) max_idx = i;
+				if(asic_latency[i] < asic_latency[min_idx])
+					min_idx = i;
+				if(asic_latency[i] > asic_latency[max_idx])
+					max_idx = i;
 			}
 
-			const uint8_t pattern[3] = { ROR, MUL, MUL };
+			const uint8_t pattern[3] = {ROR, MUL, MUL};
 			const uint8_t opcode = pattern[(code_size - prev_code_size) % 3];
 			latency[min_idx] = latency[max_idx] + op_latency[opcode];
 			asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode];
@@ -438,9 +441,9 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			++code_size;
 		}
 
-	// There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
-	// It never does more than 4 iterations for all block heights < 10,000,000
-	}  while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
+		// There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
+		// It never does more than 4 iterations for all block heights < 10,000,000
+	} while(!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
 
 	// It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
 	// Add final instruction to stop the interpreter
diff --git a/xmrstak/backend/cpu/hwlocMemory.cpp b/xmrstak/backend/cpu/hwlocMemory.cpp
index 089570fc0..067f27975 100644
--- a/xmrstak/backend/cpu/hwlocMemory.cpp
+++ b/xmrstak/backend/cpu/hwlocMemory.cpp
@@ -6,6 +6,24 @@
 
 #include <hwloc.h>
 
+/** bind memory to a nodeset through whichever call this hwloc version offers
+ *
+ * When HWLOC_API_VERSION >= 0x20000 this forwards to hwloc_set_membind() with
+ * HWLOC_MEMBIND_BYNODESET OR-ed into flags; otherwise it forwards to
+ * hwloc_set_membind_nodeset() with flags passed through untouched.
+ *
+ * @return result of the forwarded hwloc call (bindMemoryToNUMANode treats < 0 as failure)
+ */
+static __hwloc_inline int
+xmrstak_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+#if HWLOC_API_VERSION >= 0x20000
+	return hwloc_set_membind(topology, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+#else
+	return hwloc_set_membind_nodeset(topology, nodeset, policy, flags);
+#endif
+}
+
 /** pin memory to NUMA node
  *
  * Set the default memory policy for the current thread to bind memory to the
@@ -13,7 +31,7 @@
  *
  * @param puId core id
  */
-void bindMemoryToNUMANode( size_t puId )
+void bindMemoryToNUMANode(size_t puId)
 {
 	int depth;
 	hwloc_topology_t topology;
@@ -30,18 +48,18 @@ void bindMemoryToNUMANode( size_t puId )
 
 	depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
 
-	for( uint32_t i = 0;
+	for(uint32_t i = 0;
 		i < hwloc_get_nbobjs_by_depth(topology, depth);
-		i++ )
+		i++)
 	{
 		hwloc_obj_t pu = hwloc_get_obj_by_depth(topology, depth, i);
-		if(  pu->os_index == puId )
+		if(pu->os_index == puId)
 		{
-			if( 0 > hwloc_set_membind_nodeset(
-				topology,
-				pu->nodeset,
-				HWLOC_MEMBIND_BIND,
-				HWLOC_MEMBIND_THREAD))
+			if(0 > xmrstak_set_membind_nodeset(
+					   topology,
+					   pu->nodeset,
+					   HWLOC_MEMBIND_BIND,
+					   HWLOC_MEMBIND_THREAD))
 			{
 				printer::inst()->print_msg(L0, "hwloc: can't bind memory");
 			}
@@ -57,7 +75,7 @@ void bindMemoryToNUMANode( size_t puId )
 }
 #else
 
-void bindMemoryToNUMANode( size_t )
+void bindMemoryToNUMANode(size_t)
 {
 }
 
diff --git a/xmrstak/backend/cpu/hwlocMemory.hpp b/xmrstak/backend/cpu/hwlocMemory.hpp
index 2130c2ced..42fa3456f 100644
--- a/xmrstak/backend/cpu/hwlocMemory.hpp
+++ b/xmrstak/backend/cpu/hwlocMemory.hpp
@@ -9,4 +9,4 @@
  *
  * @param puId core id
  */
-void bindMemoryToNUMANode( size_t puId );
+void bindMemoryToNUMANode(size_t puId);
diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index a14be1732..a7bb91d61 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -37,7 +37,6 @@
 #include <cpuid.h>
 #endif
 
-
 namespace xmrstak
 {
 namespace cpu
@@ -48,9 +47,14 @@ using namespace rapidjson;
 /*
  * This enum needs to match index in oConfigValues, otherwise we will get a runtime error
  */
-enum configEnum { aCpuThreadsConf, sUseSlowMem };
+enum configEnum
+{
+	aCpuThreadsConf,
+	sUseSlowMem
+};
 
-struct configVal {
+struct configVal
+{
 	configEnum iName;
 	const char* sName;
 	Type iType;
@@ -59,10 +63,9 @@ struct configVal {
 // Same order as in configEnum, as per comment above
 // kNullType means any type
 configVal oConfigValues[] = {
-	{ aCpuThreadsConf, "cpu_threads_conf", kNullType }
-};
+	{aCpuThreadsConf, "cpu_threads_conf", kNullType}};
 
-constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
+constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0]));
 
 inline bool checkType(Type have, Type want)
 {
@@ -95,7 +98,7 @@ jconf::jconf()
 	prv = new opaque_private();
 }
 
-bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
+bool jconf::GetThreadConfig(size_t id, thd_cfg& cfg)
 {
 	if(!prv->configValues[aCpuThreadsConf]->IsArray())
 		return false;
@@ -148,7 +151,6 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	return true;
 }
 
-
 size_t jconf::GetThreadCount()
 {
 	if(prv->configValues[aCpuThreadsConf]->IsArray())
@@ -159,22 +161,22 @@ size_t jconf::GetThreadCount()
 
 bool jconf::parse_config(const char* sFilename)
 {
-	FILE * pFile;
-	char * buffer;
+	FILE* pFile;
+	char* buffer;
 	size_t flen;
 
 	pFile = fopen(sFilename, "rb");
-	if (pFile == NULL)
+	if(pFile == NULL)
 	{
 		printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename);
 		return false;
 	}
 
-	fseek(pFile,0,SEEK_END);
+	fseek(pFile, 0, SEEK_END);
 	flen = ftell(pFile);
 	rewind(pFile);
 
-	if(flen >= 64*1024)
+	if(flen >= 64 * 1024)
 	{
 		fclose(pFile);
 		printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename);
@@ -189,7 +191,7 @@ bool jconf::parse_config(const char* sFilename)
 	}
 
 	buffer = (char*)malloc(flen + 3);
-	if(fread(buffer+1, flen, 1, pFile) != 1)
+	if(fread(buffer + 1, flen, 1, pFile) != 1)
 	{
 		free(buffer);
 		fclose(pFile);
@@ -211,7 +213,7 @@ bool jconf::parse_config(const char* sFilename)
 	buffer[flen] = '}';
 	buffer[flen + 1] = '\0';
 
-	prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2);
+	prv->jsonDoc.Parse<kParseCommentsFlag | kParseTrailingCommasFlag>(buffer, flen + 2);
 	free(buffer);
 
 	if(prv->jsonDoc.HasParseError())
@@ -251,7 +253,7 @@ bool jconf::parse_config(const char* sFilename)
 	}
 
 	thd_cfg c;
-	for(size_t i=0; i < GetThreadCount(); i++)
+	for(size_t i = 0; i < GetThreadCount(); i++)
 	{
 		if(!GetThreadConfig(i, c))
 		{
diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp
index 4ec9165d5..67dbd0275 100644
--- a/xmrstak/backend/cpu/jconf.hpp
+++ b/xmrstak/backend/cpu/jconf.hpp
@@ -12,16 +12,18 @@ namespace cpu
 
 class jconf
 {
-public:
+  public:
 	static jconf* inst()
 	{
-		if (oInst == nullptr) oInst = new jconf;
+		if(oInst == nullptr) // first call: no instance exists yet (no locking is done here)
+			oInst = new jconf; // create it once; later calls return this same pointer
 		return oInst;
 	};
 
 	bool parse_config(const char* sFilename = params::inst().configFileCPU.c_str());
 
-	struct thd_cfg {
+	struct thd_cfg
+	{
 		int iMultiway;
 		bool bNoPrefetch;
 		std::string asm_version_str;
@@ -29,10 +31,10 @@ class jconf
 	};
 
 	size_t GetThreadCount();
-	bool GetThreadConfig(size_t id, thd_cfg &cfg);
+	bool GetThreadConfig(size_t id, thd_cfg& cfg);
 	bool NeedsAutoconf();
 
-private:
+  private:
 	jconf();
 	static jconf* oInst;
 
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index e90b59500..43759776f 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -23,33 +23,34 @@
 
 #include "crypto/cryptonight_aesni.h"
 
-#include "xmrstak/misc/console.hpp"
-#include "xmrstak/backend/iBackend.hpp"
+#include "jconf.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
 #include "xmrstak/backend/globalStates.hpp"
+#include "xmrstak/backend/iBackend.hpp"
 #include "xmrstak/misc/configEditor.hpp"
-#include "xmrstak/backend/cpu/cpuType.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/params.hpp"
-#include "jconf.hpp"
 
-#include "xmrstak/misc/executor.hpp"
 #include "minethd.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/executor.hpp"
 
 #include "hwlocMemory.hpp"
 #include "xmrstak/backend/miner_work.hpp"
 
 #ifndef CONF_NO_HWLOC
-#   include "autoAdjustHwloc.hpp"
+#include "autoAdjustHwloc.hpp"
+#include "autoAdjust.hpp"
 #else
-#   include "autoAdjust.hpp"
+#include "autoAdjust.hpp"
 #endif
 
 #include <assert.h>
-#include <cmath>
+#include <bitset>
 #include <chrono>
+#include <cmath>
 #include <cstring>
 #include <thread>
-#include <bitset>
 #include <unordered_map>
 
 #ifdef _WIN32
@@ -58,9 +59,9 @@
 #include <pthread.h>
 
 #if defined(__APPLE__)
-#include <mach/thread_policy.h>
 #include <mach/thread_act.h>
-#define SYSCTL_CORE_COUNT   "machdep.cpu.core_count"
+#include <mach/thread_policy.h>
+#define SYSCTL_CORE_COUNT "machdep.cpu.core_count"
 #elif defined(__FreeBSD__)
 #include <pthread_np.h>
 #endif //__APPLE__
@@ -87,7 +88,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
 	}
 #elif defined(__APPLE__)
 	thread_port_t mach_thread;
-	thread_affinity_policy_data_t policy = { static_cast<integer_t>(cpu_id) };
+	thread_affinity_policy_data_t policy = {static_cast<integer_t>(cpu_id)};
 	mach_thread = pthread_mach_thread_np(h);
 	return thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1) == KERN_SUCCESS;
 #elif defined(__FreeBSD__)
@@ -96,8 +97,8 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
 	CPU_SET(cpu_id, &mn);
 	return pthread_setaffinity_np(h, sizeof(cpuset_t), &mn) == 0;
 #elif defined(__OpenBSD__)
-        printer::inst()->print_msg(L0,"WARNING: thread pinning is not supported under OPENBSD.");
-        return true;
+	printer::inst()->print_msg(L0, "WARNING: thread pinning is not supported under OPENBSD.");
+	return true;
 #else
 	cpu_set_t mn;
 	CPU_ZERO(&mn);
@@ -120,7 +121,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch,
 	std::unique_lock<std::mutex> lck(thd_aff_set);
 	std::future<void> order_guard = order_fix.get_future();
 
-	switch (iMultiway)
+	switch(iMultiway)
 	{
 	case 5:
 		oWorkThd = std::thread(&minethd::penta_work_main, this);
@@ -150,13 +151,13 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch,
 cryptonight_ctx* minethd::minethd_alloc_ctx()
 {
 	cryptonight_ctx* ctx;
-	alloc_msg msg = { 0 };
+	alloc_msg msg = {0};
 
-	switch (::jconf::inst()->GetSlowMemSetting())
+	switch(::jconf::inst()->GetSlowMemSetting())
 	{
 	case ::jconf::never_use:
 		ctx = cryptonight_alloc_ctx(1, 1, &msg);
-		if (ctx == NULL)
+		if(ctx == NULL)
 			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning);
 		else
 		{
@@ -170,7 +171,7 @@ cryptonight_ctx* minethd::minethd_alloc_ctx()
 
 	case ::jconf::no_mlck:
 		ctx = cryptonight_alloc_ctx(1, 0, &msg);
-		if (ctx == NULL)
+		if(ctx == NULL)
 			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning);
 		else
 		{
@@ -184,12 +185,12 @@ cryptonight_ctx* minethd::minethd_alloc_ctx()
 
 	case ::jconf::print_warning:
 		ctx = cryptonight_alloc_ctx(1, 1, &msg);
-		if (msg.warning != NULL)
+		if(msg.warning != NULL)
 			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning);
-		if (ctx == NULL)
+		if(ctx == NULL)
 			ctx = cryptonight_alloc_ctx(0, 0, NULL);
 
-		if (ctx != NULL)
+		if(ctx != NULL)
 		{
 			ctx->hash_fn = nullptr;
 			ctx->loop_fn = nullptr;
@@ -220,11 +221,11 @@ cryptonight_ctx* minethd::minethd_alloc_ctx()
 static constexpr size_t MAX_N = 5;
 bool minethd::self_test()
 {
-	alloc_msg msg = { 0 };
+	alloc_msg msg = {0};
 	size_t res;
 	bool fatal = false;
 
-	switch (::jconf::inst()->GetSlowMemSetting())
+	switch(::jconf::inst()->GetSlowMemSetting())
 	{
 	case ::jconf::never_use:
 		res = cryptonight_init(1, 1, &msg);
@@ -255,13 +256,13 @@ bool minethd::self_test()
 	if(res == 0 && fatal)
 		return false;
 
-	cryptonight_ctx *ctx[MAX_N] = {0};
-	for (int i = 0; i < MAX_N; i++)
+	cryptonight_ctx* ctx[MAX_N] = {0};
+	for(int i = 0; i < MAX_N; i++)
 	{
-		if ((ctx[i] = minethd_alloc_ctx()) == nullptr)
+		if((ctx[i] = minethd_alloc_ctx()) == nullptr)
 		{
 			printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory.");
-			for (int j = 0; j < i; j++)
+			for(int j = 0; j < i; j++)
 				cryptonight_free_ctx(ctx[j]);
 			return false;
 		}
@@ -279,63 +280,68 @@ bool minethd::self_test()
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test", 14, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+			bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
 
 			minethd::cn_on_new_job dm;
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test", 14, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+			bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
 
 			func_multi_selector<2>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
-					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+			bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+											 "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22",
+									 64) == 0;
 
 			func_multi_selector<2>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
-					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+			bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+											 "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22",
+									 64) == 0;
 
 			func_multi_selector<3>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a testThis is a testThis is a test", 14, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
+			bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05",
+									 96) == 0;
 
 			func_multi_selector<4>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
+			bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05",
+									 128) == 0;
 
 			func_multi_selector<5>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
+			bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05",
+									 160) == 0;
 		}
 		else if(algo == POW(cryptonight_lite))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
+			bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
+			bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_monero))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
+			bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
+			bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_monero_v8))
 		{
@@ -351,61 +357,61 @@ bool minethd::self_test()
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
+			bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
+			bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_ipbc))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0;
+			bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
+			bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_stellite))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+			bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+			bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_masari))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+			bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+			bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_heavy))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+			bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+			bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_haven))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+			bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+			bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_bittube2))
 		{
@@ -415,7 +421,7 @@ bool minethd::self_test()
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 
 			ctx[0]->hash_fn("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
+			bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
 
 			ctx[0]->hash_fn("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx, algo);
 			bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0;
@@ -427,29 +433,29 @@ bool minethd::self_test()
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0;
+			bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_gpu))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("", 0, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
+			bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("", 0, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
+			bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_conceal))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("", 0, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0;
+			bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("", 0, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0;
+			bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0;
 		}
-		else if (algo == POW(cryptonight_turtle))
+		else if(algo == POW(cryptonight_turtle))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
@@ -467,7 +473,7 @@ bool minethd::self_test()
 			work.iBlockHeight = 1806260;
 			set_job(work, ctx);
 			ctx[0]->hash_fn("\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xf7\x59\x58\x8a\xd5\x7e\x75\x84\x67\x29\x54\x43\xa9\xbd\x71\x49\x0a\xbf\xf8\xe9\xda\xd1\xb9\x5b\x6b\xf2\xf5\xd0\xd7\x83\x87\xbc", 32) == 0;
+			bResult = bResult && memcmp(out, "\xf7\x59\x58\x8a\xd5\x7e\x75\x84\x67\x29\x54\x43\xa9\xbd\x71\x49\x0a\xbf\xf8\xe9\xda\xd1\xb9\x5b\x6b\xf2\xf5\xd0\xd7\x83\x87\xbc", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_v8_reversewaltz))
 		{
@@ -498,7 +504,7 @@ bool minethd::self_test()
 				"Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
 	}
 
-	for (int i = 0; i < MAX_N; i++)
+	for(int i = 0; i < MAX_N; i++)
 		cryptonight_free_ctx(ctx[i]);
 
 	return bResult;
@@ -510,9 +516,23 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 
 	if(!configEditor::file_exist(params::inst().configFileCPU))
 	{
+#ifndef CONF_NO_HWLOC
+		autoAdjustHwloc adjustHwloc;
+		if(!adjustHwloc.printConfig())
+		{
+			autoAdjust adjust;
+			if(!adjust.printConfig())
+			{
+				return pvThreads;
+			}
+		}
+#else
 		autoAdjust adjust;
 		if(!adjust.printConfig())
+		{
 			return pvThreads;
+		}
+#endif
 	}
 
 	if(!jconf::inst()->parse_config())
@@ -520,14 +540,13 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 		win_exit();
 	}
 
-
 	//Launch the requested number of single and double threads, to distribute
 	//load evenly we need to alternate single and double threads
 	size_t i, n = jconf::inst()->GetThreadCount();
 	pvThreads.reserve(n);
 
 	jconf::thd_cfg cfg;
-	for (i = 0; i < n; i++)
+	for(i = 0; i < n; i++)
 	{
 		jconf::inst()->GetThreadConfig(i, cfg);
 
@@ -572,11 +591,11 @@ static std::string getAsmName(const uint32_t num_hashes)
 	return asm_type;
 }
 
-template<size_t N>
+template <size_t N>
 void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& on_new_job,
 	bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str)
 {
-	static_assert(N >= 1, "number of threads must be >= 1" );
+	static_assert(N >= 1, "number of threads must be >= 1");
 
 	// We have two independent flag bits in the functions
 	// therefore we will build a binary digit and select the
@@ -717,21 +736,20 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job&
 		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, false, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, true, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, false, true>,
-		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, true, true>
-	};
+		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, true, true>};
 
 	std::bitset<2> digit;
 	digit.set(0, !bHaveAes);
 	digit.set(1, !bNoPrefetch);
 
-	ctx[0]->hash_fn = func_table[ algv << 2 | digit.to_ulong() ];
+	ctx[0]->hash_fn = func_table[algv << 2 | digit.to_ulong()];
 
 	// check for asm optimized version for cryptonight_v8
 	if(algo == cryptonight_monero_v8)
 	{
 		std::string selected_asm = asm_version_str;
 		if(selected_asm == "auto")
-				selected_asm = cpu::getAsmName(N);
+			selected_asm = cpu::getAsmName(N);
 
 		if(selected_asm != "off")
 		{
@@ -747,7 +765,7 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job&
 	{
 		std::string selected_asm = asm_version_str;
 		if(selected_asm == "auto")
-				selected_asm = cpu::getAsmName(N);
+			selected_asm = cpu::getAsmName(N);
 		if(selected_asm == "off")
 		{
 			for(int h = 0; h < N; ++h)
@@ -769,7 +787,7 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job&
 	};
 
 	auto it = on_new_job_map.find(algo.Id());
-	if (it != on_new_job_map.end())
+	if(it != on_new_job_map.end())
 		on_new_job = it->second;
 	else
 		on_new_job = nullptr;
@@ -806,18 +824,18 @@ void minethd::penta_work_main()
 	multiway_work_main<5u>();
 }
 
-template<size_t N>
-void minethd::prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce)
+template <size_t N>
+void minethd::prep_multiway_work(uint8_t* bWorkBlob, uint32_t** piNonce)
 {
-	for (size_t i = 0; i < N; i++)
+	for(size_t i = 0; i < N; i++)
 	{
 		memcpy(bWorkBlob + oWork.iWorkSize * i, oWork.bWorkBlob, oWork.iWorkSize);
-		if (i > 0)
+		if(i > 0)
 			piNonce[i] = (uint32_t*)(bWorkBlob + oWork.iWorkSize * i + 39);
 	}
 }
 
-template<uint32_t N>
+template <uint32_t N>
 void minethd::multiway_work_main()
 {
 	if(affinity >= 0) //-1 means no affinity
@@ -825,25 +843,26 @@ void minethd::multiway_work_main()
 
 	order_fix.set_value();
 	std::unique_lock<std::mutex> lck(thd_aff_set);
-	lck.release();
+	lck.unlock();
 	std::this_thread::yield();
 
-	cryptonight_ctx *ctx[MAX_N];
+	cryptonight_ctx* ctx[MAX_N];
 	uint64_t iCount = 0;
-	uint64_t *piHashVal[MAX_N];
-	uint32_t *piNonce[MAX_N];
+	uint64_t iLastCount = 0;
+	uint64_t* piHashVal[MAX_N];
+	uint32_t* piNonce[MAX_N];
 	uint8_t bHashOut[MAX_N * 32];
 	uint8_t bWorkBlob[sizeof(miner_work::bWorkBlob) * MAX_N];
 	uint32_t iNonce;
 	job_result res;
 
-	for (size_t i = 0; i < N; i++)
+	for(size_t i = 0; i < N; i++)
 	{
 		ctx[i] = minethd_alloc_ctx();
 		if(ctx[i] == nullptr)
 		{
 			printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory.");
-			for (int j = 0; j < i; j++)
+			for(size_t j = 0; j < i; j++) // size_t matches 'i'; avoids a signed/unsigned comparison
 				cryptonight_free_ctx(ctx[j]);
 			win_exit(1);
 		}
@@ -863,15 +882,15 @@ void minethd::multiway_work_main()
 	size_t lastPoolId = 0;
 
 	func_multi_selector<N>(ctx, on_new_job, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
-	while (bQuit == 0)
+	while(bQuit == 0)
 	{
-		if (oWork.bStall)
+		if(oWork.bStall)
 		{
 			/*	We are stalled here because the executor didn't find a job for us yet,
 			either because of network latency, or a socket problem. Since we are
 			raison d'etre of this software it us sensible to just wait until we have something*/
 
-			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+			while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 				std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
 			globalStates::inst().consume_work(oWork, iJobNo);
@@ -908,13 +927,12 @@ void minethd::multiway_work_main()
 		if(on_new_job != nullptr)
 			on_new_job(oWork, ctx);
 
-		while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+		while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 		{
-			if ((iCount++ & 0x7) == 0)  //Store stats every 8*N hashes
+			if((iCount++ & 0x7) == 0) //Store stats every 8*N hashes
 			{
-				uint64_t iStamp = get_timestamp_ms();
-				iHashCount.store(iCount * N, std::memory_order_relaxed);
-				iTimestamp.store(iStamp, std::memory_order_relaxed);
+				updateStats((iCount - iLastCount) * N, oWork.iPoolId);
+				iLastCount = iCount;
 			}
 
 			nonce_ctr -= N;
@@ -927,19 +945,18 @@ void minethd::multiway_work_main()
 					break;
 			}
 
-			for (size_t i = 0; i < N; i++)
+			for(size_t i = 0; i < N; i++)
 				*piNonce[i] = iNonce++;
 
 			ctx[0]->hash_fn(bWorkBlob, oWork.iWorkSize, bHashOut, ctx, miner_algo);
 
-			for (size_t i = 0; i < N; i++)
+			for(size_t i = 0; i < N; i++)
 			{
-				if (*piHashVal[i] < oWork.iTarget)
+				if(*piHashVal[i] < oWork.iTarget)
 				{
 					executor::inst()->push_event(
 						ex_event(job_result(oWork.sJobID, iNonce - N + i, bHashOut + 32 * i, iThreadNo, miner_algo),
-						oWork.iPoolId)
-					);
+							oWork.iPoolId));
 				}
 			}
 
@@ -950,7 +967,7 @@ void minethd::multiway_work_main()
 		prep_multiway_work<N>(bWorkBlob, piNonce);
 	}
 
-	for (int i = 0; i < N; i++)
+	for(size_t i = 0; i < N; i++) // unsigned index, consistent with the other loops over N in this function
 		cryptonight_free_ctx(ctx[i]);
 }
 
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 1e25f5d4f..a5201f37a 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -1,15 +1,15 @@
 #pragma once
 
-#include "xmrstak/jconf.hpp"
 #include "crypto/cryptonight.h"
-#include "xmrstak/backend/miner_work.hpp"
 #include "xmrstak/backend/iBackend.hpp"
+#include "xmrstak/backend/miner_work.hpp"
+#include "xmrstak/jconf.hpp"
 
+#include <atomic>
+#include <future>
 #include <iostream>
 #include <thread>
 #include <vector>
-#include <atomic>
-#include <future>
 
 namespace xmrstak
 {
@@ -18,7 +18,7 @@ namespace cpu
 
 class minethd : public iBackend
 {
-public:
+  public:
 	static std::vector<iBackend*> thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool self_test();
 
@@ -29,19 +29,18 @@ class minethd : public iBackend
 
 	static cryptonight_ctx* minethd_alloc_ctx();
 
-	template<size_t N>
+	template <size_t N>
 	static void func_multi_selector(cryptonight_ctx**, minethd::cn_on_new_job& on_new_job,
-			bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str = "off");
+		bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str = "off");
 
-	private:
-		
+  private:
 	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version);
 
-	template<uint32_t N>
+	template <uint32_t N>
 	void multiway_work_main();
 
-	template<size_t N>
-	void prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce);
+	template <size_t N>
+	void prep_multiway_work(uint8_t* bWorkBlob, uint32_t** piNonce);
 
 	void work_main();
 	void double_work_main();
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index e58665922..262865ea0 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -1,9 +1,9 @@
 #pragma once
-#include <stddef.h>
+#include <array>
 #include <inttypes.h>
-#include <type_traits>
+#include <stddef.h>
 #include <string>
-#include <array>
+#include <type_traits>
 
 constexpr size_t start_derived_algo_id = 1000;
 
@@ -15,10 +15,10 @@ enum xmrstak_algo_id
 	cryptonight_monero = 3,
 	cryptonight_heavy = 4,
 	cryptonight_aeon = 5,
-	cryptonight_ipbc = 6, // equal to cryptonight_aeon with a small tweak in the miner code
-	cryptonight_stellite = 7, //equal to cryptonight_monero but with one tiny change
-	cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari
-	cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak
+	cryptonight_ipbc = 6,	  // equal to cryptonight_aeon with a small tweak in the miner code
+	cryptonight_stellite = 7,  //equal to cryptonight_monero but with one tiny change
+	cryptonight_masari = 8,	//equal to cryptonight_monero but with less iterations, used by masari
+	cryptonight_haven = 9,	 // equal to cryptonight_heavy with a small tweak
 	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
 	cryptonight_monero_v8 = 11,
 	cryptonight_superfast = 12,
@@ -42,35 +42,32 @@ enum xmrstak_algo_id
 inline std::string get_algo_name(xmrstak_algo_id algo_id)
 {
 	static std::array<std::string, 18> base_algo_names =
-	{{
-		"invalid_algo",
-		"cryptonight",
-		"cryptonight_lite",
-		"cryptonight_v7",
-		"cryptonight_heavy",
-		"cryptonight_lite_v7",
-		"cryptonight_lite_v7_xor",
-		"cryptonight_v7_stellite",
-		"cryptonight_masari",
-		"cryptonight_haven",
-		"cryptonight_bittube2",
-		"cryptonight_v8",
-		"cryptonight_superfast",
-		"cryptonight_gpu",
-		"cryptonight_conceal",
-		"cryptonight_r_wow",
-		"cryptonight_r",
-		"cryptonight_v8_reversewaltz" // used by graft
-	}};
+		{{
+			"invalid_algo",
+			"cryptonight",
+			"cryptonight_lite",
+			"cryptonight_v7",
+			"cryptonight_heavy",
+			"cryptonight_lite_v7",
+			"cryptonight_lite_v7_xor",
+			"cryptonight_v7_stellite",
+			"cryptonight_masari",
+			"cryptonight_haven",
+			"cryptonight_bittube2",
+			"cryptonight_v8",
+			"cryptonight_superfast",
+			"cryptonight_gpu",
+			"cryptonight_conceal",
+			"cryptonight_r_wow",
+			"cryptonight_r",
+			"cryptonight_v8_reversewaltz" // used by graft
+		}};
 
 	static std::array<std::string, 4> derived_algo_names =
-	{{
-		"cryptonight_turtle",
-		"cryptonight_v8_half", // used by masari and stellite
-		"cryptonight_v8_zelerius",
-		"cryptonight_v8_double"
-	}};
-
+		{{"cryptonight_turtle",
+			"cryptonight_v8_half", // used by masari and stellite
+			"cryptonight_v8_zelerius",
+			"cryptonight_v8_double"}};
 
 	if(algo_id < start_derived_algo_id)
 		return base_algo_names[algo_id];
@@ -80,19 +77,35 @@ inline std::string get_algo_name(xmrstak_algo_id algo_id)
 
 struct xmrstak_algo
 {
-	xmrstak_algo(xmrstak_algo_id name_id) : algo_name(name_id), base_algo(name_id)
+	xmrstak_algo(xmrstak_algo_id name_id) :
+		algo_name(name_id),
+		base_algo(name_id)
 	{
 	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) : algo_name(name_id), base_algo(algorithm)
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) :
+		algo_name(name_id),
+		base_algo(algorithm)
 	{
 	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) : algo_name(name_id), base_algo(algorithm), iter(iteration)
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) :
+		algo_name(name_id),
+		base_algo(algorithm),
+		iter(iteration)
 	{
 	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory)
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) :
+		algo_name(name_id),
+		base_algo(algorithm),
+		iter(iteration),
+		mem(memory)
 	{
 	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory), mask(mem_mask)
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) :
+		algo_name(name_id),
+		base_algo(algorithm),
+		iter(iteration),
+		mem(memory),
+		mask(mem_mask)
 	{
 	}
 
@@ -187,35 +200,33 @@ constexpr uint32_t CN_DOUBLE_ITER = 0x100000;
 
 inline xmrstak_algo POW(xmrstak_algo_id algo_id)
 {
-	static std::array<xmrstak_algo, 18> pow = {{
-		{invalid_algo, invalid_algo},
+	static std::array<xmrstak_algo, 18> pow = {{{invalid_algo, invalid_algo},
 		{cryptonight, cryptonight, CN_ITER, CN_MEMORY},
-		{cryptonight_lite, cryptonight_lite, CN_ITER/2, CN_MEMORY/2},
+		{cryptonight_lite, cryptonight_lite, CN_ITER / 2, CN_MEMORY / 2},
 		{cryptonight_monero, cryptonight_monero, CN_ITER, CN_MEMORY},
-		{cryptonight_heavy, cryptonight_heavy, CN_ITER/2, CN_MEMORY*2},
-		{cryptonight_aeon, cryptonight_aeon, CN_ITER/2, CN_MEMORY/2},
-		{cryptonight_ipbc, cryptonight_ipbc, CN_ITER/2, CN_MEMORY/2}, // equal to cryptonight_aeon with a small tweak in the miner code
-		{cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, //equal to cryptonight_monero but with one tiny change
-		{cryptonight_masari, cryptonight_masari, CN_ITER/2, CN_MEMORY}, //equal to cryptonight_monero but with less iterations, used by masari
-		{cryptonight_haven, cryptonight_haven, CN_ITER/2, CN_MEMORY*2}, // equal to cryptonight_heavy with a small tweak
-		{cryptonight_bittube2, cryptonight_bittube2, CN_ITER/2, CN_MEMORY*2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
+		{cryptonight_heavy, cryptonight_heavy, CN_ITER / 2, CN_MEMORY * 2},
+		{cryptonight_aeon, cryptonight_aeon, CN_ITER / 2, CN_MEMORY / 2},
+		{cryptonight_ipbc, cryptonight_ipbc, CN_ITER / 2, CN_MEMORY / 2},		  // equal to cryptonight_aeon with a small tweak in the miner code
+		{cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY},		  //equal to cryptonight_monero but with one tiny change
+		{cryptonight_masari, cryptonight_masari, CN_ITER / 2, CN_MEMORY},		  //equal to cryptonight_monero but with less iterations, used by masari
+		{cryptonight_haven, cryptonight_haven, CN_ITER / 2, CN_MEMORY * 2},		  // equal to cryptonight_heavy with a small tweak
+		{cryptonight_bittube2, cryptonight_bittube2, CN_ITER / 2, CN_MEMORY * 2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
 		{cryptonight_monero_v8, cryptonight_monero_v8, CN_ITER, CN_MEMORY},
-		{cryptonight_superfast, cryptonight_superfast, CN_ITER/4, CN_MEMORY},
+		{cryptonight_superfast, cryptonight_superfast, CN_ITER / 4, CN_MEMORY},
 		{cryptonight_gpu, cryptonight_gpu, CN_GPU_ITER, CN_MEMORY, CN_GPU_MASK},
-		{cryptonight_conceal, cryptonight_conceal, CN_ITER/2, CN_MEMORY},
+		{cryptonight_conceal, cryptonight_conceal, CN_ITER / 2, CN_MEMORY},
 		{cryptonight_r_wow, cryptonight_r_wow, CN_ITER, CN_MEMORY},
 		{cryptonight_r, cryptonight_r, CN_ITER, CN_MEMORY},
-		{cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY}
-	}};
+		{cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY}}};
 
 	static std::array<xmrstak_algo, 4> derived_pow =
-	{{
-		{cryptonight_turtle, cryptonight_monero_v8, CN_ITER/8, CN_MEMORY/8, CN_TURTLE_MASK},
-		{cryptonight_v8_half, cryptonight_monero_v8, CN_ITER/2, CN_MEMORY},
-		{cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY},
-		{cryptonight_v8_double, cryptonight_monero_v8, CN_DOUBLE_ITER, CN_MEMORY}
-		// {cryptonight_derived}
-	}};
+		{{
+			{cryptonight_turtle, cryptonight_monero_v8, CN_ITER / 8, CN_MEMORY / 8, CN_TURTLE_MASK},
+			{cryptonight_v8_half, cryptonight_monero_v8, CN_ITER / 2, CN_MEMORY},
+			{cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY},
+			{cryptonight_v8_double, cryptonight_monero_v8, CN_DOUBLE_ITER, CN_MEMORY}
+			// {cryptonight_derived}
+		}};
 
 	if(algo_id < start_derived_algo_id)
 		return pow[algo_id];
diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp
index 52ef3f391..5b4332ba4 100644
--- a/xmrstak/backend/globalStates.cpp
+++ b/xmrstak/backend/globalStates.cpp
@@ -21,15 +21,14 @@
   *
   */
 
-#include "miner_work.hpp"
 #include "globalStates.hpp"
+#include "miner_work.hpp"
 
 #include <assert.h>
-#include <cmath>
 #include <chrono>
+#include <cmath>
 #include <cstring>
 
-
 namespace xmrstak
 {
 
diff --git a/xmrstak/backend/globalStates.hpp b/xmrstak/backend/globalStates.hpp
index d6966c4a2..a67580166 100644
--- a/xmrstak/backend/globalStates.hpp
+++ b/xmrstak/backend/globalStates.hpp
@@ -1,10 +1,10 @@
 #pragma once
 
 #include "xmrstak/backend/miner_work.hpp"
-#include "xmrstak/misc/environment.hpp"
-#include "xmrstak/misc/console.hpp"
 #include "xmrstak/backend/pool_data.hpp"
 #include "xmrstak/cpputil/read_write_lock.h"
+#include "xmrstak/misc/console.hpp"
+#include "xmrstak/misc/environment.hpp"
 
 #include <atomic>
 
@@ -17,7 +17,11 @@ struct globalStates
 	{
 		auto& env = environment::inst();
 		if(env.pglobalStates == nullptr)
-			env.pglobalStates = new globalStates;
+		{
+			std::unique_lock<std::mutex> lck(env.update);
+			if(env.pglobalStates == nullptr)
+				env.pglobalStates = new globalStates;
+		}
 		return *env.pglobalStates;
 	}
 
@@ -32,7 +36,7 @@ struct globalStates
 			nonce = iGlobalNonce.fetch_add(reserve_count);
 	}
 
-	void consume_work( miner_work& threadWork, uint64_t& currentJobId);
+	void consume_work(miner_work& threadWork, uint64_t& currentJobId);
 
 	miner_work oGlobalWork;
 	std::atomic<uint64_t> iGlobalJobNo;
@@ -41,8 +45,11 @@ struct globalStates
 	uint64_t iThreadCount;
 	size_t pool_id = invalid_pool_id;
 
-private:
-	globalStates() : iThreadCount(0), iGlobalJobNo(0), iConsumeCnt(0)
+  private:
+	globalStates() :
+		iThreadCount(0),
+		iGlobalJobNo(0),
+		iConsumeCnt(0)
 	{
 	}
 
diff --git a/xmrstak/backend/iBackend.hpp b/xmrstak/backend/iBackend.hpp
index 18411b79c..3ca598bdd 100644
--- a/xmrstak/backend/iBackend.hpp
+++ b/xmrstak/backend/iBackend.hpp
@@ -1,12 +1,13 @@
 #pragma once
 
 #include "xmrstak/backend/globalStates.hpp"
+#include "xmrstak/net/msgstruct.hpp"
 
 #include <atomic>
-#include <cstdint>
 #include <climits>
-#include <vector>
+#include <cstdint>
 #include <string>
+#include <vector>
 
 template <typename T, std::size_t N>
 constexpr std::size_t countof(T const (&)[N]) noexcept
@@ -16,35 +17,66 @@ constexpr std::size_t countof(T const (&)[N]) noexcept
 
 namespace xmrstak
 {
-	struct iBackend
+struct iBackend
+{
+
+	enum BackendType : uint32_t
 	{
+		UNKNOWN = 0u,
+		CPU = 1u,
+		AMD = 2u,
+		NVIDIA = 3u
+	};
 
-		enum BackendType : uint32_t { UNKNOWN = 0u, CPU = 1u, AMD = 2u, NVIDIA = 3u };
+	static const char* getName(const BackendType type)
+	{
+		const char* backendNames[] = {
+			"unknown",
+			"cpu",
+			"amd",
+			"nvidia"};
 
-		static const char* getName(const BackendType type)
-		{
-			const char* backendNames[] = {
-				"unknown",
-				"cpu",
-				"amd",
-				"nvidia"
-			};
-
-			uint32_t i = static_cast<uint32_t>(type);
-			if(i >= countof(backendNames))
-				i = 0;
-
-			return backendNames[i];
-		}
+		uint32_t i = static_cast<uint32_t>(type);
+		if(i >= countof(backendNames))
+			i = 0;
+
+		return backendNames[i];
+	}
 
-		std::atomic<uint64_t> iHashCount;
-		std::atomic<uint64_t> iTimestamp;
-		uint32_t iThreadNo;
-		BackendType backendType = UNKNOWN;
+	std::atomic<uint64_t> iHashCount;
+	std::atomic<uint64_t> iTimestamp;
+	uint32_t iThreadNo;
+	uint32_t iGpuIndex;
+	BackendType backendType = UNKNOWN;
+	uint64_t iLastStamp = get_timestamp_ms();
+	double avgHashPerMsec = 0.0;
 
-		iBackend() : iHashCount(0), iTimestamp(0)
+	void updateStats(uint64_t numNewHashes, size_t poolId)
+	{
+		uint64_t iStamp = get_timestamp_ms();
+		double timeDiff = static_cast<double>(iStamp - iLastStamp);
+		iLastStamp = iStamp;
+
+		if(poolId == 0)
 		{
+			// if dev pool is active interpolate the number of shares (avoid hash rate drops)
+			numNewHashes = static_cast<uint64_t>(avgHashPerMsec * timeDiff);
 		}
-	};
+		else
+		{
+			const double hashRatePerMs = timeDiff > 0.0 ? static_cast<double>(numNewHashes) / timeDiff : avgHashPerMsec; // zero-length interval: reuse the running average instead of dividing by 0 (inf/NaN would poison it permanently)
+			constexpr double averagingBias = 0.1; // weight of the newest sample in the exponential moving average
+			avgHashPerMsec = avgHashPerMsec * (1.0 - averagingBias) + hashRatePerMs * averagingBias;
+		}
+		iHashCount.fetch_add(numNewHashes, std::memory_order_relaxed);
+		iTimestamp.store(iStamp, std::memory_order_relaxed);
+	}
+
+	iBackend() :
+		iHashCount(0),
+		iTimestamp(0)
+	{
+	}
+};
 
 } // namespace xmrstak
diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp
index d0e5237f2..114f2db8e 100644
--- a/xmrstak/backend/miner_work.hpp
+++ b/xmrstak/backend/miner_work.hpp
@@ -2,95 +2,110 @@
 
 #include "xmrstak/backend/pool_data.hpp"
 
-#include <thread>
 #include <atomic>
-#include <mutex>
-#include <cstdint>
-#include <iostream>
 #include <cassert>
+#include <cstdint>
 #include <cstring>
+#include <iostream>
+#include <mutex>
+#include <thread>
 
 namespace xmrstak
 {
-	struct miner_work
+struct miner_work
+{
+	char sJobID[64];
+	uint8_t bWorkBlob[128];
+	uint32_t iWorkSize;
+	uint64_t iTarget;
+	bool bNiceHash;
+	bool bStall;
+	size_t iPoolId;
+	uint64_t iBlockHeight;
+	uint8_t* ref_ptr;
+
+	miner_work() :
+		iWorkSize(0),
+		bNiceHash(false),
+		bStall(true),
+		iPoolId(invalid_pool_id),
+		ref_ptr((uint8_t*)&iBlockHeight) {}
+
+	miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize,
+		uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) :
+		iWorkSize(iWorkSize),
+		iTarget(iTarget),
+		bNiceHash(bNiceHash),
+		bStall(false),
+		iPoolId(iPoolId),
+		iBlockHeight(iBlockHeiht),
+		ref_ptr((uint8_t*)&iBlockHeight)
 	{
-		char        sJobID[64];
-		uint8_t     bWorkBlob[128];
-		uint32_t    iWorkSize;
-		uint64_t    iTarget;
-		bool        bNiceHash;
-		bool        bStall;
-		size_t      iPoolId;
-		uint64_t	iBlockHeight;
-		uint8_t*	ref_ptr;
-
-		miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(invalid_pool_id), ref_ptr((uint8_t*)&iBlockHeight) { }
-
-		miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize,
-			uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : iWorkSize(iWorkSize),
-			iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight)
-		{
-			assert(iWorkSize <= sizeof(bWorkBlob));
-			memcpy(this->bWorkBlob, bWork, iWorkSize);
-			memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID));
-		}
-
-		miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget),
-			bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), ref_ptr((uint8_t*)&iBlockHeight)
-		{
-			assert(iWorkSize <= sizeof(bWorkBlob));
-			memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
-			memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID));
-		}
-
-		miner_work(miner_work const&) = delete;
-
-		miner_work& operator=(miner_work&& from)
-		{
-			assert(this != &from);
-
-			iBlockHeight = from.iBlockHeight;
-			iPoolId = from.iPoolId;
-			bStall = from.bStall;
-			iWorkSize = from.iWorkSize;
-			bNiceHash = from.bNiceHash;
-			iTarget = from.iTarget;
-
-			assert(iWorkSize <= sizeof(bWorkBlob));
-			memcpy(sJobID, from.sJobID, sizeof(sJobID));
-			memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
+		assert(iWorkSize <= sizeof(bWorkBlob));
+		memcpy(this->bWorkBlob, bWork, iWorkSize);
+		memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID));
+	}
+
+	miner_work(miner_work&& from) : // move-construct: takes every field from 'from'; ref_ptr is pointed at this object's own iBlockHeight
+		iWorkSize(from.iWorkSize),
+		iTarget(from.iTarget),
+		bNiceHash(from.bNiceHash), bStall(from.bStall), // bNiceHash was previously left uninitialized by this constructor
+		iPoolId(from.iPoolId),
+		iBlockHeight(from.iBlockHeight),
+		ref_ptr((uint8_t*)&iBlockHeight)
+	{
+		assert(iWorkSize <= sizeof(bWorkBlob));
+		memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
+		memcpy(this->sJobID, from.sJobID, sizeof(miner_work::sJobID)); // read the job id from the source object, not from our own uninitialized buffer
+	}
 
-			return *this;
-		}
+	miner_work(miner_work const&) = delete;
+
+	miner_work& operator=(miner_work&& from)
+	{
+		assert(this != &from);
 
-		miner_work& operator=(miner_work const& from)
-		{
-			assert(this != &from);
+		iBlockHeight = from.iBlockHeight;
+		iPoolId = from.iPoolId;
+		bStall = from.bStall;
+		iWorkSize = from.iWorkSize;
+		bNiceHash = from.bNiceHash;
+		iTarget = from.iTarget;
 
-			iBlockHeight = from.iBlockHeight;
-			iPoolId = from.iPoolId;
-			bStall = from.bStall;
-			iWorkSize = from.iWorkSize;
-			bNiceHash = from.bNiceHash;
-			iTarget = from.iTarget;
+		assert(iWorkSize <= sizeof(bWorkBlob));
+		memcpy(sJobID, from.sJobID, sizeof(sJobID));
+		memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
 
-			if(!ref_ptr)
-				return *this;
+		return *this;
+	}
 
-			for(size_t i=0; i <= 7 && iPoolId; i++)
-				ref_ptr[i] = from.ref_ptr[7-i];
+	miner_work& operator=(miner_work const& from)
+	{
+		assert(this != &from);
 
-			assert(iWorkSize <= sizeof(bWorkBlob));
-			memcpy(sJobID, from.sJobID, sizeof(sJobID));
-			memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
+		iBlockHeight = from.iBlockHeight;
+		iPoolId = from.iPoolId;
+		bStall = from.bStall;
+		iWorkSize = from.iWorkSize;
+		bNiceHash = from.bNiceHash;
+		iTarget = from.iTarget;
 
+		if(!ref_ptr)
 			return *this;
-		}
 
-		uint8_t getVersion() const
-		{
-			return bWorkBlob[0];
-		}
+		for(size_t i = 0; i <= 7 && iPoolId; i++)
+			ref_ptr[i] = from.ref_ptr[7 - i];
+
+		assert(iWorkSize <= sizeof(bWorkBlob));
+		memcpy(sJobID, from.sJobID, sizeof(sJobID));
+		memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
 
-	};
+		return *this;
+	}
+
+	uint8_t getVersion() const
+	{
+		return bWorkBlob[0];
+	}
+};
 } // namespace xmrstak
diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp
index f1bf75819..a7587cbe0 100644
--- a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp
+++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp
@@ -14,17 +14,17 @@
  *
  */
 
-#include <string>
-#include <sstream>
-#include <mutex>
 #include <cstring>
+#include <mutex>
 #include <nvrtc.h>
+#include <sstream>
+#include <string>
 #include <thread>
 
-#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp"
 #include "xmrstak/backend/cpu/crypto/variant4_random_math.h"
-#include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp"
 #include "xmrstak/cpputil/read_write_lock.h"
+#include "xmrstak/misc/console.hpp"
 
 namespace xmrstak
 {
@@ -33,80 +33,82 @@ namespace nvidia
 
 static std::string get_code(const V4_Instruction* code, int code_size)
 {
-    std::stringstream s;
+	std::stringstream s;
 
-    for (int i = 0; i < code_size; ++i)
-    {
-        const V4_Instruction inst = code[i];
+	for(int i = 0; i < code_size; ++i)
+	{
+		const V4_Instruction inst = code[i];
 
-        const uint32_t a = inst.dst_index;
-        const uint32_t b = inst.src_index;
+		const uint32_t a = inst.dst_index;
+		const uint32_t b = inst.src_index;
 
-        switch (inst.opcode)
-        {
-        case MUL:
-            s << 'r' << a << "*=r" << b << ';';
-            break;
+		switch(inst.opcode)
+		{
+		case MUL:
+			s << 'r' << a << "*=r" << b << ';';
+			break;
 
-        case ADD:
-            s << 'r' << a << "+=r" << b << '+' << inst.C << "U;";
-            break;
+		case ADD:
+			s << 'r' << a << "+=r" << b << '+' << inst.C << "U;";
+			break;
 
-        case SUB:
-            s << 'r' << a << "-=r" << b << ';';
-            break;
+		case SUB:
+			s << 'r' << a << "-=r" << b << ';';
+			break;
 
-        case ROR:
-            s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");";
-            break;
+		case ROR:
+			s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");";
+			break;
 
-        case ROL:
-            s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");";
-            break;
+		case ROL:
+			s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");";
+			break;
 
-        case XOR:
-            s << 'r' << a << "^=r" << b << ';';
-            break;
-        }
+		case XOR:
+			s << 'r' << a << "^=r" << b << ';';
+			break;
+		}
 
-        s << '\n';
-    }
+		s << '\n';
+	}
 
-    return s.str();
+	return s.str();
 }
 
 struct CacheEntry
 {
-    CacheEntry(xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, const std::vector<char>& ptx, const std::string& lowered_name) :
-        algo(algo),
-        height(height),
-        arch_major(arch_major),
-        arch_minor(arch_minor),
-        ptx(ptx),
-        lowered_name(lowered_name)
-    {}
-
-    xmrstak_algo algo;
-    uint64_t height;
-    int arch_major;
-    int arch_minor;
-    std::vector<char> ptx;
-    std::string lowered_name;
+	CacheEntry(xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, const std::vector<char>& ptx, const std::string& lowered_name) :
+		algo(algo),
+		height(height),
+		arch_major(arch_major),
+		arch_minor(arch_minor),
+		ptx(ptx),
+		lowered_name(lowered_name)
+	{
+	}
+
+	xmrstak_algo algo;
+	uint64_t height;
+	int arch_major;
+	int arch_minor;
+	std::vector<char> ptx;
+	std::string lowered_name;
 };
 
 struct BackgroundTaskBase
 {
-    virtual ~BackgroundTaskBase() {}
-    virtual void exec() = 0;
+	virtual ~BackgroundTaskBase() {}
+	virtual void exec() = 0;
 };
 
-template<typename T>
+template <typename T>
 struct BackgroundTask : public BackgroundTaskBase
 {
-    BackgroundTask(T&& func) : m_func(std::move(func)) {}
-    void exec() override { m_func(); }
+	BackgroundTask(T&& func) :
+		m_func(std::move(func)) {}
+	void exec() override { m_func(); }
 
-    T m_func;
+	T m_func;
 };
 
 static ::cpputil::RWLock CryptonightR_cache_mutex;
@@ -119,155 +121,165 @@ static std::thread* background_thread = nullptr;
 
 static void background_thread_proc()
 {
-    std::vector<BackgroundTaskBase*> tasks;
-    for (;;) {
-        tasks.clear();
-        {
-            std::lock_guard<std::mutex> g(background_tasks_mutex);
-            background_tasks.swap(tasks);
-        }
-
-        for (BackgroundTaskBase* task : tasks) {
-            task->exec();
-            delete task;
-        }
-
-        std::this_thread::sleep_for(std::chrono::milliseconds(500));
-    }
+	std::vector<BackgroundTaskBase*> tasks;
+	for(;;)
+	{
+		tasks.clear();
+		{
+			std::lock_guard<std::mutex> g(background_tasks_mutex);
+			background_tasks.swap(tasks);
+		}
+
+		for(BackgroundTaskBase* task : tasks)
+		{
+			task->exec();
+			delete task;
+		}
+
+		std::this_thread::sleep_for(std::chrono::milliseconds(500));
+	}
 }
 
-template<typename T>
+template <typename T>
 static void background_exec(T&& func)
 {
-    BackgroundTaskBase* task = new BackgroundTask<T>(std::move(func));
-
-    std::lock_guard<std::mutex> g(background_tasks_mutex);
-    background_tasks.push_back(task);
-    if (!background_thread) {
-        background_thread = new std::thread(background_thread_proc);
-    }
+	BackgroundTaskBase* task = new BackgroundTask<T>(std::move(func));
+
+	std::lock_guard<std::mutex> g(background_tasks_mutex);
+	background_tasks.push_back(task);
+	if(!background_thread)
+	{
+		background_thread = new std::thread(background_thread_proc);
+	}
 }
 
 static void CryptonightR_build_program(
-    std::vector<char>& ptx,
-    std::string& lowered_name,
-    const xmrstak_algo& algo,
-    uint64_t height,
-    uint32_t precompile_count,
-    int arch_major,
-    int arch_minor,
-    std::string source)
+	std::vector<char>& ptx,
+	std::string& lowered_name,
+	const xmrstak_algo& algo,
+	uint64_t height,
+	uint32_t precompile_count,
+	int arch_major,
+	int arch_minor,
+	std::string source)
 {
-    {
+	{
 		CryptonightR_cache_mutex.WriteLock();
 
-        // Remove old programs from cache
-        for (size_t i = 0; i < CryptonightR_cache.size();)
-        {
-            const CacheEntry& entry = CryptonightR_cache[i];
-            if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
-            {
-                printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
-                CryptonightR_cache[i] = std::move(CryptonightR_cache.back());
-                CryptonightR_cache.pop_back();
-            }
-            else
-            {
-                ++i;
-            }
-        }
+		// Remove old programs from cache
+		for(size_t i = 0; i < CryptonightR_cache.size();)
+		{
+			const CacheEntry& entry = CryptonightR_cache[i];
+			if((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
+			{
+				printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
+				CryptonightR_cache[i] = std::move(CryptonightR_cache.back());
+				CryptonightR_cache.pop_back();
+			}
+			else
+			{
+				++i;
+			}
+		}
 		CryptonightR_cache_mutex.UnLock();
-    }
+	}
 
-    ptx.clear();
-    ptx.reserve(65536);
+	ptx.clear();
+	ptx.reserve(65536);
 
-    std::lock_guard<std::mutex> g1(CryptonightR_build_mutex);
-    {
+	std::lock_guard<std::mutex> g1(CryptonightR_build_mutex);
+	{
 		CryptonightR_cache_mutex.ReadLock();
 
-        // Check if the cache already has this program (some other thread might have added it first)
-        for (const CacheEntry& entry : CryptonightR_cache)
-        {
-            if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor))
-            {
-                ptx = entry.ptx;
-                lowered_name = entry.lowered_name;
+		// Check if the cache already has this program (some other thread might have added it first)
+		for(const CacheEntry& entry : CryptonightR_cache)
+		{
+			if((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor))
+			{
+				ptx = entry.ptx;
+				lowered_name = entry.lowered_name;
 				CryptonightR_cache_mutex.UnLock();
-                return;
-            }
-        }
+				return;
+			}
+		}
 		CryptonightR_cache_mutex.UnLock();
-    }
-
-    nvrtcProgram prog;
-    nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.curt", 0, NULL, NULL);
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcCreateProgram failed: %s", nvrtcGetErrorString(result));
-        return;
-    }
-
-    result = nvrtcAddNameExpression(prog, "CryptonightR_phase2");
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcAddNameExpression failed: %s", nvrtcGetErrorString(result));
-        nvrtcDestroyProgram(&prog);
-        return;
-    }
-
-    char opt0[64];
-    sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor);
-
-    char opt1[64];
-    sprintf(opt1, "-DALGO=%d", static_cast<int>(algo.Id()));
-
-	const char* opts[2] = { opt0, opt1 };
-
-    result = nvrtcCompileProgram(prog, 2, opts);
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcCompileProgram failed: %s", nvrtcGetErrorString(result));
-
-        size_t logSize;
-        if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) {
-            char *log = new char[logSize];
-            if (nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS) {
-                printer::inst()->print_msg(L0, "Program compile log: %s", log);
-            }
-            delete[]log;
-        }
-        nvrtcDestroyProgram(&prog);
-        return;
-    }
-
-
-    const char* name;
-    result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name);
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcGetLoweredName failed: %s", nvrtcGetErrorString(result));
-        nvrtcDestroyProgram(&prog);
-        return;
-    }
-
-    size_t ptxSize;
-    result = nvrtcGetPTXSize(prog, &ptxSize);
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcGetPTXSize failed: %s", nvrtcGetErrorString(result));
-        nvrtcDestroyProgram(&prog);
-        return;
-    }
-
-    ptx.resize(ptxSize);
-    result = nvrtcGetPTX(prog, ptx.data());
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcGetPTX failed: %s", nvrtcGetErrorString(result));
-        nvrtcDestroyProgram(&prog);
-        return;
-    }
-
-    lowered_name = name;
-
-    nvrtcDestroyProgram(&prog);
-
-    printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height);
+	}
+
+	nvrtcProgram prog;
+	nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.curt", 0, NULL, NULL);
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcCreateProgram failed: %s", nvrtcGetErrorString(result));
+		return;
+	}
+
+	result = nvrtcAddNameExpression(prog, "CryptonightR_phase2");
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcAddNameExpression failed: %s", nvrtcGetErrorString(result));
+		nvrtcDestroyProgram(&prog);
+		return;
+	}
+
+	char opt0[64];
+	sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor);
+
+	char opt1[64];
+	sprintf(opt1, "-DALGO=%d", static_cast<int>(algo.Id()));
+
+	const char* opts[2] = {opt0, opt1};
+
+	result = nvrtcCompileProgram(prog, 2, opts);
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcCompileProgram failed: %s", nvrtcGetErrorString(result));
+
+		size_t logSize;
+		if(nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS)
+		{
+			char* log = new char[logSize];
+			if(nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS)
+			{
+				printer::inst()->print_msg(L0, "Program compile log: %s", log);
+			}
+			delete[] log;
+		}
+		nvrtcDestroyProgram(&prog);
+		return;
+	}
+
+	const char* name;
+	result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name);
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcGetLoweredName failed: %s", nvrtcGetErrorString(result));
+		nvrtcDestroyProgram(&prog);
+		return;
+	}
+
+	size_t ptxSize;
+	result = nvrtcGetPTXSize(prog, &ptxSize);
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcGetPTXSize failed: %s", nvrtcGetErrorString(result));
+		nvrtcDestroyProgram(&prog);
+		return;
+	}
+
+	ptx.resize(ptxSize);
+	result = nvrtcGetPTX(prog, ptx.data());
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcGetPTX failed: %s", nvrtcGetErrorString(result));
+		nvrtcDestroyProgram(&prog);
+		return;
+	}
+
+	lowered_name = name;
+
+	nvrtcDestroyProgram(&prog);
+
+	printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height);
 
 	CryptonightR_cache_mutex.WriteLock();
 	CryptonightR_cache.emplace_back(algo, height, arch_major, arch_minor, ptx, lowered_name);
@@ -276,62 +288,72 @@ static void CryptonightR_build_program(
 
+/** Fetch the PTX and lowered kernel name of the CryptonightR program for `height`.
+ *
+ * If `background` is true the lookup is handed to background_exec() and this call
+ * returns at once without modifying `ptx` or `lowered_name`. Otherwise the program
+ * cache is searched first and CryptonightR_build_program() is called on a miss.
+ * `ptx` is left empty when the source template marker or the algorithm is not recognised.
+ */
 void CryptonightR_get_program(std::vector<char>& ptx, std::string& lowered_name, const xmrstak_algo algo, uint64_t height, uint32_t precompile_count, int arch_major, int arch_minor, bool background)
 {
-    if (background) {
-        background_exec([=]() { std::vector<char> tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, precompile_count, arch_major, arch_minor, false); });
-        return;
-    }
-
-    ptx.clear();
-
-    const char* source_code_template =
-        #include "nvcc_code/cuda_cryptonight_r.curt"
-    ;
-    const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH";
-    const char* offset = strstr(source_code_template, include_name);
-    if (!offset)
-    {
-        printer::inst()->print_msg(L0, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cuda_cryptonight_r.curt");
-        return;
-    }
-
-    V4_Instruction code[256];
-    int code_size;
-    switch (algo.Id())
-    {
-    case cryptonight_r_wow:
-        code_size = v4_random_math_init<cryptonight_r_wow>(code, height);
-        break;
-    case cryptonight_r:
-        code_size = v4_random_math_init<cryptonight_r>(code, height);
-        break;
-        printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo);
-        return;
-    }
-
-    std::string source_code(source_code_template, offset);
-    source_code.append(get_code(code, code_size));
-    source_code.append(offset + sizeof(include_name) - 1);
-
-    {
+	if(background)
+	{
+		background_exec([=]() { std::vector<char> tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, precompile_count, arch_major, arch_minor, false); });
+		return;
+	}
+
+	ptx.clear();
+
+	const char* source_code_template =
+#include "nvcc_code/cuda_cryptonight_r.curt"
+		;
+	const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH";
+	const char* offset = strstr(source_code_template, include_name);
+	if(!offset)
+	{
+		printer::inst()->print_msg(L0, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cuda_cryptonight_r.curt");
+		return;
+	}
+
+	V4_Instruction code[256];
+	int code_size;
+	switch(algo.Id())
+	{
+	case cryptonight_r_wow:
+		code_size = v4_random_math_init<cryptonight_r_wow>(code, height);
+		break;
+	case cryptonight_r:
+		code_size = v4_random_math_init<cryptonight_r>(code, height);
+		break;
+	default:
+		// Without this label the error path was unreachable and code_size was read uninitialized.
+		printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", static_cast<int>(algo.Id()));
+		return;
+	}
+
+	std::string source_code(source_code_template, offset);
+	source_code.append(get_code(code, code_size));
+	source_code.append(offset + sizeof(include_name) - 1);
+
+	{
 		CryptonightR_cache_mutex.ReadLock();
 
-        // Check if the cache has this program
-        for (const CacheEntry& entry : CryptonightR_cache)
-        {
-            if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor))
-            {
-                printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height);
-                ptx = entry.ptx;
-                lowered_name = entry.lowered_name;
+		// Check if the cache has this program
+		for(const CacheEntry& entry : CryptonightR_cache)
+		{
+			if((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor))
+			{
+				printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height);
+				ptx = entry.ptx;
+				lowered_name = entry.lowered_name;
 				CryptonightR_cache_mutex.UnLock();
-                return;
-            }
-        }
+				return;
+			}
+		}
 		CryptonightR_cache_mutex.UnLock();
-    }
+	}
 
-    CryptonightR_build_program(ptx, lowered_name, algo, height, precompile_count, arch_major, arch_minor, source_code);
+	CryptonightR_build_program(ptx, lowered_name, algo, height, precompile_count, arch_major, arch_minor, source_code);
 }
 
+} // namespace nvidia
 } // namespace xmrstak
-} //namespace nvidia
diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp
index c3d8827b0..30abf2e59 100644
--- a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp
+++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp
@@ -19,9 +19,8 @@
 #include "xmrstak/backend/cryptonight.hpp"
 
 #include <stdint.h>
-#include <vector>
 #include <string>
-
+#include <vector>
 
 namespace xmrstak
 {
@@ -29,9 +28,7 @@ namespace nvidia
 {
 
 void CryptonightR_get_program(std::vector<char>& ptx, std::string& lowered_name,
-	const xmrstak_algo algo, uint64_t height,  uint32_t precompile_count, int arch_major, int arch_minor, bool background = false);
-
+	const xmrstak_algo algo, uint64_t height, uint32_t precompile_count, int arch_major, int arch_minor, bool background = false);
 
+} // namespace nvidia
 } // namespace xmrstak
-} //namespace nvidia
-
diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp
index 2755e03d2..bf195f768 100644
--- a/xmrstak/backend/nvidia/autoAdjust.hpp
+++ b/xmrstak/backend/nvidia/autoAdjust.hpp
@@ -3,17 +3,16 @@
 
 #include "autoAdjust.hpp"
 
-#include "nvcc_code/cryptonight.hpp"
 #include "jconf.hpp"
-#include "xmrstak/misc/console.hpp"
+#include "nvcc_code/cryptonight.hpp"
 #include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/params.hpp"
 
-#include <vector>
 #include <cstdio>
 #include <sstream>
 #include <string>
-
+#include <vector>
 
 namespace xmrstak
 {
@@ -22,11 +21,9 @@ namespace nvidia
 
 class autoAdjust
 {
-public:
-
+  public:
 	autoAdjust()
 	{
-
 	}
 
 	/** print the adjusted values if needed
@@ -63,45 +60,69 @@ class autoAdjust
 				nvidCtxVec.push_back(ctx);
 			else
 				printer::inst()->print_msg(L0, "WARNING: NVIDIA setup failed for GPU %d.\n", i);
-
 		}
 
 		generateThreadConfig();
 		return true;
-
 	}
 
-private:
-
+  private:
 	void generateThreadConfig()
 	{
 		// load the template of the backend config into a char variable
-		const char *tpl =
-			#include "./config.tpl"
-		;
+		const char* tpl =
+#include "./config.tpl"
+			;
 
 		configEditor configTpl{};
-		configTpl.set( std::string(tpl) );
+		configTpl.set(std::string(tpl));
 
 		constexpr size_t byte2mib = 1024u * 1024u;
 		std::string conf;
 		for(auto& ctx : nvidCtxVec)
 		{
+			// An empty nvidiaGpus list enables every device; otherwise only the listed ids are enabled.
+			const std::string enabledGpus = params::inst().nvidiaGpus;
+			bool enabled = true;
+			if(!enabledGpus.empty())
+			{
+				enabled = false;
+				std::stringstream ss(enabledGpus);
+				int i = -1;
+				while(ss >> i)
+				{
+					if(i == ctx.device_id)
+					{
+						enabled = true;
+						break;
+					}
+					// skip ',' and ' ' separators so the next extraction starts at the following id
+					while(ss.peek() == ',' || ss.peek() == ' ')
+						ss.ignore();
+				}
+			}
+
 			if(ctx.device_threads * ctx.device_blocks > 0)
 			{
+				if(!enabled)
+					conf += "/* Disabled\n";
+
 				conf += std::string("  // gpu: ") + ctx.name + " architecture: " + std::to_string(ctx.device_arch[0] * 10 + ctx.device_arch[1]) + "\n";
-				conf += std::string("  //      memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/"  + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n";
+				conf += std::string("  //      memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/" + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n";
 				conf += std::string("  //      smx: ") + std::to_string(ctx.device_mpcount) + "\n";
 				conf += std::string("  { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" +
-					"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
-					"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
-					"    \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
-					"    \"mem_mode\" : 1,\n" +
-					"  },\n";
+						"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
+						"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
+						"    \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
+						"    \"mem_mode\" : 1,\n" +
+						"  },\n";
+
+				if(!enabled)
+					conf += "*/\n";
 			}
 		}
 
-		configTpl.replace("GPUCONFIG",conf);
+		configTpl.replace("GPUCONFIG", conf);
 		configTpl.write(params::inst().configFileNVIDIA);
 		printer::inst()->print_msg(L0, "NVIDIA: GPU configuration stored in file '%s'", params::inst().configFileNVIDIA.c_str());
 	}
diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp
index 6c443343b..1cd113c4d 100644
--- a/xmrstak/backend/nvidia/jconf.cpp
+++ b/xmrstak/backend/nvidia/jconf.cpp
@@ -22,8 +22,8 @@
   */
 
 #include "jconf.hpp"
-#include "xmrstak/misc/jext.hpp"
 #include "xmrstak/misc/console.hpp"
+#include "xmrstak/misc/jext.hpp"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -36,7 +36,6 @@
 #include <cpuid.h>
 #endif
 
-
 namespace xmrstak
 {
 namespace nvidia
@@ -47,9 +46,13 @@ using namespace rapidjson;
 /*
  * This enum needs to match index in oConfigValues, otherwise we will get a runtime error
  */
-enum configEnum { aGpuThreadsConf };
+enum configEnum
+{
+	aGpuThreadsConf
+};
 
-struct configVal {
+struct configVal
+{
 	configEnum iName;
 	const char* sName;
 	Type iType;
@@ -58,8 +61,7 @@ struct configVal {
 // Same order as in configEnum, as per comment above
 // kNullType means any type
 configVal oConfigValues[] = {
-	{ aGpuThreadsConf, "gpu_threads_conf", kNullType }
-};
+	{aGpuThreadsConf, "gpu_threads_conf", kNullType}};
 
 inline bool checkType(Type have, Type want)
 {
@@ -75,9 +77,7 @@ inline bool checkType(Type have, Type want)
 		return false;
 }
 
-constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
-
-
+constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0]));
 
 struct jconf::opaque_private
 {
@@ -89,7 +89,6 @@ struct jconf::opaque_private
 	}
 };
 
-
 bool jconf::NeedsAutoconf()
 {
 	return !prv->configValues[aGpuThreadsConf]->IsArray();
@@ -110,7 +109,7 @@ size_t jconf::GetGPUThreadCount()
 		return 0;
 }
 
-bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
+bool jconf::GetGPUThreadConfig(size_t id, thd_cfg& cfg)
 {
 	if(!prv->configValues[aGpuThreadsConf]->IsArray())
 		return false;
@@ -170,7 +169,6 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 		return false;
 	}
 
-
 	cfg.id = gid->GetInt();
 	cfg.blocks = blocks->GetInt();
 	cfg.threads = threads->GetInt();
@@ -178,7 +176,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	cfg.bsleep = bsleep->GetInt();
 	cfg.syncMode = syncMode->GetInt();
 	cfg.memMode = memMode->GetInt();
-	
+
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt();
 	else
@@ -189,22 +187,22 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 
 bool jconf::parse_config(const char* sFilename)
 {
-	FILE * pFile;
-	char * buffer;
+	FILE* pFile;
+	char* buffer;
 	size_t flen;
 
 	pFile = fopen(sFilename, "rb");
-	if (pFile == NULL)
+	if(pFile == NULL)
 	{
 		printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename);
 		return false;
 	}
 
-	fseek(pFile,0,SEEK_END);
+	fseek(pFile, 0, SEEK_END);
 	flen = ftell(pFile);
 	rewind(pFile);
 
-	if(flen >= 64*1024)
+	if(flen >= 64 * 1024)
 	{
 		fclose(pFile);
 		printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename);
@@ -219,7 +217,7 @@ bool jconf::parse_config(const char* sFilename)
 	}
 
 	buffer = (char*)malloc(flen + 3);
-	if(fread(buffer+1, flen, 1, pFile) != 1)
+	if(fread(buffer + 1, flen, 1, pFile) != 1)
 	{
 		free(buffer);
 		fclose(pFile);
@@ -241,7 +239,7 @@ bool jconf::parse_config(const char* sFilename)
 	buffer[flen] = '}';
 	buffer[flen + 1] = '\0';
 
-	prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2);
+	prv->jsonDoc.Parse<kParseCommentsFlag | kParseTrailingCommasFlag>(buffer, flen + 2);
 	free(buffer);
 
 	if(prv->jsonDoc.HasParseError())
@@ -251,7 +249,6 @@ bool jconf::parse_config(const char* sFilename)
 		return false;
 	}
 
-
 	if(!prv->jsonDoc.IsObject())
 	{ //This should never happen as we created the root ourselves
 		printer::inst()->print_msg(L0, "Invalid config file '%s'. No root?", sFilename);
@@ -262,7 +259,7 @@ bool jconf::parse_config(const char* sFilename)
 	{
 		if(oConfigValues[i].iName != i)
 		{
-			printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order. %s",oConfigValues[i].sName);
+			printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order. %s", oConfigValues[i].sName);
 			return false;
 		}
 
diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp
index 40b72f880..e924c75a9 100644
--- a/xmrstak/backend/nvidia/jconf.hpp
+++ b/xmrstak/backend/nvidia/jconf.hpp
@@ -1,7 +1,7 @@
 #pragma once
+#include "xmrstak/params.hpp"
 #include <stdlib.h>
 #include <string>
-#include "xmrstak/params.hpp"
 
 namespace xmrstak
 {
@@ -10,16 +10,18 @@ namespace nvidia
 
 class jconf
 {
-public:
+  public:
 	static jconf* inst()
 	{
-		if (oInst == nullptr) oInst = new jconf;
+		if(oInst == nullptr)
+			oInst = new jconf;
 		return oInst;
 	};
 
 	bool parse_config(const char* sFilename = params::inst().configFileNVIDIA.c_str());
 
-	struct thd_cfg {
+	struct thd_cfg
+	{
 		uint32_t id;
 		uint32_t blocks;
 		uint32_t threads;
@@ -36,17 +38,16 @@ class jconf
 
 	size_t GetGPUThreadCount();
 
-	bool GetGPUThreadConfig(size_t id, thd_cfg &cfg);
+	bool GetGPUThreadConfig(size_t id, thd_cfg& cfg);
 
 	bool NeedsAutoconf();
 
-private:
+  private:
 	jconf();
 	static jconf* oInst;
 
 	struct opaque_private;
 	opaque_private* prv;
-
 };
 
 } // namespace nvidia
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 80615d7a3..c65bba162 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -23,23 +23,23 @@
 
 #include "minethd.hpp"
 #include "autoAdjust.hpp"
-#include "xmrstak/misc/console.hpp"
-#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
 #include "xmrstak/backend/cpu/crypto/cryptonight.h"
+#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
+#include "xmrstak/backend/cpu/hwlocMemory.hpp"
 #include "xmrstak/backend/cpu/minethd.hpp"
-#include "xmrstak/params.hpp"
-#include "xmrstak/misc/executor.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/misc/environment.hpp"
-#include "xmrstak/backend/cpu/hwlocMemory.hpp"
-#include "xmrstak/backend/cryptonight.hpp"
+#include "xmrstak/misc/executor.hpp"
 #include "xmrstak/misc/utility.hpp"
+#include "xmrstak/params.hpp"
 
 #include <assert.h>
-#include <cmath>
+#include <bitset>
 #include <chrono>
+#include <cmath>
 #include <thread>
-#include <bitset>
 #include <vector>
 
 #ifndef USE_PRECOMPILED_HEADERS
@@ -47,8 +47,8 @@
 #include <direct.h>
 #include <windows.h>
 #else
-#include <sys/types.h>
 #include <dlfcn.h>
+#include <sys/types.h>
 #endif
 #include <iostream>
 #endif
@@ -59,9 +59,9 @@ namespace nvidia
 {
 
 #ifdef WIN32
-	HINSTANCE lib_handle;
+HINSTANCE lib_handle;
 #else
-	void *lib_handle;
+void* lib_handle;
 #endif
 
 minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
@@ -70,6 +70,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
 	oWork = pWork;
 	bQuit = 0;
 	iThreadNo = (uint8_t)iNo;
+	this->iGpuIndex = cfg.id;
 	iJobNo = 0;
 
 	ctx.device_id = (int)cfg.id;
@@ -81,6 +82,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
 	ctx.memMode = cfg.memMode;
 	this->affinity = cfg.cpu_aff;
 
+	std::unique_lock<std::mutex> lck(thd_aff_set);
 	std::future<void> numa_guard = numa_promise.get_future();
 	thread_work_guard = thread_work_promise.get_future();
 
@@ -91,33 +93,32 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
 	 * without concurrent threads (CUDA driver is less occupied).
 	 */
 	numa_guard.wait();
-}
 
-void minethd::start_mining()
-{
-	thread_work_promise.set_value();
 	if(this->affinity >= 0) //-1 means no affinity
 		if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity))
 			printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
 }
 
+void minethd::start_mining()
+{
+	thread_work_promise.set_value();
+}
 
 bool minethd::self_test()
 {
 	return true;
 }
 
-
 extern "C"
 {
 #ifdef WIN32
-__declspec(dllexport)
+	__declspec(dllexport)
 #endif
-std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env)
-{
-	environment::inst(&env);
-	return nvidia::minethd::thread_starter(threadOffset, pWork);
-}
+		std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env)
+	{
+		environment::inst(&env);
+		return nvidia::minethd::thread_starter(threadOffset, pWork);
+	}
 } // extern "C"
 
 std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_work& pWork)
@@ -141,12 +142,12 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	int deviceCount = 0;
 	if(cuda_get_devicecount(&deviceCount) != 1)
 	{
-		std::cout<<"WARNING: NVIDIA no device found"<<std::endl;
+		std::cout << "WARNING: NVIDIA no device found" << std::endl;
 		return pvThreads;
 	}
 	else
 	{
-		std::cout<<"NVIDIA: found "<< deviceCount <<" potential device's"<<std::endl;
+		std::cout << "NVIDIA: found " << deviceCount << " potential device's" << std::endl;
 	}
 
 	size_t i, n = jconf::inst()->GetGPUThreadCount();
@@ -155,7 +156,7 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	cuInit(0);
 
 	jconf::thd_cfg cfg;
-	for (i = 0; i < n; i++)
+	for(i = 0; i < n; i++)
 	{
 		jconf::inst()->GetGPUThreadConfig(i, cfg);
 
@@ -172,10 +173,9 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 
 		minethd* thd = new minethd(pWork, i + threadOffset, cfg);
 		pvThreads->push_back(thd);
-
 	}
 
-	for (i = 0; i < n; i++)
+	for(i = 0; i < n; i++)
 	{
 		static_cast<minethd*>((*pvThreads)[i])->start_mining();
 	}
@@ -196,12 +196,12 @@ void minethd::work_main()
 
 	// numa memory bind and gpu memory is initialized
 	numa_promise.set_value();
-
+	std::unique_lock<std::mutex> lck(thd_aff_set);
+	lck.unlock();
 	std::this_thread::yield();
 	// wait until all NVIDIA devices are initialized
 	thread_work_guard.wait();
 
-	uint64_t iCount = 0;
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
 
@@ -216,16 +216,16 @@ void minethd::work_main()
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
-	while (bQuit == 0)
+	while(bQuit == 0)
 	{
-		if (oWork.bStall)
+		if(oWork.bStall)
 		{
 			/* We are stalled here because the executor didn't find a job for us yet,
 			 * either because of network latency, or a socket problem. Since we are
 			 * raison d'etre of this software it us sensible to just wait until we have something
 			 */
 
-			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+			while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 				std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
 			globalStates::inst().consume_work(oWork, iJobNo);
@@ -285,8 +285,8 @@ void minethd::work_main()
 			for(size_t i = 0; i < foundCount; i++)
 			{
 
-				uint8_t	bWorkBlob[128];
-				uint8_t	bResult[32];
+				uint8_t bWorkBlob[128];
+				uint8_t bResult[32];
 
 				memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
 				memset(bResult, 0, sizeof(job_result::bResult));
@@ -294,19 +294,14 @@ void minethd::work_main()
 				*(uint32_t*)(bWorkBlob + 39) = foundNonce[i];
 
 				cpu_ctx->hash_fn(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
-				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
+				if((*((uint64_t*)(bResult + 24))) < oWork.iTarget)
 					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
 					executor::inst()->push_event(ex_event("NVIDIA Invalid Result", ctx.device_id, oWork.iPoolId));
 			}
 
-			iCount += h_per_round;
 			iNonce += h_per_round;
-
-			using namespace std::chrono;
-			uint64_t iStamp = get_timestamp_ms();
-			iHashCount.store(iCount, std::memory_order_relaxed);
-			iTimestamp.store(iStamp, std::memory_order_relaxed);
+			updateStats(h_per_round, oWork.iPoolId);
 			std::this_thread::yield();
 		}
 
@@ -314,5 +309,5 @@ void minethd::work_main()
 	}
 }
 
+} // namespace nvidia
 } // namespace xmrstak
-} //namespace nvidia
diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp
index 3863c93e8..bbbc7b6ee 100644
--- a/xmrstak/backend/nvidia/minethd.hpp
+++ b/xmrstak/backend/nvidia/minethd.hpp
@@ -1,19 +1,18 @@
 #pragma once
 
-#include "xmrstak/jconf.hpp"
 #include "jconf.hpp"
 #include "nvcc_code/cryptonight.hpp"
+#include "xmrstak/jconf.hpp"
 
 #include "xmrstak/backend/cpu/minethd.hpp"
 #include "xmrstak/backend/iBackend.hpp"
 #include "xmrstak/misc/environment.hpp"
 
+#include <atomic>
+#include <future>
 #include <iostream>
 #include <thread>
-#include <atomic>
 #include <vector>
-#include <future>
-
 
 namespace xmrstak
 {
@@ -22,12 +21,11 @@ namespace nvidia
 
 class minethd : public iBackend
 {
-public:
-
+  public:
 	static std::vector<iBackend*>* thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool self_test();
 
-private:
+  private:
 	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&);
 
 	minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg);
@@ -44,6 +42,7 @@ class minethd : public iBackend
 
 	std::promise<void> numa_promise;
 	std::promise<void> thread_work_promise;
+	std::mutex thd_aff_set;
 
 	// block thread until all NVIDIA GPUs are initialized
 	std::future<void> thread_work_guard;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index 906701893..29e29d12c 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -3,35 +3,37 @@
 #include <stdint.h>
 #include <string>
 
-#include "xmrstak/jconf.hpp"
 #include "xmrstak/backend/cryptonight.hpp"
+#include "xmrstak/jconf.hpp"
 
 #include <cuda.h>
 
-typedef struct {
+typedef struct
+{
 	int device_id;
-	const char *device_name;
+	const char* device_name;
 	int device_arch[2];
 	int device_mpcount;
 	int device_blocks;
 	int device_threads;
 	int device_bfactor;
 	int device_bsleep;
+	int device_maxThreadsPerBlock;
 	int syncMode;
 	bool memMode;
 
-	uint32_t *d_input;
+	uint32_t* d_input;
 	uint32_t inputlen;
-	uint32_t *d_result_count;
-	uint32_t *d_result_nonce;
-	uint32_t *d_long_state;
-	uint32_t *d_ctx_state;
-	uint32_t *d_ctx_state2;
-	uint32_t *d_ctx_a;
-	uint32_t *d_ctx_b;
-	uint32_t *d_ctx_key1;
-	uint32_t *d_ctx_key2;
-	uint32_t *d_ctx_text;
+	uint32_t* d_result_count;
+	uint32_t* d_result_nonce;
+	uint32_t* d_long_state;
+	uint32_t* d_ctx_state;
+	uint32_t* d_ctx_state2;
+	uint32_t* d_ctx_a;
+	uint32_t* d_ctx_b;
+	uint32_t* d_ctx_key1;
+	uint32_t* d_ctx_key2;
+	uint32_t* d_ctx_text;
 	std::string name;
 	size_t free_device_memory;
 	size_t total_device_memory;
@@ -43,19 +45,20 @@ typedef struct {
 	xmrstak_algo cached_algo = {xmrstak_algo_id::invalid_algo};
 } nvid_ctx;
 
-extern "C" {
+extern "C"
+{
 
-/** get device count
+	/** get device count
  *
  * @param deviceCount[out] cuda device count
  * @return error code: 0 == error is occurred, 1 == no error
  */
-int cuda_get_devicecount( int* deviceCount);
-int cuda_get_deviceinfo(nvid_ctx *ctx);
-int cryptonight_extra_cpu_init(nvid_ctx *ctx);
-void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len);
-void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo);
-void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo);
+	int cuda_get_devicecount(int* deviceCount);
+	int cuda_get_deviceinfo(nvid_ctx* ctx);
+	int cryptonight_extra_cpu_init(nvid_ctx* ctx);
+	void cryptonight_extra_cpu_set_data(nvid_ctx* ctx, const void* data, uint32_t len);
+	void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo);
+	void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t* resnonce, const xmrstak_algo& miner_algo);
 }
 
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, uint32_t startNonce, uint64_t chain_height);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
index 199025635..385afb9ec 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
@@ -3,8 +3,270 @@
 
 #include <stdint.h>
 
-#define N_COLS          4
-#define WPOLY           0x011b
+#include "cuda_extra.hpp"
+
+#define N_COLS 4
+#define WPOLY 0x011b
+
+static __constant__ uint32_t d_t_fn256[256 * 32] =
+{
+	0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U,
+	0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U,
+	0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU,
+	0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U,
+	0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU,
+	0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U,
+	0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU,
+	0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U,
+	0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U,
+	0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U,
+	0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU,
+	0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U,
+	0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U,
+	0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U,
+	0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU,
+	0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU,
+	0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU,
+	0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU,
+	0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U,
+	0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU,
+	0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU,
+	0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U,
+	0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU,
+	0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU,
+	0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U,
+	0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U,
+	0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU,
+	0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U,
+	0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U,
+	0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U,
+	0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U,
+	0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU,
+	0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U,
+	0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U,
+	0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU,
+	0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU,
+	0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU,
+	0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU,
+	0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U,
+	0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U,
+	0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U,
+	0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U,
+	0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U,
+	0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U,
+	0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U,
+	0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU,
+	0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U,
+	0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU,
+	0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U,
+	0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U,
+	0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U,
+	0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU,
+	0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U,
+	0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U,
+	0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU,
+	0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU,
+	0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU,
+	0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U,
+	0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU,
+	0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU,
+	0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU,
+	0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU,
+	0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU,
+	0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU,
+	0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U,
+	0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU,
+	0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U,
+	0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U,
+	0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U,
+	0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU,
+	0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U,
+	0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU,
+	0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U,
+	0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U,
+	0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U,
+	0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU,
+	0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U,
+	0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU,
+	0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU,
+	0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U,
+	0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U,
+	0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U,
+	0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U,
+	0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U,
+	0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U,
+	0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U,
+	0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U,
+	0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U,
+	0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U,
+	0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU,
+	0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U,
+	0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U,
+	0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U,
+	0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U,
+	0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U,
+	0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U,
+	0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU,
+	0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U,
+	0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU,
+	0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU,
+	0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U,
+	0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU,
+	0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U,
+	0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U,
+	0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU,
+	0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U,
+	0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U,
+	0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU,
+	0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U,
+	0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U,
+	0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U,
+	0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU,
+	0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U,
+	0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU,
+	0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U,
+	0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U,
+	0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU,
+	0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U,
+	0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U,
+	0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U,
+	0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U,
+	0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U,
+	0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU,
+	0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U,
+	0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U,
+	0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U,
+	0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU,
+	0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU,
+	0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U,
+	0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U,
+	0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U,
+	0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U,
+	0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU,
+	0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U,
+	0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U,
+	0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU,
+	0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U,
+	0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U,
+	0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU,
+	0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU,
+	0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U,
+	0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU,
+	0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U,
+	0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U,
+	0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U,
+	0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U,
+	0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU,
+	0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U,
+	0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U,
+	0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U,
+	0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU,
+	0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU,
+	0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU,
+	0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U,
+	0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU,
+	0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U,
+	0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U,
+	0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU,
+	0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U,
+	0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU,
+	0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU,
+	0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U,
+	0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U,
+	0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U,
+	0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U,
+	0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU,
+	0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U,
+	0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U,
+	0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU,
+	0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU,
+	0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U,
+	0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U,
+	0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U,
+	0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U,
+	0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U,
+	0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U,
+	0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U,
+	0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU,
+	0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU,
+	0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU,
+	0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U,
+	0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U,
+	0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU,
+	0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U,
+	0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U,
+	0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU,
+	0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U,
+	0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU,
+	0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU,
+	0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U,
+	0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U,
+	0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U,
+	0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU,
+	0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U,
+	0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU,
+	0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU,
+	0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U,
+	0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U,
+	0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U,
+	0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U,
+	0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU,
+	0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U,
+	0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U,
+	0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU,
+	0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U,
+	0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U,
+	0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU,
+	0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU,
+	0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U,
+	0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU,
+	0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U,
+	0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU,
+	0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U,
+	0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U,
+	0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U,
+	0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU,
+	0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U,
+	0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU,
+	0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU,
+	0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U,
+	0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U,
+	0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U,
+	0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU,
+	0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U,
+	0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U,
+	0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU,
+	0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU,
+	0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U,
+	0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U,
+	0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U,
+	0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U,
+	0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U,
+	0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU,
+	0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU,
+	0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U,
+	0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U,
+	0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U,
+	0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU,
+	0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U,
+	0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U,
+	0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U,
+	0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U,
+	0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U,
+	0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU,
+	0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U,
+	0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U,
+	0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U,
+	0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U,
+	0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U,
+	0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U,
+	0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU,
+	0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU,
+	0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU,
+	0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U,
+	0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU,
+0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU
+};
 
 static __constant__ uint32_t d_t_fn[1024] =
 {
@@ -263,27 +525,45 @@ static __constant__ uint32_t d_t_fn[1024] =
 	0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU,
 	0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U,
 	0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU,
-	0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U
-};
+	0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U};
+
+#define t_fn32(x) (sharedMemory[(x) * 32])
 
-#define t_fn0(x) (sharedMemory[      (x)])
+#define t_fn0(x) (sharedMemory[(x)])
 #define t_fn1(x) (sharedMemory[256 + (x)])
 #define t_fn2(x) (sharedMemory[512 + (x)])
 #define t_fn3(x) (sharedMemory[768 + (x)])
 
+#define round(dummy, y, x, k) /* y[i] = k[i] ^ four table lookups over x; `dummy` is never referenced: t_fn0..3 read the caller's `sharedMemory` by name */ \
+	y[0] = (k)[0] ^ t_fn0(BYTE_0(x[0])) ^ t_fn1(BYTE_1(x[1])) ^ t_fn2(BYTE_2(x[2])) ^ t_fn3(BYTE_3(x[3])); \
+	y[1] = (k)[1] ^ t_fn0(BYTE_0(x[1])) ^ t_fn1(BYTE_1(x[2])) ^ t_fn2(BYTE_2(x[3])) ^ t_fn3(BYTE_3(x[0])); \
+	y[2] = (k)[2] ^ t_fn0(BYTE_0(x[2])) ^ t_fn1(BYTE_1(x[3])) ^ t_fn2(BYTE_2(x[0])) ^ t_fn3(BYTE_3(x[1])); \
+	y[3] = (k)[3] ^ t_fn0(BYTE_0(x[3])) ^ t_fn1(BYTE_1(x[0])) ^ t_fn2(BYTE_2(x[1])) ^ t_fn3(BYTE_3(x[2])); // NOTE: expands to four full statements, so wrap calls in braces inside if/else
 
-#define round(dummy,y,x,k) \
-	y[0] = (k)[0]  ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24))); \
-	y[1] = (k)[1]  ^ (t_fn0(x[1] & 0xff) ^ t_fn1((x[2] >> 8) & 0xff) ^ t_fn2((x[3] >> 16) & 0xff) ^ t_fn3((x[0] >> 24))); \
-	y[2] = (k)[2]  ^ (t_fn0(x[2] & 0xff) ^ t_fn1((x[3] >> 8) & 0xff) ^ t_fn2((x[0] >> 16) & 0xff) ^ t_fn3((x[1] >> 24))); \
-	y[3] = (k)[3]  ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24) ));
 
-__device__ __forceinline__ static void cn_aes_single_round(uint32_t * __restrict__ sharedMemory, const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t * __restrict__ expandedKey)
+__device__ __forceinline__ static uint4 round32(const uint32_t* __restrict__ sharedMemory, const uint4& b, uint4 a)
+{ // Returns `a` XOR-ed with 16 t_fn32 lookups selected by the bytes of `b`; t_fn32(i) reads sharedMemory[(i) * 32].
+	a.x ^= t_fn32(BYTE_0(b.x)); // BYTE_0 terms: every lane indexes with its own word of `b`
+	a.y ^= t_fn32(BYTE_0(b.y));
+	a.z ^= t_fn32(BYTE_0(b.z));
+	a.w ^= t_fn32(BYTE_0(b.w));
+	a.x ^= ROTL32_8(t_fn32(BYTE_1(b.y))); // BYTE_1 terms: word index advances by one (x<-y, y<-z, z<-w, w<-x)
+	a.y ^= ROTL32_8(t_fn32(BYTE_1(b.z)));
+	a.z ^= ROTL32_8(t_fn32(BYTE_1(b.w)));
+	a.w ^= ROTL32_8(t_fn32(BYTE_1(b.x)));
+	a.x ^= ROTL32_16(t_fn32(BYTE_2(b.z))) ^ ROTL32_24(t_fn32(BYTE_3(b.w))); // BYTE_2 / BYTE_3 terms: word index advances by two / three
+	a.y ^= ROTL32_16(t_fn32(BYTE_2(b.w))) ^ ROTL32_24(t_fn32(BYTE_3(b.x)));
+	a.z ^= ROTL32_16(t_fn32(BYTE_2(b.x))) ^ ROTL32_24(t_fn32(BYTE_3(b.y)));
+	a.w ^= ROTL32_16(t_fn32(BYTE_2(b.y))) ^ ROTL32_24(t_fn32(BYTE_3(b.z)));
+	return a; // `a` was taken by value, so the caller's copy (e.g. the round key) is left untouched
+}
+
+__device__ __forceinline__ static void cn_aes_single_round(uint32_t* __restrict__ sharedMemory, const uint32_t* __restrict__ in, uint32_t* __restrict__ out, const uint32_t* __restrict__ expandedKey) // applies one round() step: writes out[0..3] from in[0..3] and expandedKey[0..3]
 {
 	round(sharedMemory, out, in, expandedKey);
 }
 
-__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t * __restrict__ sharedMemory, uint32_t * __restrict__ val, const uint32_t * __restrict__ expandedKey)
+__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t* __restrict__ sharedMemory, uint32_t* __restrict__ val, const uint32_t* __restrict__ expandedKey)
 {
 	uint32_t b1[4];
 	round(sharedMemory, b1, val, expandedKey);
@@ -298,14 +578,35 @@ __device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t *
 	round(sharedMemory, val, b1, expandedKey + 9 * N_COLS);
 }
 
-__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory)
+__device__ __forceinline__ static uint4 cn_aes_pseudo_round_mut32(const uint32_t* __restrict__ sharedMemory, uint4  val, const uint4* __restrict__ expandedKey)
+{
+	// Ten chained round32() steps, one per entry of expandedKey[0..9]: the even
+	// key folds `val` into the scratch block, the odd key folds it back into `val`.
+	uint4 scratch;
+#pragma unroll
+	for(int pair = 0; pair < 10; pair += 2)
+	{
+		scratch = round32(sharedMemory, val, expandedKey[pair]);
+		val = round32(sharedMemory, scratch, expandedKey[pair + 1]);
+	}
+	// `val` now carries the result of the step that used expandedKey[9].
+	return val;
+}
+
+__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t* sharedMemory) // copies d_t_fn[0..1023] (the ranges indexed by t_fn0..t_fn3) into sharedMemory; issues no barrier itself
 {
 	for(int i = threadIdx.x; i < 1024; i += blockDim.x)
 		sharedMemory[i] = d_t_fn[i];
 }
 
-__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t *sharedMemory)
+__device__ __forceinline__ static void cn_aes_gpu_init32(uint32_t* sharedMemory) // copies the first 256 * 32 entries of d_t_fn256 into sharedMemory (the layout t_fn32 indexes)
 {
-        for(int i = threadIdx.x; i < 512; i += blockDim.x)
-                sharedMemory[i] = d_t_fn[i];
+	for(int i = threadIdx.x; i < 256 * 32; i += blockDim.x) // thread t handles indices t, t + blockDim.x, ... so the block's threads split the copy
+		sharedMemory[i] = d_t_fn256[i]; // sharedMemory must hold at least 256 * 32 uint32_t; no barrier is issued here
+}
+
+__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t* sharedMemory) // copies only d_t_fn[0..511], i.e. the ranges indexed by t_fn0 and t_fn1
+{
+	for(int i = threadIdx.x; i < 512; i += blockDim.x) // thread t handles indices t, t + blockDim.x, ...
+		sharedMemory[i] = d_t_fn[i]; // no barrier is issued here
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp
index 611fe1c8c..efd57c944 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp
@@ -1,64 +1,68 @@
 #pragma once
 
-typedef struct {
+#include "cuda_extra.hpp"
+
+typedef struct
+{
 	uint32_t h[8], s[4], t[2];
 	uint32_t buflen;
 	int nullt;
 	uint8_t buf[64];
 } blake_state;
 
-#define U8TO32(p) \
+#define U8TO32(p)                                              \
 	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
-	((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))
-
-#define U32TO8(p, v) \
-	(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
-	(p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );
-
-#define BLAKE_ROT(x,n) ROTR32(x, n)
-#define BLAKE_G(a,b,c,d,e) \
-	v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \
-	v[d] = BLAKE_ROT(v[d] ^ v[a],16); \
-	v[c] += v[d];                     \
-	v[b] = BLAKE_ROT(v[b] ^ v[c],12); \
-	v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]])+v[b]; \
-	v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \
-	v[c] += v[d];                     \
+		((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3])))
+
+#define U32TO8(p, v)               \
+	(p)[0] = (uint8_t)((v) >> 24); \
+	(p)[1] = (uint8_t)((v) >> 16); \
+	(p)[2] = (uint8_t)((v) >> 8);  \
+	(p)[3] = (uint8_t)((v));
+
+#define BLAKE_ROT(x, n) ROTR32(x, n)
+#define BLAKE_G(a, b, c, d, e)                                                      \
+	v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e + 1]]) + v[b]; \
+	v[d] = BLAKE_ROT(v[d] ^ v[a], 16);                                              \
+	v[c] += v[d];                                                                   \
+	v[b] = BLAKE_ROT(v[b] ^ v[c], 12);                                              \
+	v[a] += (m[d_blake_sigma[i][e + 1]] ^ d_blake_cst[d_blake_sigma[i][e]]) + v[b]; \
+	v[d] = BLAKE_ROT(v[d] ^ v[a], 8);                                               \
+	v[c] += v[d];                                                                   \
 	v[b] = BLAKE_ROT(v[b] ^ v[c], 7);
 
 __constant__ uint8_t d_blake_sigma[14][16] =
-{
-	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
-	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
-	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
-	{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
-	{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
-	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
-	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
-	{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
-	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
-	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
-	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
-	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}
-};
-__constant__ uint32_t d_blake_cst[16]
-= {
+	{
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+		{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+		{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+		{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+		{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+		{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+		{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+		{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+		{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+		{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+		{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+		{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+		{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}};
+__constant__ uint32_t d_blake_cst[16] = {
 	0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344,
 	0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89,
 	0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
-	0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917
-};
+	0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917};
 
-__device__ void cn_blake_compress(blake_state *  S, const uint8_t *  block)
+__device__ void cn_blake_compress(blake_state* S, const uint8_t* block)
 {
 	uint32_t v[16], m[16], i;
 
-	for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4);
-	for (i = 0; i < 8;  ++i) v[i] = S->h[i];
-	v[ 8] = S->s[0] ^ 0x243F6A88;
-	v[ 9] = S->s[1] ^ 0x85A308D3;
+	for(i = 0; i < 16; ++i)
+		m[i] = U8TO32(block + i * 4);
+	for(i = 0; i < 8; ++i)
+		v[i] = S->h[i];
+	v[8] = S->s[0] ^ 0x243F6A88;
+	v[9] = S->s[1] ^ 0x85A308D3;
 	v[10] = S->s[2] ^ 0x13198A2E;
 	v[11] = S->s[3] ^ 0x03707344;
 	v[12] = 0xA4093822;
@@ -66,7 +70,7 @@ __device__ void cn_blake_compress(blake_state *  S, const uint8_t *  block)
 	v[14] = 0x082EFA98;
 	v[15] = 0xEC4E6C89;
 
-	if (S->nullt == 0)
+	if(S->nullt == 0)
 	{
 		v[12] ^= S->t[0];
 		v[13] ^= S->t[0];
@@ -74,50 +78,54 @@ __device__ void cn_blake_compress(blake_state *  S, const uint8_t *  block)
 		v[15] ^= S->t[1];
 	}
 
-	for (i = 0; i < 14; ++i)
+	for(i = 0; i < 14; ++i)
 	{
-		BLAKE_G(0, 4,  8, 12,  0);
-		BLAKE_G(1, 5,  9, 13,  2);
-		BLAKE_G(2, 6, 10, 14,  4);
-		BLAKE_G(3, 7, 11, 15,  6);
-		BLAKE_G(3, 4,  9, 14, 14);
-		BLAKE_G(2, 7,  8, 13, 12);
-		BLAKE_G(0, 5, 10, 15,  8);
+		BLAKE_G(0, 4, 8, 12, 0);
+		BLAKE_G(1, 5, 9, 13, 2);
+		BLAKE_G(2, 6, 10, 14, 4);
+		BLAKE_G(3, 7, 11, 15, 6);
+		BLAKE_G(3, 4, 9, 14, 14);
+		BLAKE_G(2, 7, 8, 13, 12);
+		BLAKE_G(0, 5, 10, 15, 8);
 		BLAKE_G(1, 6, 11, 12, 10);
 	}
 
-	for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i];
-	for (i = 0; i < 8;  ++i) S->h[i] ^= S->s[i % 4];
+	for(i = 0; i < 16; ++i)
+		S->h[i % 8] ^= v[i];
+	for(i = 0; i < 8; ++i)
+		S->h[i] ^= S->s[i % 4];
 }
 
-__device__ void cn_blake_update(blake_state *  S, const uint8_t *  data, uint64_t datalen)
+__device__ void cn_blake_update(blake_state* S, const uint8_t* data, uint64_t datalen)
 {
 	uint32_t left = S->buflen >> 3;
 	uint32_t fill = 64 - left;
 
-	if (left && (((datalen >> 3) & 0x3F) >= fill))
+	if(left && (((datalen >> 3) & 0x3F) >= fill))
 	{
-		memcpy((void *) (S->buf + left), (void *) data, fill);
+		memcpy((void*)(S->buf + left), (void*)data, fill);
 		S->t[0] += 512;
-		if (S->t[0] == 0) S->t[1]++;
+		if(S->t[0] == 0)
+			S->t[1]++;
 		cn_blake_compress(S, S->buf);
 		data += fill;
 		datalen -= (fill << 3);
 		left = 0;
 	}
 
-	while (datalen >= 512)
+	while(datalen >= 512)
 	{
 		S->t[0] += 512;
-		if (S->t[0] == 0) S->t[1]++;
+		if(S->t[0] == 0)
+			S->t[1]++;
 		cn_blake_compress(S, data);
 		data += 64;
 		datalen -= 512;
 	}
 
-	if (datalen > 0)
+	if(datalen > 0)
 	{
-		memcpy((void *) (S->buf + left), (void *) data, datalen >> 3);
+		memcpy((void*)(S->buf + left), (void*)data, datalen >> 3);
 		S->buflen = (left << 3) + datalen;
 	}
 	else
@@ -126,31 +134,32 @@ __device__ void cn_blake_update(blake_state *  S, const uint8_t *  data, uint64_
 	}
 }
 
-__device__ void cn_blake_final(blake_state *  S, uint8_t *  digest)
+__device__ void cn_blake_final(blake_state* S, uint8_t* digest)
 {
 	const uint8_t padding[] =
-	{
-		0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-	};
+		{
+			0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
 	uint8_t pa = 0x81, pb = 0x01;
 	uint8_t msglen[8];
 	uint32_t lo = S->t[0] + S->buflen, hi = S->t[1];
-	if (lo < (unsigned) S->buflen) hi++;
+	if(lo < (unsigned)S->buflen)
+		hi++;
 	U32TO8(msglen + 0, hi);
 	U32TO8(msglen + 4, lo);
 
-	if (S->buflen == 440)
+	if(S->buflen == 440)
 	{
 		S->t[0] -= 8;
 		cn_blake_update(S, &pa, 8);
 	}
 	else
 	{
-		if (S->buflen < 440)
+		if(S->buflen < 440)
 		{
-			if (S->buflen == 0) S->nullt = 1;
+			if(S->buflen == 0)
+				S->nullt = 1;
 			S->t[0] -= 440 - S->buflen;
 			cn_blake_update(S, padding, 440 - S->buflen);
 		}
@@ -168,9 +177,9 @@ __device__ void cn_blake_final(blake_state *  S, uint8_t *  digest)
 	S->t[0] -= 64;
 	cn_blake_update(S, msglen, 64);
 
-	U32TO8(digest +  0, S->h[0]);
-	U32TO8(digest +  4, S->h[1]);
-	U32TO8(digest +  8, S->h[2]);
+	U32TO8(digest + 0, S->h[0]);
+	U32TO8(digest + 4, S->h[1]);
+	U32TO8(digest + 8, S->h[2]);
 	U32TO8(digest + 12, S->h[3]);
 	U32TO8(digest + 16, S->h[4]);
 	U32TO8(digest + 20, S->h[5]);
@@ -178,17 +187,22 @@ __device__ void cn_blake_final(blake_state *  S, uint8_t *  digest)
 	U32TO8(digest + 28, S->h[7]);
 }
 
-__device__ void cn_blake(const uint8_t *  in, uint64_t inlen, uint8_t *  out)
+__device__ void cn_blake(const uint8_t* in, uint64_t inlen, uint8_t* out)
 {
 	blake_state bs;
-	blake_state *S = (blake_state *)&bs;
-
-	S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372;
-	S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C;
-	S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19;
+	blake_state* S = (blake_state*)&bs;
+
+	S->h[0] = 0x6A09E667;
+	S->h[1] = 0xBB67AE85;
+	S->h[2] = 0x3C6EF372;
+	S->h[3] = 0xA54FF53A;
+	S->h[4] = 0x510E527F;
+	S->h[5] = 0x9B05688C;
+	S->h[6] = 0x1F83D9AB;
+	S->h[7] = 0x5BE0CD19;
 	S->t[0] = S->t[1] = S->buflen = S->nullt = 0;
 	S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0;
 
-	cn_blake_update(S, (uint8_t *)in, inlen * 8);
-	cn_blake_final(S, (uint8_t *)out);
+	cn_blake_update(S, (uint8_t*)in, inlen * 8);
+	cn_blake_final(S, (uint8_t*)out);
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 718cff0c7..6c769b3e8 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -1,55 +1,55 @@
 #include "xmrstak/backend/cryptonight.hpp"
 
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
+#include <bitset>
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <bitset>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
 
-#include "xmrstak/jconf.hpp"
-#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
-#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp"
-#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp"
 #include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp"
-
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
+#include "xmrstak/jconf.hpp"
 
 #ifdef _WIN32
 #include <windows.h>
 extern "C" void compat_usleep(uint64_t waitTime)
 {
-    if (waitTime > 0)
-    {
-        if (waitTime > 100)
-        {
-            // use a waitable timer for larger intervals > 0.1ms
-
-            HANDLE timer;
-            LARGE_INTEGER ft;
-
-            ft.QuadPart = -10ll * int64_t(waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time
-
-            timer = CreateWaitableTimer(NULL, TRUE, NULL);
-            SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0);
-            WaitForSingleObject(timer, INFINITE);
-            CloseHandle(timer);
-        }
-        else
-        {
-            // use a polling loop for short intervals <= 100ms
-
-            LARGE_INTEGER perfCnt, start, now;
-            __int64 elapsed;
-
-            QueryPerformanceFrequency(&perfCnt);
-            QueryPerformanceCounter(&start);
-            do {
-		SwitchToThread();
-                QueryPerformanceCounter((LARGE_INTEGER*) &now);
-                elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000);
-            } while ( elapsed < waitTime );
-        }
-    }
+	if(waitTime > 0)
+	{
+		if(waitTime > 100)
+		{
+			// use a waitable timer for larger intervals > 0.1ms
+
+			HANDLE timer;
+			LARGE_INTEGER ft;
+
+			ft.QuadPart = -10ll * int64_t(waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time
+
+			timer = CreateWaitableTimer(NULL, TRUE, NULL);
+			SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0);
+			WaitForSingleObject(timer, INFINITE);
+			CloseHandle(timer);
+		}
+		else
+		{
+			// use a polling loop for short intervals <= 100us (0.1ms)
+
+			LARGE_INTEGER perfCnt, start, now;
+			__int64 elapsed;
+
+			QueryPerformanceFrequency(&perfCnt);
+			QueryPerformanceCounter(&start);
+			do
+			{
+				SwitchToThread();
+				QueryPerformanceCounter((LARGE_INTEGER*)&now);
+				elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000);
+			} while(elapsed < waitTime);
+		}
+	}
 }
 #else
 #include <unistd.h>
@@ -60,9 +60,9 @@ extern "C" void compat_usleep(uint64_t waitTime)
 #endif
 
 #include "cryptonight.hpp"
-#include "cuda_extra.hpp"
 #include "cuda_aes.hpp"
 #include "cuda_device.hpp"
+#include "cuda_extra.hpp"
 
 /* sm_2X is limited to 2GB due to the small TLB
  * therefore we never use 64bit indices
@@ -73,106 +73,56 @@ typedef uint64_t IndexType;
 typedef int IndexType;
 #endif
 
-__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi )
+__device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi)
 {
-	product_hi = __umul64hi( multiplier, multiplicand );
-	return (multiplier * multiplicand );
-}
-
-template< typename T >
-__device__ __forceinline__ T loadGlobal64( T * const addr )
-{
-#if (__CUDA_ARCH__ < 700)
-	T x;
-	asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) );
-	return x;
-#else
-	return *addr;
-#endif
-}
-
-template< typename T >
-__device__ __forceinline__ T loadGlobal32( T * const addr )
-{
-#if (__CUDA_ARCH__ < 700)
-	T x;
-	asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) );
-	return x;
-#else
-	return *addr;
-#endif
-}
-
-
-template< typename T >
-__device__ __forceinline__ void storeGlobal32( T* addr, T const & val )
-{
-#if (__CUDA_ARCH__ < 700)
-	asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) );
-#else
-	*addr = val;
-#endif
-}
-
-template< typename T >
-__device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
-{
-#if (__CUDA_ARCH__ < 700)
-	asm volatile( "st.global.cg.u64 [%0], %1;" : : "l"( addr ), "l"( val ) );
-#else
-	*addr = val;
-#endif
-}
-
-__device__ __forceinline__ uint32_t rotate16( const uint32_t n )
-{
-	return (n >> 16u) | (n << 16u);
+	product_hi = __umul64hi(multiplier, multiplicand);
+	return (multiplier * multiplicand);
 }
 
 __global__ void cryptonight_core_gpu_phase1(
-	const uint32_t ITERATIONS,  const size_t MEMORY,
-	int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
+	const uint32_t ITERATIONS, const size_t MEMORY,
+	int threads, int bfactor, int partidx, uint32_t* __restrict__ long_state, uint32_t* __restrict__ ctx_state2, uint32_t* __restrict__ ctx_key1)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	cn_aes_gpu_init( sharedMemory );
-	__syncthreads( );
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
 
-	const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
-	const int sub = ( threadIdx.x & 7 ) << 2;
+	const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
+	const int sub = (threadIdx.x & 7) << 2;
 
 	const int batchsize = MEMORY >> bfactor;
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
 
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
 	uint32_t key[40], text[4];
 
-	MEMCPY8( key, ctx_key1 + thread * 40, 20 );
+	MEMCPY8(key, ctx_key1 + thread * 40, 20);
 
-	if( partidx == 0 )
+	if(partidx == 0)
 	{
 		// first round
-		MEMCPY8( text, ctx_state2 + thread * 50 + sub + 16, 2 );
+		MEMCPY8(text, ctx_state2 + thread * 50 + sub + 16, 2);
 	}
 	else
 	{
 		// load previous text data
-		MEMCPY8( text, &long_state[( (uint64_t) thread * MEMORY ) + sub + start - 32], 2 );
+		MEMCPY8(text, &long_state[((uint64_t)thread * MEMORY) + sub + start - 32], 2);
 	}
-	__syncthreads( );
-	for ( int i = start; i < end; i += 32 )
+	__syncthreads();
+	for(int i = start; i < end; i += 32)
 	{
-		cn_aes_pseudo_round_mut( sharedMemory, text, key );
-		MEMCPY8(&long_state[((uint64_t) thread * MEMORY) + (sub + i)], text, 2);
+		cn_aes_pseudo_round_mut(sharedMemory, text, key);
+		MEMCPY8(&long_state[((uint64_t)thread * MEMORY) + (sub + i)], text, 2);
 	}
 }
 
 /** avoid warning `unused parameter` */
-template< typename T >
-__forceinline__ __device__ void unusedVar( const T& )
+template <typename T>
+__forceinline__ __device__ void unusedVar(const T&)
 {
 }
 
@@ -189,25 +139,25 @@ __forceinline__ __device__ void unusedVar( const T& )
  * @param value value to share with other threads within the group
  * @param src thread number within the group from where the data is read, range [0:group_n]
  */
-template<size_t group_n>
-__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src)
+template <size_t group_n>
+__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr, const uint32_t sub, const int val, const uint32_t src)
 {
-#if( __CUDA_ARCH__ < 300 )
-    ptr[sub] = val;
-    return ptr[src & (group_n-1)];
+#if(__CUDA_ARCH__ < 300)
+	ptr[sub] = val;
+	return ptr[src & (group_n - 1)];
 #else
-    unusedVar( ptr );
-    unusedVar( sub );
-#   if(__CUDACC_VER_MAJOR__ >= 9)
-    return __shfl_sync(__activemask(), val, src, group_n );
-#	else
-	return __shfl( val, src, group_n );
-#	endif
+	unusedVar(ptr);
+	unusedVar(sub);
+#if(__CUDACC_VER_MAJOR__ >= 9)
+	return __shfl_sync(__activemask(), val, src, group_n);
+#else
+	return __shfl(val, src, group_n);
+#endif
 #endif
 }
 
-template<size_t group_n>
-__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2)
+template <size_t group_n>
+__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr, const uint32_t sub, const int val, const uint32_t src, const uint32_t src2)
 {
 	uint64_t tmp;
 	((uint32_t*)&tmp)[0] = shuffle<group_n>(ptr, sub, val, src);
@@ -218,9 +168,9 @@ __forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint3
 struct u64 : public uint2
 {
 
-	__forceinline__ __device__ u64(){}
+	__forceinline__ __device__ u64() {}
 
-	__forceinline__ __device__ u64( const uint32_t x0, const uint32_t x1)
+	__forceinline__ __device__ u64(const uint32_t x0, const uint32_t x1)
 	{
 		uint2::x = x0;
 		uint2::y = x1;
@@ -231,7 +181,7 @@ struct u64 : public uint2
 		return *((uint64_t*)this);
 	}
 
-	__forceinline__ __device__ u64( const uint64_t x0)
+	__forceinline__ __device__ u64(const uint64_t x0)
 	{
 		((uint64_t*)&this->x)[0] = x0;
 	}
@@ -259,7 +209,7 @@ struct u64 : public uint2
 
 	__forceinline__ __device__ void print(int i) const
 	{
-		if(i<2)
+		if(i < 2)
 			printf("gpu: %lu\n", ((uint64_t*)&this->x)[0]);
 	}
 };
@@ -269,42 +219,42 @@ struct u64 : public uint2
  * @tparam MEM_MODE if `0` than 64bit memory transfers per thread will be used to store/load data within shared memory
  *                   else if `1` 256bit operations will be used
  */
-template<xmrstak_algo_id ALGO, uint32_t MEM_MODE>
+template <xmrstak_algo_id ALGO, uint32_t MEM_MODE>
 #ifdef XMR_STAK_THREADS
-__launch_bounds__( XMR_STAK_THREADS * 2 )
+__launch_bounds__(XMR_STAK_THREADS * 2)
 #endif
-__global__ void cryptonight_core_gpu_phase2_double(
-	const uint32_t ITERATIONS,  const size_t MEMORY, const uint32_t MASK,
-	int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
-		uint32_t startNonce, uint32_t * __restrict__ d_input )
+	__global__ void cryptonight_core_gpu_phase2_double(
+		const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK,
+		int threads, int bfactor, int partidx, uint32_t* d_long_state, uint32_t* d_ctx_a, uint32_t* d_ctx_b, uint32_t* d_ctx_state,
+		uint32_t startNonce, uint32_t* __restrict__ d_input)
 {
 	__shared__ uint32_t sharedMemory[512];
 
-	cn_aes_gpu_init_half( sharedMemory );
+	cn_aes_gpu_init_half(sharedMemory);
 
-#if( __CUDA_ARCH__ < 300 )
+#if(__CUDA_ARCH__ < 300)
 	extern __shared__ uint64_t externShared[];
 	// 8 x 64bit values
 	volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8);
-    volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8)  + (threadIdx.x & 0xFFFFFFFE);
+	volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE);
 #else
 	extern __shared__ uint64_t chunkMem[];
-    volatile uint32_t* sPtr = NULL;
+	volatile uint32_t* sPtr = NULL;
 	// 8 x 64bit values
 	volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem + (threadIdx.x >> 1) * 8);
 
 #endif
 
-	__syncthreads( );
+	__syncthreads();
 
 	const uint64_t tid = (blockDim.x * blockIdx.x + threadIdx.x);
 	const uint32_t thread = tid >> 1;
 	const uint32_t sub = tid & 1;
 
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
-	uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY];
+	uint8_t* l0 = (uint8_t*)&d_long_state[(IndexType)thread * MEMORY];
 
 	uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub];
 	uint64_t bx0;
@@ -324,22 +274,22 @@ __global__ void cryptonight_core_gpu_phase2_double(
 		sqrt_result = (d_ctx_b + thread * 16 + 4 * 2 + 2)[0];
 	}
 	else
-		 bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub];
+		bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub];
 
-	const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
+	const int batchsize = (ITERATIONS * 2) >> (1 + bfactor);
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
 
 	for(int i = start; i < end; ++i)
 	{
-		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
+		ptr0 = (uint64_t*)&l0[idx0 & MASK & 0x1FFFC0];
 
 		if(MEM_MODE == 0)
 		{
-			#pragma unroll 4
+#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
 			{
-				myChunks[x + sub] = ptr0[ x + sub ];
+				myChunks[x + sub] = ptr0[x + sub];
 			}
 		}
 		else
@@ -347,52 +297,51 @@ __global__ void cryptonight_core_gpu_phase2_double(
 
 		uint32_t idx1 = (idx0 & 0x30) >> 3;
 
-		const u64 cx = myChunks[ idx1 + sub ];
-		const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ];
+		const u64 cx = myChunks[idx1 + sub];
+		const u64 cx2 = myChunks[idx1 + ((sub + 1) & 1)];
 
 		u64 cx_aes = ax0 ^ u64(
-			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.x >> 16) & 0xff ) ^ t_fn1( (cx2.y >> 24 ) )),
-			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) ))
-		);
+			t_fn0(BYTE_0(cx.x)) ^ t_fn1(BYTE_1(cx.y)) ^ ROTL32_16(t_fn0(BYTE_2(cx2.x)) ^ t_fn1(BYTE_3(cx2.y))),
+			t_fn0(BYTE_0(cx.y)) ^ t_fn1(BYTE_1(cx2.x)) ^ ROTL32_16(t_fn0(BYTE_2(cx2.y)) ^ t_fn1(BYTE_3(cx.x))));
 
 		if(ALGO == cryptonight_monero_v8)
 		{
 
-			const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ];
-			const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
-			const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ];
-#if (__CUDACC_VER_MAJOR__ >= 9)
+			const uint64_t chunk1 = myChunks[idx1 ^ 2 + sub];
+			const uint64_t chunk2 = myChunks[idx1 ^ 4 + sub];
+			const uint64_t chunk3 = myChunks[idx1 ^ 6 + sub];
+#if(__CUDACC_VER_MAJOR__ >= 9)
 			__syncwarp();
 #else
-			__syncthreads( );
+			__syncthreads();
 #endif
-			myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
-			myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
-			myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+			myChunks[idx1 ^ 2 + sub] = chunk3 + bx1;
+			myChunks[idx1 ^ 4 + sub] = chunk1 + bx0;
+			myChunks[idx1 ^ 6 + sub] = chunk2 + ax0;
 		}
 		else if(ALGO == cryptonight_v8_reversewaltz)
 		{
 
-			const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ];
-			const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
-			const uint64_t chunk1 = myChunks[ idx1 ^ 6 + sub ];
-#if (__CUDACC_VER_MAJOR__ >= 9)
+			const uint64_t chunk3 = myChunks[idx1 ^ 2 + sub];
+			const uint64_t chunk2 = myChunks[idx1 ^ 4 + sub];
+			const uint64_t chunk1 = myChunks[idx1 ^ 6 + sub];
+#if(__CUDACC_VER_MAJOR__ >= 9)
 			__syncwarp();
 #else
-			__syncthreads( );
+			__syncthreads();
 #endif
-			myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
-			myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
-			myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+			myChunks[idx1 ^ 2 + sub] = chunk3 + bx1;
+			myChunks[idx1 ^ 4 + sub] = chunk1 + bx0;
+			myChunks[idx1 ^ 6 + sub] = chunk2 + ax0;
 		}
 
-		myChunks[ idx1 + sub ] = cx_aes ^ bx0;
+		myChunks[idx1 + sub] = cx_aes ^ bx0;
 		if(MEM_MODE == 0)
 		{
-			#pragma unroll 4
+#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
 			{
-				ptr0[ x + sub ] = myChunks[x + sub];
+				ptr0[x + sub] = myChunks[x + sub];
 			}
 		}
 		else
@@ -400,14 +349,14 @@ __global__ void cryptonight_core_gpu_phase2_double(
 
 		idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
 		idx1 = (idx0 & 0x30) >> 3;
-		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
+		ptr0 = (uint64_t*)&l0[idx0 & MASK & 0x1FFFC0];
 
 		if(MEM_MODE == 0)
 		{
-			#pragma unroll 4
+#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
 			{
-				myChunks[x + sub] = ptr0[ x + sub ];
+				myChunks[x + sub] = ptr0[x + sub];
 			}
 		}
 		else
@@ -417,15 +366,15 @@ __global__ void cryptonight_core_gpu_phase2_double(
 			bx0 = cx_aes;
 
 		uint64_t cx_mul;
-		((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0);
-		((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0);
+		((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x, 0);
+		((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y, 0);
 
 		if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && sub == 1)
 		{
 			// Use division and square root results from the _previous_ iteration to hide the latency
 			((uint32_t*)&division_result)[1] ^= sqrt_result;
 
-			((uint64_t*)myChunks)[ idx1 ] ^= division_result;
+			((uint64_t*)myChunks)[idx1] ^= division_result;
 
 			const uint32_t dd = (static_cast<uint32_t>(cx_mul) + (sqrt_result << 1)) | 0x80000001UL;
 			division_result = fast_div_v2(cx_aes, dd);
@@ -433,46 +382,46 @@ __global__ void cryptonight_core_gpu_phase2_double(
 			// Use division_result as an input for the square root to prevent parallel implementation in hardware
 			sqrt_result = fast_sqrt_v2(cx_mul + division_result);
 		}
-#if (__CUDACC_VER_MAJOR__ >= 9)
-				__syncwarp();
+#if(__CUDACC_VER_MAJOR__ >= 9)
+		__syncwarp();
 #else
-				__syncthreads( );
+		__syncthreads();
 #endif
-		uint64_t c = ((uint64_t*)myChunks)[ idx1 + sub ];
+		uint64_t c = ((uint64_t*)myChunks)[idx1 + sub];
 
 		{
-			uint64_t cl = ((uint64_t*)myChunks)[ idx1 ];
+			uint64_t cl = ((uint64_t*)myChunks)[idx1];
 			// sub 0 -> hi, sub 1 -> lo
-			uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl;
+			uint64_t res = sub == 0 ? __umul64hi(cx_mul, cl) : cx_mul * cl;
 			if(ALGO == cryptonight_monero_v8)
 			{
-				const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res;
-				uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
+				const uint64_t chunk1 = myChunks[idx1 ^ 2 + sub] ^ res;
+				uint64_t chunk2 = myChunks[idx1 ^ 4 + sub];
 				res ^= ((uint64_t*)&chunk2)[0];
-				const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ];
-#if (__CUDACC_VER_MAJOR__ >= 9)
+				const uint64_t chunk3 = myChunks[idx1 ^ 6 + sub];
+#if(__CUDACC_VER_MAJOR__ >= 9)
 				__syncwarp();
 #else
-				__syncthreads( );
+				__syncthreads();
 #endif
-				myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
-				myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
-				myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+				myChunks[idx1 ^ 2 + sub] = chunk3 + bx1;
+				myChunks[idx1 ^ 4 + sub] = chunk1 + bx0;
+				myChunks[idx1 ^ 6 + sub] = chunk2 + ax0;
 			}
 			if(ALGO == cryptonight_v8_reversewaltz)
 			{
-				const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ] ^ res;
-				uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
+				const uint64_t chunk3 = myChunks[idx1 ^ 2 + sub] ^ res;
+				uint64_t chunk2 = myChunks[idx1 ^ 4 + sub];
 				res ^= ((uint64_t*)&chunk2)[0];
-				const uint64_t chunk1 = myChunks[ idx1 ^ 6 + sub ];
-#if (__CUDACC_VER_MAJOR__ >= 9)
+				const uint64_t chunk1 = myChunks[idx1 ^ 6 + sub];
+#if(__CUDACC_VER_MAJOR__ >= 9)
 				__syncwarp();
 #else
-				__syncthreads( );
+				__syncthreads();
 #endif
-				myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
-				myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
-				myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+				myChunks[idx1 ^ 2 + sub] = chunk3 + bx1;
+				myChunks[idx1 ^ 4 + sub] = chunk1 + bx0;
+				myChunks[idx1 ^ 6 + sub] = chunk2 + ax0;
 			}
 			ax0 += res;
 		}
@@ -481,13 +430,13 @@ __global__ void cryptonight_core_gpu_phase2_double(
 			bx1 = bx0;
 			bx0 = cx_aes;
 		}
-		myChunks[ idx1 + sub ] = ax0;
+		myChunks[idx1 + sub] = ax0;
 		if(MEM_MODE == 0)
 		{
-			#pragma unroll 4
+#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
 			{
-				ptr0[ x + sub ] = myChunks[x + sub];
+				ptr0[x + sub] = myChunks[x + sub];
 			}
 		}
 		else
@@ -496,7 +445,7 @@ __global__ void cryptonight_core_gpu_phase2_double(
 		idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
 	}
 
-	if ( bfactor > 0 )
+	if(bfactor > 0)
 	{
 		((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0;
 		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
@@ -516,41 +465,41 @@ __global__ void cryptonight_core_gpu_phase2_double(
 	}
 }
 
-template<xmrstak_algo_id ALGO>
+template <xmrstak_algo_id ALGO>
 #ifdef XMR_STAK_THREADS
-__launch_bounds__( XMR_STAK_THREADS * 4 )
+__launch_bounds__(XMR_STAK_THREADS * 4)
 #endif
-__global__ void cryptonight_core_gpu_phase2_quad(
-	const uint32_t ITERATIONS,  const size_t MEMORY, const uint32_t MASK,
-	int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
-		uint32_t startNonce, uint32_t * __restrict__ d_input )
+	__global__ void cryptonight_core_gpu_phase2_quad(
+		const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK,
+		int threads, int bfactor, int partidx, uint32_t* d_long_state, uint32_t* d_ctx_a, uint32_t* d_ctx_b, uint32_t* d_ctx_state,
+		uint32_t startNonce, uint32_t* __restrict__ d_input)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	cn_aes_gpu_init( sharedMemory );
+	cn_aes_gpu_init(sharedMemory);
 
-	__syncthreads( );
+	__syncthreads();
 
-	const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2;
+	const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
 	const uint32_t nonce = startNonce + thread;
 	const int sub = threadIdx.x & 3;
 	const int sub2 = sub & 2;
 
-#if( __CUDA_ARCH__ < 300 )
-        extern __shared__ uint32_t shuffleMem[];
-        volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC));
+#if(__CUDA_ARCH__ < 300)
+	extern __shared__ uint32_t shuffleMem[];
+	volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x & 0xFFFFFFFC));
 #else
-        volatile uint32_t* sPtr = NULL;
+	volatile uint32_t* sPtr = NULL;
 #endif
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
 	int i, k;
 	uint32_t j;
-	const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor );
+	const int batchsize = (ITERATIONS * 2) >> (2 + bfactor);
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
-	uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
+	uint32_t* long_state = &d_long_state[(IndexType)thread * MEMORY];
 	uint32_t a, d[2], idx0;
 	uint32_t t1[2], t2[2], res;
 
@@ -564,9 +513,9 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 	}
 
 	uint32_t tweak1_2[2];
-	if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
 	{
-		uint32_t * state = d_ctx_state + thread * 50;
+		uint32_t* state = d_ctx_state + thread * 50;
 		tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8);
 		tweak1_2[0] ^= state[48];
 		tweak1_2[1] = nonce;
@@ -574,7 +523,7 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 	}
 
 	a = (d_ctx_a + thread * 4)[sub];
-	idx0 = shuffle<4>(sPtr,sub, a, 0);
+	idx0 = shuffle<4>(sPtr, sub, a, 0);
 	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		if(partidx != 0)
@@ -585,33 +534,33 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 	}
 	d[1] = (d_ctx_b + thread * 4)[sub];
 
-	#pragma unroll 2
-	for ( i = start; i < end; ++i )
+#pragma unroll 2
+	for(i = start; i < end; ++i)
 	{
-		#pragma unroll 2
-		for ( int x = 0; x < 2; ++x )
+#pragma unroll 2
+		for(int x = 0; x < 2; ++x)
 		{
-			j = ( ( idx0 & MASK ) >> 2 ) + sub;
+			j = ((idx0 & MASK) >> 2) + sub;
 
 			if(ALGO == cryptonight_bittube2)
 			{
 				uint32_t k[4];
-				k[0] = ~loadGlobal32<uint32_t>( long_state + j );
-				k[1] = shuffle<4>(sPtr,sub, k[0], sub + 1);
-				k[2] = shuffle<4>(sPtr,sub, k[0], sub + 2);
-				k[3] = shuffle<4>(sPtr,sub, k[0], sub + 3);
+				k[0] = ~loadGlobal32<uint32_t>(long_state + j);
+				k[1] = shuffle<4>(sPtr, sub, k[0], sub + 1);
+				k[2] = shuffle<4>(sPtr, sub, k[0], sub + 2);
+				k[3] = shuffle<4>(sPtr, sub, k[0], sub + 3);
 
-				#pragma unroll 4
+#pragma unroll 4
 				for(int i = 0; i < 4; ++i)
 				{
 					// only calculate the key if all data are up to date
 					if(i == sub)
 					{
 						d[x] = a ^
-							t_fn0( k[0] & 0xff ) ^
-							t_fn1( (k[1] >> 8) & 0xff ) ^
-							t_fn2( (k[2] >> 16) & 0xff ) ^
-							t_fn3( (k[3] >> 24 ) );
+							   t_fn0(BYTE_0(k[0])) ^
+							   t_fn1(BYTE_1(k[1])) ^
+							   t_fn2(BYTE_2(k[2])) ^
+							   t_fn3(BYTE_3(k[3]));
 					}
 					// the last shuffle is not needed
 					if(i != 3)
@@ -619,13 +568,13 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 						/* avoid negative number for modulo
 						 * load valid key (k) depending on the round
 						 */
-						k[(4 - sub + i)%4] = shuffle<4>(sPtr,sub, k[0] ^ d[x], i);
+						k[(4 - sub + i) % 4] = shuffle<4>(sPtr, sub, k[0] ^ d[x], i);
 					}
 				}
 			}
 			else
 			{
-				uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
+				uint32_t x_0 = loadGlobal32<uint32_t>(long_state + j);
 
 				if(ALGO == cryptonight_conceal)
 				{
@@ -642,18 +591,18 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 					x_0 = (uint32_t)(((int32_t)x_0) ^ ((int32_t)c_old));
 				}
 
-				const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1);
-				const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2);
-				const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3);
+				const uint32_t x_1 = shuffle<4>(sPtr, sub, x_0, sub + 1);
+				const uint32_t x_2 = shuffle<4>(sPtr, sub, x_0, sub + 2);
+				const uint32_t x_3 = shuffle<4>(sPtr, sub, x_0, sub + 3);
 				d[x] = a ^
-					t_fn0( x_0 & 0xff ) ^
-					t_fn1( (x_1 >> 8) & 0xff ) ^
-					t_fn2( (x_2 >> 16) & 0xff ) ^
-					t_fn3( ( x_3 >> 24 ) );
+					   t_fn0(BYTE_0(x_0)) ^
+					   t_fn1(BYTE_1(x_1)) ^
+					   t_fn2(BYTE_2(x_2)) ^
+					   t_fn3(BYTE_3(x_3));
 			}
 
 			//XOR_BLOCKS_DST(c, b, &long_state[j]);
-			t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
+			t1[0] = shuffle<4>(sPtr, sub, d[x], 0);
 
 			const uint32_t z = d[0] ^ d[1];
 			if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
@@ -663,157 +612,178 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 				{
 					const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2);
 					const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
-					storeGlobal32( long_state + j, sub == 2 ? fork_7 : z );
+					storeGlobal32(long_state + j, sub == 2 ? fork_7 : z);
 				}
 				else if(ALGO == cryptonight_stellite)
 				{
 					const uint32_t index = ((z >> 27) & 12) | ((z >> 23) & 2);
 					const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
-					storeGlobal32( long_state + j, sub == 2 ? fork_7 : z );
+					storeGlobal32(long_state + j, sub == 2 ? fork_7 : z);
 				}
 			}
 			else
-				storeGlobal32( long_state + j, z );
+				storeGlobal32(long_state + j, z);
 
 			//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]);
-			j = ( ( *t1 & MASK ) >> 2 ) + sub;
+			j = ((*t1 & MASK) >> 2) + sub;
 
 			uint32_t yy[2];
-			*( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) );
+			*((uint64_t*)yy) = loadGlobal64<uint64_t>(((uint64_t*)long_state) + (j >> 1));
 			uint32_t zz[2];
-			zz[0] = shuffle<4>(sPtr,sub, yy[0], 0);
-			zz[1] = shuffle<4>(sPtr,sub, yy[1], 0);
+			zz[0] = shuffle<4>(sPtr, sub, yy[0], 0);
+			zz[1] = shuffle<4>(sPtr, sub, yy[1], 0);
 
-			t1[1] = shuffle<4>(sPtr,sub, d[x], 1);
-			#pragma unroll
-			for ( k = 0; k < 2; k++ )
-				t2[k] = shuffle<4>(sPtr,sub, a, k + sub2);
+			t1[1] = shuffle<4>(sPtr, sub, d[x], 1);
+#pragma unroll
+			for(k = 0; k < 2; k++)
+				t2[k] = shuffle<4>(sPtr, sub, a, k + sub2);
 
-            *( (uint64_t *) t2 ) += sub2 ? ( *( (uint64_t *) t1 ) * *( (uint64_t*) zz ) ) : __umul64hi( *( (uint64_t *) t1 ), *( (uint64_t*) zz ) );
+			*((uint64_t*)t2) += sub2 ? (*((uint64_t*)t1) * *((uint64_t*)zz)) : __umul64hi(*((uint64_t*)t1), *((uint64_t*)zz));
 
-			res = *( (uint64_t *) t2 )  >> ( sub & 1 ? 32 : 0 );
+			res = *((uint64_t*)t2) >> (sub & 1 ? 32 : 0);
 
 			if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
 			{
 				const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res;
 				uint32_t long_state_update = sub2 ? tweaked_res : res;
 
-				if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
+				if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
 				{
-					uint32_t value = shuffle<4>(sPtr,sub, long_state_update, sub & 1) ^ long_state_update;
+					uint32_t value = shuffle<4>(sPtr, sub, long_state_update, sub & 1) ^ long_state_update;
 					long_state_update = sub >= 2 ? value : long_state_update;
 				}
 
-				storeGlobal32( long_state + j, long_state_update );
+				storeGlobal32(long_state + j, long_state_update);
 			}
 			else
-				storeGlobal32( long_state + j, res );
+				storeGlobal32(long_state + j, res);
 
-			a = ( sub & 1 ? yy[1] : yy[0] ) ^ res;
-			idx0 = shuffle<4>(sPtr,sub, a, 0);
+			a = (sub & 1 ? yy[1] : yy[0]) ^ res;
+			idx0 = shuffle<4>(sPtr, sub, a, 0);
 			if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
 			{
-				int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
-				int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
+				int64_t n = loadGlobal64<uint64_t>(((uint64_t*)long_state) + ((idx0 & MASK) >> 3));
+				int32_t d = loadGlobal32<uint32_t>((uint32_t*)(((uint64_t*)long_state) + ((idx0 & MASK) >> 3) + 1u));
 				int64_t q = fast_div_heavy(n, (d | 0x5));
 
-				if(sub&1)
-					storeGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
+				if(sub & 1)
+					storeGlobal64<uint64_t>(((uint64_t*)long_state) + ((idx0 & MASK) >> 3), n ^ q);
 
 				idx0 = d ^ q;
 			}
 			else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast)
 			{
-				int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
-				int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
+				int64_t n = loadGlobal64<uint64_t>(((uint64_t*)long_state) + ((idx0 & MASK) >> 3));
+				int32_t d = loadGlobal32<uint32_t>((uint32_t*)(((uint64_t*)long_state) + ((idx0 & MASK) >> 3) + 1u));
 				int64_t q = fast_div_heavy(n, (d | 0x5));
 
-				if(sub&1)
-					storeGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
+				if(sub & 1)
+					storeGlobal64<uint64_t>(((uint64_t*)long_state) + ((idx0 & MASK) >> 3), n ^ q);
 
 				idx0 = (~d) ^ q;
 			}
 		}
 	}
 
-	if ( bfactor > 0 )
+	if(bfactor > 0)
 	{
 		(d_ctx_a + thread * 4)[sub] = a;
 		(d_ctx_b + thread * 4)[sub] = d[1];
 		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
-			if(sub&1)
+			if(sub & 1)
 				*(d_ctx_b + threads * 4 + thread) = idx0;
 		if(ALGO == cryptonight_conceal)
 			*(d_ctx_b + threads * 4 + thread * 4 + sub) = float_as_int(conc_var);
 	}
 }
 
-template<xmrstak_algo_id ALGO>
+template <xmrstak_algo_id ALGO>
 __global__ void cryptonight_core_gpu_phase3(
-	const uint32_t ITERATIONS,  const size_t MEMORY,
-	int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
+	const uint32_t ITERATIONS, const size_t MEMORY,
+	int threads, int bfactor, int partidx, uint32_t* long_stateIn, const uint32_t* const __restrict__ d_ctx_stateIn, uint32_t* __restrict__ d_ctx_key2)
 {
-	__shared__ uint32_t sharedMemory[1024];
+	__shared__ uint32_t sharedMemoryX[256 * 32];
 
-	cn_aes_gpu_init( sharedMemory );
-	__syncthreads( );
+	/* keep the compiler from optimizing `sharedMemory[ x * 32 ]` to `sharedMemoryX + x * 32 + twidx` later in the AES round */
+	const int twidx = (threadIdx.x * 4) % 128;
+	// `twidx` is a byte offset, so this is equivalent to `(uint32_t*)sharedMemoryX + twidx / 4;` where `twidx / 4` is in [0;32)
+	char* sharedMemory = (char*)sharedMemoryX + twidx;
 
-	int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
-	int subv = ( threadIdx.x & 7 );
+	cn_aes_gpu_init32(sharedMemoryX);
+	__syncthreads();
+
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
+	int subv = (threadIdx.x & 7);
 	int sub = subv << 2;
 
 	const int batchsize = MEMORY >> bfactor;
 	const int start = (partidx % (1 << bfactor)) * batchsize;
 	const int end = start + batchsize;
 
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
+	const uint32_t* const long_state = long_stateIn + ((IndexType)thread * MEMORY) + sub;
+
 	uint32_t key[40], text[4];
-	MEMCPY8( key, d_ctx_key2 + thread * 40, 20 );
-	MEMCPY8( text, d_ctx_state + thread * 50 + sub + 16, 2 );
+	#pragma unroll 10
+	for(int j = 0; j < 10; ++j)
+		((ulonglong4*)key)[j] = ((ulonglong4*)(d_ctx_key2 + thread * 40))[j];
 
-	__syncthreads( );
+	uint64_t* d_ctx_state = (uint64_t*)(d_ctx_stateIn + thread * 50 + sub + 16);
+	#pragma unroll 2
+	for(int j = 0; j < 2; ++j)
+		((uint64_t*)text)[j] = loadGlobal64<uint64_t>(d_ctx_state + j);
+
+	__syncthreads();
 
-#if( __CUDA_ARCH__ < 300 )
+#if(__CUDA_ARCH__ < 300)
 	extern __shared__ uint32_t shuffleMem[];
-	volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFF8));
+	volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x & 0xFFFFFFF8));
 #else
 	volatile uint32_t* sPtr = NULL;
 #endif
 
-	for ( int i = start; i < end; i += 32 )
+	for(int i = start; i < end; i += 32)
 	{
-		#pragma unroll
-		for ( int j = 0; j < 4; ++j )
-			text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];
+		uint32_t tmp[4];
+		((ulonglong2*)(tmp))[0] =  ((ulonglong2*)(long_state + i))[0];
+		#pragma unroll 4
+		for(int j = 0; j < 4; ++j)
+			text[j] ^= tmp[j];
 
-		cn_aes_pseudo_round_mut( sharedMemory, text, key );
+		((uint4*)text)[0] = cn_aes_pseudo_round_mut32((uint32_t*)sharedMemory, ((uint4*)text)[0], (uint4*)key);
 
 		if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
 			ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 		{
-			#pragma unroll
-			for ( int j = 0; j < 4; ++j )
-				text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
+			uint32_t tmp[4];
+			#pragma unroll 4
+			for(int j = 0; j < 4; ++j)
+				tmp[j] = shuffle<8>(sPtr, subv, text[j], (subv + 1) & 7);
+			#pragma unroll 4
+			for(int j = 0; j < 4; ++j)
+				text[j] ^= tmp[j];
 		}
 	}
 
-	MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
+	#pragma unroll 2
+	for(int j = 0; j < 2; ++j)
+		storeGlobal64<uint64_t>(d_ctx_state + j, ((uint64_t*)text)[j]);
 }
 
-template<xmrstak_algo_id ALGO, uint32_t MEM_MODE>
+template <xmrstak_algo_id ALGO, uint32_t MEM_MODE>
 void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo)
 {
 	uint32_t MASK = algo.Mask();
 	uint32_t ITERATIONS = algo.Iter();
-	size_t MEM = algo.Mem()/4;
+	size_t MEM = algo.Mem() / 4;
 
-	dim3 grid( ctx->device_blocks );
-	dim3 block( ctx->device_threads );
-	dim3 block2( ctx->device_threads << 1 );
-	dim3 block4( ctx->device_threads << 2 );
-	dim3 block8( ctx->device_threads << 3 );
+	dim3 grid(ctx->device_blocks);
+	dim3 block(ctx->device_threads);
+	dim3 block2(ctx->device_threads << 1);
+	dim3 block4(ctx->device_threads << 2);
+	dim3 block8(ctx->device_threads << 3);
 
 	int partcount = 1 << ctx->device_bfactor;
 
@@ -823,27 +793,29 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 	 * kernel splitting if the user defined a `bfactor >= 5`
 	 */
 	int bfactorOneThree = ctx->device_bfactor - 4;
-	if( bfactorOneThree < 0 )
+	if(bfactorOneThree < 0)
 		bfactorOneThree = 0;
 
 	int partcountOneThree = 1 << bfactorOneThree;
 
-	for ( int i = 0; i < partcountOneThree; i++ )
+	for(int i = 0; i < partcountOneThree; i++)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>(
-			ITERATIONS,
-			MEM,
-			ctx->device_blocks*ctx->device_threads,
-			bfactorOneThree, i,
-			ctx->d_long_state,
-			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state),
-			ctx->d_ctx_key1 ));
-
-		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<<grid, block8>>>(
+											  ITERATIONS,
+											  MEM,
+											  ctx->device_blocks * ctx->device_threads,
+											  bfactorOneThree, i,
+											  ctx->d_long_state,
+											  (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state),
+											  ctx->d_ctx_key1));
+
+		if(partcount > 1 && ctx->device_bsleep > 0)
+			compat_usleep(ctx->device_bsleep);
 	}
-	if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
+	if(partcount > 1 && ctx->device_bsleep > 0)
+		compat_usleep(ctx->device_bsleep);
 
-	for ( int i = 0; i < partcount; i++ )
+	for(int i = 0; i < partcount; i++)
 	{
 		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 		{
@@ -856,12 +828,11 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 					block2,
 					sizeof(uint64_t) * block.x * 8 +
 						// shuffle memory for fermi gpus
-						block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-				>>>(
+						block2.x * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
 					ITERATIONS,
 					MEM,
 					MASK,
-					ctx->device_blocks*ctx->device_threads,
+					ctx->device_blocks * ctx->device_threads,
 					ctx->device_bfactor,
 					i,
 					ctx->d_long_state,
@@ -869,28 +840,24 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 					ctx->d_ctx_b,
 					ctx->d_ctx_state,
 					nonce,
-					ctx->d_input
-				)
-			);
+					ctx->d_input));
 		}
 		else if(ALGO == cryptonight_r_wow || ALGO == cryptonight_r)
 		{
-			int numThreads = ctx->device_blocks*ctx->device_threads;
+			int numThreads = ctx->device_blocks * ctx->device_threads;
 			void* args[] = {
 				&ITERATIONS, &MEM, &MASK,
 				&numThreads, &ctx->device_bfactor, &i,
-				&ctx->d_long_state, &ctx->d_ctx_a, &ctx->d_ctx_b, &ctx->d_ctx_state, &nonce, &ctx->d_input
-			};
+				&ctx->d_long_state, &ctx->d_ctx_a, &ctx->d_ctx_b, &ctx->d_ctx_state, &nonce, &ctx->d_input};
 			CU_CHECK(ctx->device_id, cuLaunchKernel(
-				ctx->kernel,
-				grid.x, grid.y, grid.z,
-				block2.x, block2.y, block2.z,
-				sizeof(uint64_t) * block.x * 8 +
-						// shuffle memory for fermi gpus
-						block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ),
-				nullptr,
-				args, 0
-			));
+										 ctx->kernel,
+										 grid.x, grid.y, grid.z,
+										 block2.x, block2.y, block2.z,
+										 sizeof(uint64_t) * block.x * 8 +
+											 // shuffle memory for fermi gpus
+											 block2.x * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3),
+										 nullptr,
+										 args, 0));
 			CU_CHECK(ctx->device_id, cuCtxSynchronize());
 		}
 		else
@@ -901,12 +868,11 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 				cryptonight_core_gpu_phase2_quad<ALGO><<<
 					grid,
 					block4,
-					block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-				>>>(
+					block4.x * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
 					ITERATIONS,
 					MEM,
 					MASK,
-					ctx->device_blocks*ctx->device_threads,
+					ctx->device_blocks * ctx->device_threads,
 					ctx->device_bfactor,
 					i,
 					ctx->d_long_state,
@@ -914,57 +880,61 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 					ctx->d_ctx_b,
 					ctx->d_ctx_state,
 					nonce,
-					ctx->d_input
-				)
-			);
+					ctx->d_input));
 		}
 
-		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
+		if(partcount > 1 && ctx->device_bsleep > 0)
+			compat_usleep(ctx->device_bsleep);
 	}
 
 	int roundsPhase3 = partcountOneThree;
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven|| ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast )
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		// cryptonight_heavy used two full rounds over the scratchpad memory
 		roundsPhase3 *= 2;
 	}
 
-	for ( int i = 0; i < roundsPhase3; i++ )
+	int blockSizePhase3 = block8.x;
+	int gridSizePhase3 = grid.x;
+	if(blockSizePhase3 * 2 <= ctx->device_maxThreadsPerBlock)
+	{
+		blockSizePhase3 *= 2;
+		gridSizePhase3 = (gridSizePhase3 + 1) / 2;
+	}
+	for(int i = 0; i < roundsPhase3; i++)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ALGO><<<
-			grid,
-			block8,
-			block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-		>>>(
-			ITERATIONS,
-			MEM,
-			ctx->device_blocks*ctx->device_threads,
-			bfactorOneThree, i,
-			ctx->d_long_state,
-			ctx->d_ctx_state, ctx->d_ctx_key2 ));
+											  gridSizePhase3,
+											  blockSizePhase3,
+											  blockSizePhase3 * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
+											  ITERATIONS,
+											  MEM,
+											  ctx->device_blocks * ctx->device_threads,
+											  bfactorOneThree, i,
+											  ctx->d_long_state,
+											  ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 }
 
-template<xmrstak_algo_id ALGO, uint32_t MEM_MODE>
+template <xmrstak_algo_id ALGO, uint32_t MEM_MODE>
 void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo)
 {
 	const uint32_t MASK = algo.Mask();
 	const uint32_t ITERATIONS = algo.Iter();
 	const size_t MEM = algo.Mem();
 
-	dim3 grid( ctx->device_blocks );
-	dim3 block( ctx->device_threads );
-	dim3 block2( ctx->device_threads << 1 );
-	dim3 block4( ctx->device_threads << 2 );
-	dim3 block8( ctx->device_threads << 3 );
+	dim3 grid(ctx->device_blocks);
+	dim3 block(ctx->device_threads);
+	dim3 block2(ctx->device_threads << 1);
+	dim3 block4(ctx->device_threads << 2);
+	dim3 block8(ctx->device_threads << 3);
 
 	size_t intensity = ctx->device_blocks * ctx->device_threads;
 
 	CUDA_CHECK_KERNEL(
 		ctx->device_id,
-		xmrstak::nvidia::cn_explode_gpu<<<intensity,32>>>(MEM, (int*)ctx->d_ctx_state, (int*)ctx->d_long_state)
-	);
+		xmrstak::nvidia::cn_explode_gpu<<<intensity, 128>>>(MEM, (int*)ctx->d_ctx_state, (int*)ctx->d_long_state));
 
 	int partcount = 1 << ctx->device_bfactor;
 	for(int i = 0; i < partcount; i++)
@@ -972,54 +942,57 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_
 		CUDA_CHECK_KERNEL(
 			ctx->device_id,
 			// 36 x 16byte x numThreads
-			xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu
-				<<<ctx->device_blocks, ctx->device_threads * 16,  32 * 16 * ctx->device_threads>>>
-				(
-					ITERATIONS,
-					MEM,
-					MASK,
-					(int*)ctx->d_ctx_state,
-					(int*)ctx->d_long_state,
-					ctx->device_bfactor,
-					i,
-					ctx->d_ctx_a,
-					ctx->d_ctx_b
-				)
-		);
+			xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu<<<ctx->device_blocks, ctx->device_threads * 16, 33 * 16 * ctx->device_threads>>>(
+				ITERATIONS,
+				MEM,
+				MASK,
+				(int*)ctx->d_ctx_state,
+				(int*)ctx->d_long_state,
+				ctx->device_bfactor,
+				i,
+				ctx->d_ctx_a,
+				ctx->d_ctx_b));
 	}
 
 	/* bfactor for phase 3
 	 *
 	 * 3 consume less time than phase 2, therefore we begin with the
-	 * kernel splitting if the user defined a `bfactor >= 5`
+	 * kernel splitting if the user defined a `bfactor >= 9`
 	 */
-	int bfactorOneThree = ctx->device_bfactor - 4;
-	if( bfactorOneThree < 0 )
+	int bfactorOneThree = ctx->device_bfactor - 8;
+	if(bfactorOneThree < 0)
 		bfactorOneThree = 0;
 
 	int partcountOneThree = 1 << bfactorOneThree;
 	int roundsPhase3 = partcountOneThree;
 
 	if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
-		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast )
+		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		// cryptonight_heavy used two full rounds over the scratchpad memory
 		roundsPhase3 *= 2;
 	}
 
-	for ( int i = 0; i < roundsPhase3; i++ )
+	int blockSizePhase3 = block8.x;
+	int gridSizePhase3 = grid.x;
+	if(blockSizePhase3 * 2 <= ctx->device_maxThreadsPerBlock)
+	{
+		blockSizePhase3 *= 2;
+		gridSizePhase3 = (gridSizePhase3 + 1) / 2;
+	}
+
+	for(int i = 0; i < roundsPhase3; i++)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ALGO><<<
-			grid,
-			block8,
-			block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-		>>>(
-			ITERATIONS,
-			MEM/4,
-			ctx->device_blocks*ctx->device_threads,
-			bfactorOneThree, i,
-			ctx->d_long_state,
-			ctx->d_ctx_state, ctx->d_ctx_key2 ));
+											  gridSizePhase3,
+											  blockSizePhase3,
+											  blockSizePhase3 * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
+											  ITERATIONS,
+											  MEM / 4,
+											  ctx->device_blocks * ctx->device_threads,
+											  bfactorOneThree, i,
+											  ctx->d_long_state,
+											  ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 }
 
@@ -1030,7 +1003,7 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui
 	{
 		if(ctx->kernel_height != chain_height || ctx->cached_algo != miner_algo)
 		{
-			 if(ctx->module)
+			if(ctx->module)
 				cuModuleUnload(ctx->module);
 
 			uint32_t PRECOMPILATION_DEPTH = 4;
@@ -1045,15 +1018,16 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui
 			ctx->kernel_height = chain_height;
 			ctx->cached_algo = miner_algo;
 
-			for (int i = 1; i <= PRECOMPILATION_DEPTH; ++i)
+			for(int i = 1; i <= PRECOMPILATION_DEPTH; ++i)
 				xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo,
 					chain_height + i, PRECOMPILATION_DEPTH, ctx->device_arch[0], ctx->device_arch[1], true);
 		}
 	}
 
-	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo);
+	typedef void (*cuda_hash_fn)(nvid_ctx * ctx, uint32_t nonce, const xmrstak_algo& algo);
 
-	if(miner_algo == invalid_algo) return;
+	if(miner_algo == invalid_algo)
+		return;
 
 	static const cuda_hash_fn func_table[] = {
 		cryptonight_core_gpu_hash<cryptonight, 0>,
@@ -1105,13 +1079,11 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui
 		cryptonight_core_gpu_hash<cryptonight_r, 1>,
 
 		cryptonight_core_gpu_hash<cryptonight_v8_reversewaltz, 0>,
-		cryptonight_core_gpu_hash<cryptonight_v8_reversewaltz, 1>
-	};
+		cryptonight_core_gpu_hash<cryptonight_v8_reversewaltz, 1>};
 
 	std::bitset<1> digit;
 	digit.set(0, ctx->memMode == 1);
 
-	cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ];
+	cuda_hash_fn selected_function = func_table[((miner_algo - 1u) << 1) | digit.to_ulong()];
 	selected_function(ctx, startNonce, miner_algo);
-
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
index fee7e13d1..516d4ca00 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
@@ -1,11 +1,63 @@
 #pragma once
 
+#include <cstdint>
 #include <cuda_runtime.h>
 #include <stdio.h>
-#include <cstdint>
 
-#include "cuda_keccak.hpp"
 #include "cuda_extra.hpp"
+#include "cuda_keccak.hpp"
+
+template <typename T>
+__device__ __forceinline__ T loadGlobal64(T* const addr)
+{
+#if(__CUDA_ARCH__ < 700)
+	T x;
+	asm volatile("ld.global.cg.u64 %0, [%1];"
+				 : "=l"(x)
+				 : "l"(addr));
+	return x;
+#else
+	return *addr;
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T loadGlobal32(T* const addr)
+{
+#if(__CUDA_ARCH__ < 700)
+	T x;
+	asm volatile("ld.global.cg.u32 %0, [%1];"
+				 : "=r"(x)
+				 : "l"(addr));
+	return x;
+#else
+	return *addr;
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ void storeGlobal32(T* addr, T const& val)
+{
+#if(__CUDA_ARCH__ < 700)
+	asm volatile("st.global.cg.u32 [%0], %1;"
+				 :
+				 : "l"(addr), "r"(val));
+#else
+	*addr = val;
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ void storeGlobal64(T* addr, T const& val)
+{
+#if(__CUDA_ARCH__ < 700)
+	asm volatile("st.global.cg.u64 [%0], %1;"
+				 :
+				 : "l"(addr), "l"(val));
+#else
+	*addr = val;
+#endif
+}
 
 namespace xmrstak
 {
@@ -15,7 +67,7 @@ namespace nvidia
 struct __m128i : public int4
 {
 
-	__forceinline__ __device__ __m128i(){}
+	__forceinline__ __device__ __m128i() {}
 
 	__forceinline__ __device__ __m128i(
 		const uint32_t x0, const uint32_t x1,
@@ -27,7 +79,7 @@ struct __m128i : public int4
 		w = x3;
 	}
 
-	__forceinline__ __device__ __m128i( const int x0)
+	__forceinline__ __device__ __m128i(const int x0)
 	{
 		x = x0;
 		y = x0;
@@ -41,8 +93,7 @@ struct __m128i : public int4
 			x | other.x,
 			y | other.y,
 			z | other.z,
-			w | other.w
-		);
+			w | other.w);
 	}
 
 	__forceinline__ __device__ __m128i operator^(const __m128i& other)
@@ -51,15 +102,14 @@ struct __m128i : public int4
 			x ^ other.x,
 			y ^ other.y,
 			z ^ other.z,
-			w ^ other.w
-		);
+			w ^ other.w);
 	}
 };
 
 struct __m128 : public float4
 {
 
-	__forceinline__ __device__ __m128(){}
+	__forceinline__ __device__ __m128() {}
 
 	__forceinline__ __device__ __m128(
 		const float x0, const float x1,
@@ -71,7 +121,7 @@ struct __m128 : public float4
 		float4::w = x3;
 	}
 
-	__forceinline__ __device__ __m128( const float x0)
+	__forceinline__ __device__ __m128(const float x0)
 	{
 		float4::x = x0;
 		float4::y = x0;
@@ -79,7 +129,7 @@ struct __m128 : public float4
 		float4::w = x0;
 	}
 
-	__forceinline__ __device__ __m128( const __m128i& x0)
+	__forceinline__ __device__ __m128(const __m128i& x0)
 	{
 		float4::x = int2float(x0.x);
 		float4::y = int2float(x0.y);
@@ -87,14 +137,13 @@ struct __m128 : public float4
 		float4::w = int2float(x0.w);
 	}
 
-	__forceinline__ __device__ __m128i get_int( )
+	__forceinline__ __device__ __m128i get_int()
 	{
 		return __m128i(
 			(int)x,
 			(int)y,
 			(int)z,
-			(int)w
-		);
+			(int)w);
 	}
 
 	__forceinline__ __device__ __m128 operator+(const __m128& other)
@@ -103,8 +152,7 @@ struct __m128 : public float4
 			x + other.x,
 			y + other.y,
 			z + other.z,
-			w + other.w
-		);
+			w + other.w);
 	}
 
 	__forceinline__ __device__ __m128 operator-(const __m128& other)
@@ -113,8 +161,7 @@ struct __m128 : public float4
 			x - other.x,
 			y - other.y,
 			z - other.z,
-			w - other.w
-		);
+			w - other.w);
 	}
 
 	__forceinline__ __device__ __m128 operator*(const __m128& other)
@@ -123,8 +170,7 @@ struct __m128 : public float4
 			x * other.x,
 			y * other.y,
 			z * other.z,
-			w * other.w
-		);
+			w * other.w);
 	}
 
 	__forceinline__ __device__ __m128 operator/(const __m128& other)
@@ -133,67 +179,64 @@ struct __m128 : public float4
 			x / other.x,
 			y / other.y,
 			z / other.z,
-			w / other.w
-		);
+			w / other.w);
 	}
 
 	__forceinline__ __device__ __m128& trunc()
 	{
-		x=::truncf(x);
-		y=::truncf(y);
-		z=::truncf(z);
-		w=::truncf(w);
+		x = ::truncf(x);
+		y = ::truncf(y);
+		z = ::truncf(z);
+		w = ::truncf(w);
 
 		return *this;
 	}
 
 	__forceinline__ __device__ __m128& abs()
 	{
-		x=::fabsf(x);
-		y=::fabsf(y);
-		z=::fabsf(z);
-		w=::fabsf(w);
+		x = ::fabsf(x);
+		y = ::fabsf(y);
+		z = ::fabsf(z);
+		w = ::fabsf(w);
 
 		return *this;
 	}
 
 	__forceinline__ __device__ __m128& floor()
 	{
-		x=::floorf(x);
-		y=::floorf(y);
-		z=::floorf(z);
-		w=::floorf(w);
+		x = ::floorf(x);
+		y = ::floorf(y);
+		z = ::floorf(z);
+		w = ::floorf(w);
 
 		return *this;
 	}
 };
 
-
-template<typename T>
+template <typename T>
 __device__ void print(const char* name, T value)
 {
 	printf("g %s: ", name);
 	for(int i = 0; i < 4; ++i)
 	{
-		printf("%08X ",((uint32_t*)&value)[i]);
+		printf("%08X ", ((uint32_t*)&value)[i]);
 	}
 	printf("\n");
 }
 
-template<>
+template <>
 __device__ void print<__m128>(const char* name, __m128 value)
 {
 	printf("g %s: ", name);
 	for(int i = 0; i < 4; ++i)
 	{
-		printf("%f ",((float*)&value)[i]);
+		printf("%f ", ((float*)&value)[i]);
 	}
 	printf("\n");
 }
 
 #define SHOW(name) print(#name, name)
 
-
 __forceinline__ __device__ __m128 _mm_add_ps(__m128 a, __m128 b)
 {
 	return a + b;
@@ -220,8 +263,7 @@ __forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b)
 		int_as_float(float_as_int(a.x) & b),
 		int_as_float(float_as_int(a.y) & b),
 		int_as_float(float_as_int(a.z) & b),
-		int_as_float(float_as_int(a.w) & b)
-	);
+		int_as_float(float_as_int(a.w) & b));
 }
 
 __forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b)
@@ -230,8 +272,7 @@ __forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b)
 		int_as_float(float_as_int(a.x) | b),
 		int_as_float(float_as_int(a.y) | b),
 		int_as_float(float_as_int(a.z) | b),
-		int_as_float(float_as_int(a.w) | b)
-	);
+		int_as_float(float_as_int(a.w) | b));
 }
 
 __forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b)
@@ -240,20 +281,18 @@ __forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b)
 		int_as_float(float_as_int(a.x) ^ b),
 		int_as_float(float_as_int(a.y) ^ b),
 		int_as_float(float_as_int(a.z) ^ b),
-		int_as_float(float_as_int(a.w) ^ b)
-	);
+		int_as_float(float_as_int(a.w) ^ b));
 }
 
 __forceinline__ __device__ __m128 _mm_fmod_ps(__m128 v, float dc)
 {
 	__m128 d(dc);
 	__m128 c = _mm_div_ps(v, d);
-	c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
+	c.trunc(); //_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
 	// c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2
 	c = _mm_mul_ps(c, d);
 	return _mm_sub_ps(v, c);
 
-
 	//return a.fmodf(b);
 }
 
@@ -262,23 +301,20 @@ __forceinline__ __device__ __m128i _mm_xor_si128(__m128i a, __m128i b)
 	return a ^ b;
 }
 
-
 __forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot)
 {
 	const uint32_t right = 8 * rot;
 	const uint32_t left = (32 - 8 * rot);
 	return __m128i(
-		((uint32_t)a.x >> right) | ( a.y << left ),
-		((uint32_t)a.y >> right) | ( a.z << left ),
-		((uint32_t)a.z >> right) | ( a.w << left ),
-		((uint32_t)a.w >> right) | ( a.x << left )
-	);
+		((uint32_t)a.x >> right) | (a.y << left),
+		((uint32_t)a.y >> right) | (a.z << left),
+		((uint32_t)a.z >> right) | (a.w << left),
+		((uint32_t)a.w >> right) | (a.x << left));
 }
 
-__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad, const uint32_t MASK) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); }
+__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int* lpad, const uint32_t MASK) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); }
 
-
-__forceinline__ __device__  __m128 fma_break(__m128 x)
+__forceinline__ __device__ __m128 fma_break(__m128 x)
 {
 	// Break the dependency chain by setitng the exp to ?????01
 	x = _mm_and_ps(x, 0xFEFFFFFF);
@@ -290,13 +326,13 @@ __forceinline__ __device__ void sub_round(__m128 n0, __m128 n1, __m128 n2, __m12
 {
 	n1 = _mm_add_ps(n1, c);
 	__m128 nn = _mm_mul_ps(n0, c);
-	nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));
+	nn = _mm_mul_ps(n1, _mm_mul_ps(nn, nn));
 	nn = fma_break(nn);
 	n = _mm_add_ps(n, nn);
 
 	n3 = _mm_sub_ps(n3, c);
 	__m128 dd = _mm_mul_ps(n2, c);
-	dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));
+	dd = _mm_mul_ps(n3, _mm_mul_ps(dd, dd));
 	dd = fma_break(dd);
 	d = _mm_add_ps(d, dd);
 
@@ -326,7 +362,7 @@ __forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, _
 	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
 	d = _mm_and_ps(d, 0xFF7FFFFF);
 	d = _mm_or_ps(d, 0x40000000);
-	r =_mm_add_ps(r, _mm_div_ps(n,d));
+	r = _mm_add_ps(r, _mm_div_ps(n, d));
 }
 
 // 74*8 = 595
@@ -335,15 +371,14 @@ __forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n
 	__m128 c(cnt);
 	// 35 maths calls follow (140 FLOPS)
 	__m128 r = __m128(0.0f);
-	for(int i=0; i< 4; ++i)
+	for(int i = 0; i < 4; ++i)
 		round_compute(n0, n1, n2, n3, rnd_c, c, r);
 	// do a quick fmod by setting exp to 2
 	r = _mm_and_ps(r, 0x807FFFFF);
 	r = _mm_or_ps(r, 0x40000000);
-	sum = r; // 34
+	sum = r;								 // 34
 	r = _mm_mul_ps(r, __m128(536870880.0f)); // 35
 	return r.get_int();
-
 }
 
 __forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
@@ -376,8 +411,7 @@ __constant__ uint32_t look[16][4] = {
 	{3, 1, 2, 0},
 	{3, 2, 0, 1},
 	{3, 0, 1, 2},
-	{3, 0, 2, 1}
-};
+	{3, 0, 2, 1}};
 
 __constant__ float ccnt[16] = {
 	1.34375f,
@@ -398,31 +432,30 @@ __constant__ float ccnt[16] = {
 	1.3203125f,
 	1.3515625f,
 	1.3359375f,
-	1.4609375f
-};
-
+	1.4609375f};
 
 __forceinline__ __device__ void sync()
 {
-#if (__CUDACC_VER_MAJOR__ >= 9)
+#if(__CUDACC_VER_MAJOR__ >= 9)
 	__syncwarp();
 #else
-	__syncthreads( );
+	__syncthreads();
 #endif
 }
 
 struct SharedMemChunk
 {
 	__m128i out[16];
-	__m128 va[16];
+	__m128 va[17];
 };
 
+__launch_bounds__(128, 8)
 __global__ void cryptonight_core_gpu_phase2_gpu(
-	const uint32_t ITERATIONS,  const size_t MEMORY, const uint32_t MASK,
-	int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS)
+	const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK,
+	int32_t* spad, int* lpad_in, int bfactor, int partidx, uint32_t* roundVs, uint32_t* roundS)
 {
 
-	const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
+	const int batchsize = (ITERATIONS * 2) >> (1 + bfactor);
 
 	extern __shared__ SharedMemChunk smemExtern_in[];
 
@@ -435,7 +468,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
 
 	uint32_t tid = threadIdx.x % 16;
 
-	const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16;
+	const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x / 16;
 	uint32_t s = 0;
 
 	__m128 vs(0);
@@ -455,10 +488,10 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
 	const uint32_t tidm = tid % 4;
 	const uint32_t block = tidd * 16 + tidm;
 
-	for(size_t i = 0; i < batchsize; i++)
+	for(int i = 0; i < batchsize; i++)
 	{
 		sync();
-		int tmp = ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm];
+		int tmp = loadGlobal32<int>( ((int*)scratchpad_ptr(s, tidd, lpad, MASK)) + tidm );
 		((int*)smem->out)[tid] = tmp;
 		sync();
 
@@ -470,8 +503,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
 			*(smem->out + look[tid][2]),
 			*(smem->out + look[tid][3]),
 			ccnt[tid], rc, smem->va[tid],
-			smem->out[tid]
-		);
+			smem->out[tid]);
 
 		sync();
 
@@ -479,11 +511,11 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
 		for(uint32_t dd = block + 4; dd < (tidd + 1) * 16; dd += 4)
 			outXor ^= ((int*)smem->out)[dd];
 
-		((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm] = outXor ^ tmp;
+		storeGlobal32( ((int*)scratchpad_ptr(s, tidd, lpad, MASK)) + tidm, outXor ^ tmp );
 		((int*)smem->out)[tid] = outXor;
 
 		float va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4];
-		float va_tmp2 = ((float*)smem->va)[block+ 8] + ((float*)smem->va)[block + 12];
+		float va_tmp2 = ((float*)smem->va)[block + 8] + ((float*)smem->va)[block + 12];
 		((float*)smem->va)[tid] = va_tmp1 + va_tmp2;
 
 		sync();
@@ -505,10 +537,10 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
 		vs = _mm_div_ps(vs, __m128(64.0f));
 		s = out2.x ^ out2.y ^ out2.z ^ out2.w;
 	}
-	if(partidx != ((1<<bfactor) - 1) && threadIdx.x % 16 == 0)
+	if(partidx != ((1 << bfactor) - 1) && threadIdx.x % 16 == 0)
 	{
 		const uint32_t numHashPerBlock2 = blockDim.x / 16;
-		const uint32_t idxHash2 = blockIdx.x * numHashPerBlock2 + threadIdx.x/16;
+		const uint32_t idxHash2 = blockIdx.x * numHashPerBlock2 + threadIdx.x / 16;
 		((__m128*)roundVs)[idxHash2] = vs;
 		roundS[idxHash2] = s;
 	}
@@ -519,30 +551,29 @@ __forceinline__ __device__ void generate_512(uint64_t idx, const uint64_t* in, u
 	uint64_t hash[25];
 
 	hash[0] = in[0] ^ idx;
-	#pragma unroll 24
+#pragma unroll 24
 	for(int i = 1; i < 25; ++i)
 		hash[i] = in[i];
 
 	cn_keccakf2(hash);
-	#pragma unroll 10
+#pragma unroll 10
 	for(int i = 0; i < 10; ++i)
 		((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i];
-	out+=160;
+	out += 160;
 
 	cn_keccakf2(hash);
-	#pragma unroll 11
+#pragma unroll 11
 	for(int i = 0; i < 11; ++i)
 		((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i];
-	out+=176;
+	out += 176;
 
 	cn_keccakf2(hash);
-	#pragma unroll 11
+#pragma unroll 11
 	for(int i = 0; i < 11; ++i)
 		((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i];
 }
 
-
-__global__ void cn_explode_gpu(const size_t MEMORY, int32_t *spad_in, int *lpad_in)
+__global__ void cn_explode_gpu(const size_t MEMORY, int32_t* spad_in, int* lpad_in)
 {
 	__shared__ uint64_t state[25];
 
@@ -550,15 +581,18 @@ __global__ void cn_explode_gpu(const size_t MEMORY, int32_t *spad_in, int *lpad_
 	uint64_t* spad = (uint64_t*)((uint8_t*)spad_in + blockIdx.x * 200);
 
 	for(int i = threadIdx.x; i < 25; i += blockDim.x)
-		state[i] = spad[i];
+		state[i] = loadGlobal64<uint64_t>(spad + i);
 
-	sync();
+	if(blockDim.x > 32)
+		__syncthreads();
+	else
+		sync();
 
-	for(uint64_t i = threadIdx.x; i < MEMORY / 512; i+=blockDim.x)
+	for(uint64_t i = threadIdx.x; i < MEMORY / 512; i += blockDim.x)
 	{
-		generate_512(i, state, (uint8_t*)lpad + i*512);
+		generate_512(i, state, (uint8_t*)lpad + i * 512);
 	}
 }
 
-} // namespace xmrstak
 } // namespace nvidia
+} // namespace xmrstak
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt
index bcf495080..214114c7e 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt
@@ -462,10 +462,10 @@ __global__ void CryptonightR_phase2(
     uint64_t bx0             = ((uint64_t*)(d_ctx_b + thread * 16))[sub];
     uint64_t bx1             = ((uint64_t*)(d_ctx_b + thread * 16 + 4))[sub];
 
-    uint32_t r0 = d_ctx_b[thread * 16 + 4 * 2];
-    uint32_t r1 = d_ctx_b[thread * 16 + 4 * 2 + 1];
-    uint32_t r2 = d_ctx_b[thread * 16 + 4 * 2 + 2];
-    uint32_t r3 = d_ctx_b[thread * 16 + 4 * 2 + 3];
+    volatile uint32_t r0 = d_ctx_b[thread * 16 + 4 * 2];
+    volatile uint32_t r1 = d_ctx_b[thread * 16 + 4 * 2 + 1];
+    volatile uint32_t r2 = d_ctx_b[thread * 16 + 4 * 2 + 2];
+    volatile uint32_t r3 = d_ctx_b[thread * 16 + 4 * 2 + 3];
 
     const int batchsize      = (ITERATIONS * 2) >> ( 1 + bfactor );
     const int start          = partidx * batchsize;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
index 96cb679f5..48ebe4bd7 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
@@ -2,8 +2,8 @@
 #pragma once
 
 #include <cuda_runtime.h>
-#include <stdexcept>
 #include <iostream>
+#include <stdexcept>
 #include <string>
 
 /** execute and check a CUDA api command
@@ -12,27 +12,30 @@
  * @param msg message string which should be added to the error message
  * @param ... CUDA api command
  */
-#define CUDA_CHECK_MSG(id, msg, ...) { \
-	cudaError_t error = __VA_ARGS__; \
-	if(error!=cudaSuccess){	\
-		std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__; \
-		std::cerr << msg << std::endl;                                         \
-		throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \
-	} \
-} \
-( (void) 0 )
-
-#define CU_CHECK(id, ...) {                                                                             \
-    CUresult result = __VA_ARGS__;                                                                      \
-    if(result != CUDA_SUCCESS){                                                                         \
-        const char* s;                                                                                  \
-        cuGetErrorString(result, &s);                                                                   \
-        std::cerr << "[CUDA] Error gpu " << id << ": <" << __FUNCTION__ << ">:" << __LINE__ << " \"" << (s ? s : "unknown error") << "\"" << std::endl; \
-        throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(s ? s : "unknown error")); \
-    }                                                                                                   \
-}                                                                                                       \
-( (void) 0 )
+#define CUDA_CHECK_MSG(id, msg, ...)                                                                          \
+	{                                                                                                         \
+		cudaError_t error = __VA_ARGS__;                                                                      \
+		if(error != cudaSuccess)                                                                              \
+		{                                                                                                     \
+			std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__;                  \
+			std::cerr << msg << std::endl;                                                                    \
+			throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \
+		}                                                                                                     \
+	}                                                                                                         \
+	((void)0)
 
+#define CU_CHECK(id, ...)                                                                                                                                   \
+	{                                                                                                                                                       \
+		CUresult result = __VA_ARGS__;                                                                                                                      \
+		if(result != CUDA_SUCCESS)                                                                                                                          \
+		{                                                                                                                                                   \
+			const char* s;                                                                                                                                  \
+			cuGetErrorString(result, &s);                                                                                                                   \
+			std::cerr << "[CUDA] Error gpu " << id << ": <" << __FUNCTION__ << ">:" << __LINE__ << " \"" << (s ? s : "unknown error") << "\"" << std::endl; \
+			throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(s ? s : "unknown error"));                                                 \
+		}                                                                                                                                                   \
+	}                                                                                                                                                       \
+	((void)0)
 
 /** execute and check a CUDA api command
  *
@@ -47,7 +50,7 @@
  * @param ... CUDA kernel call
  */
 #define CUDA_CHECK_KERNEL(id, ...) \
-	__VA_ARGS__; \
+	__VA_ARGS__;                   \
 	CUDA_CHECK(id, cudaGetLastError())
 
 /** execute and check a CUDA kernel
@@ -57,5 +60,5 @@
  * @param ... CUDA kernel call
  */
 #define CUDA_CHECK_MSG_KERNEL(id, msg, ...) \
-	__VA_ARGS__; \
+	__VA_ARGS__;                            \
 	CUDA_CHECK_MSG(id, msg, cudaGetLastError())
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index b6e41c619..d5b292cb4 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -1,83 +1,80 @@
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-#include <sstream>
+#include "xmrstak/jconf.hpp"
 #include <algorithm>
-#include <vector>
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include  <algorithm>
-#include "xmrstak/jconf.hpp"
-
+#include <sstream>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <vector>
 
 typedef unsigned char BitSequence;
 typedef unsigned long long DataLength;
 
-#include "xmrstak/backend/cryptonight.hpp"
 #include "cryptonight.hpp"
-#include "cuda_extra.hpp"
-#include "cuda_keccak.hpp"
+#include "cuda_aes.hpp"
 #include "cuda_blake.hpp"
+#include "cuda_device.hpp"
+#include "cuda_extra.hpp"
 #include "cuda_groestl.hpp"
 #include "cuda_jh.hpp"
+#include "cuda_keccak.hpp"
 #include "cuda_skein.hpp"
-#include "cuda_device.hpp"
-#include "cuda_aes.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
 
-__constant__ uint8_t d_sub_byte[16][16] ={
-	{0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 },
-	{0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 },
-	{0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 },
-	{0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 },
-	{0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 },
-	{0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf },
-	{0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 },
-	{0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 },
-	{0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 },
-	{0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb },
-	{0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 },
-	{0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 },
-	{0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a },
-	{0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e },
-	{0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf },
-	{0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }
-};
-
-__device__ __forceinline__ void cryptonight_aes_set_key( uint32_t * __restrict__ key, const uint32_t * __restrict__ data )
+__constant__ uint8_t d_sub_byte[16][16] = {
+	{0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76},
+	{0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0},
+	{0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15},
+	{0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75},
+	{0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84},
+	{0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf},
+	{0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8},
+	{0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2},
+	{0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73},
+	{0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb},
+	{0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79},
+	{0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08},
+	{0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a},
+	{0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e},
+	{0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf},
+	{0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}};
+
+__device__ __forceinline__ void cryptonight_aes_set_key(uint32_t* __restrict__ key, const uint32_t* __restrict__ data)
 {
 	int i, j;
 	uint8_t temp[4];
-	const uint32_t aes_gf[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 };
+	const uint32_t aes_gf[] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36};
 
-	MEMSET4( key, 0, 40 );
-	MEMCPY4( key, data, 8 );
+	MEMSET4(key, 0, 40);
+	MEMCPY4(key, data, 8);
 
 #pragma unroll
-	for ( i = 8; i < 40; i++ )
+	for(i = 8; i < 40; i++)
 	{
-		*(uint32_t *) temp = key[i - 1];
-		if ( i % 8 == 0 )
+		*(uint32_t*)temp = key[i - 1];
+		if(i % 8 == 0)
 		{
-			*(uint32_t *) temp = ROTR32( *(uint32_t *) temp, 8 );
-			for ( j = 0; j < 4; j++ )
-				temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f];
-			*(uint32_t *) temp ^= aes_gf[i / 8 - 1];
+			*(uint32_t*)temp = ROTR32(*(uint32_t*)temp, 8);
+			for(j = 0; j < 4; j++)
+				temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f];
+			*(uint32_t*)temp ^= aes_gf[i / 8 - 1];
 		}
 		else
 		{
-			if ( i % 8 == 4 )
+			if(i % 8 == 4)
 			{
 #pragma unroll
-				for ( j = 0; j < 4; j++ )
-					temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f];
+				for(j = 0; j < 4; j++)
+					temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f];
 			}
 		}
 
-		key[i] = key[( i - 8 )] ^ *(uint32_t *) temp;
+		key[i] = key[(i - 8)] ^ *(uint32_t*)temp;
 	}
 }
 
-__device__ __forceinline__ void mix_and_propagate( uint32_t* state )
+__device__ __forceinline__ void mix_and_propagate(uint32_t* state)
 {
 	uint32_t tmp0[4];
 	for(size_t x = 0; x < 4; ++x)
@@ -93,18 +90,18 @@ __device__ __forceinline__ void mix_and_propagate( uint32_t* state )
 		(state + 4 * 7)[x] = (state + 4 * 7)[x] ^ tmp0[x];
 }
 
-template<xmrstak_algo_id ALGO>
-__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_state2, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 )
+template <xmrstak_algo_id ALGO>
+__global__ void cryptonight_extra_gpu_prepare(int threads, uint32_t* __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t* __restrict__ d_ctx_state, uint32_t* __restrict__ d_ctx_state2, uint32_t* __restrict__ d_ctx_a, uint32_t* __restrict__ d_ctx_b, uint32_t* __restrict__ d_ctx_key1, uint32_t* __restrict__ d_ctx_key2)
 {
-	int thread = ( blockDim.x * blockIdx.x + threadIdx.x );
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	__shared__ uint32_t sharedMemory[1024];
 
 	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
-		cn_aes_gpu_init( sharedMemory );
-		__syncthreads( );
+		cn_aes_gpu_init(sharedMemory);
+		__syncthreads();
 	}
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
 	uint32_t ctx_state[50];
@@ -114,29 +111,29 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	uint32_t ctx_key2[40];
 	uint32_t input[32];
 
-	memcpy( input, d_input, len );
+	memcpy(input, d_input, len);
 	//*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
 	uint32_t nonce = startNonce + thread;
-	for ( int i = 0; i < sizeof (uint32_t ); ++i )
-		( ( (char *) input ) + 39 )[i] = ( (char*) ( &nonce ) )[i]; //take care of pointer alignment
+	for(int i = 0; i < sizeof(uint32_t); ++i)
+		(((char*)input) + 39)[i] = ((char*)(&nonce))[i]; //take care of pointer alignment
 
-	cn_keccak( (uint8_t *) input, len, (uint8_t *) ctx_state );
-	cryptonight_aes_set_key( ctx_key1, ctx_state );
-	cryptonight_aes_set_key( ctx_key2, ctx_state + 8 );
+	cn_keccak((uint8_t*)input, len, (uint8_t*)ctx_state);
+	cryptonight_aes_set_key(ctx_key1, ctx_state);
+	cryptonight_aes_set_key(ctx_key2, ctx_state + 8);
 
-	XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
-	XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
-	memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
+	XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a);
+	XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b);
+	memcpy(d_ctx_a + thread * 4, ctx_a, 4 * 4);
 	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 	{
-		memcpy( d_ctx_b + thread * 16, ctx_b, 4 * 4 );
+		memcpy(d_ctx_b + thread * 16, ctx_b, 4 * 4);
 		// bx1
-		XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b );
-		memcpy( d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4 );
+		XOR_BLOCKS_DST(ctx_state + 16, ctx_state + 20, ctx_b);
+		memcpy(d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4);
 		// division_result
-		memcpy( d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 2 );
+		memcpy(d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 2);
 		// sqrt_result
-		memcpy( d_ctx_b + thread * 16 + 2 * 4 + 2, ctx_state + 26, 4 * 2 );
+		memcpy(d_ctx_b + thread * 16 + 2 * 4 + 2, ctx_state + 26, 4 * 2);
 	}
 	else if(ALGO == cryptonight_r_wow || ALGO == cryptonight_r)
 	{
@@ -148,31 +145,31 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 		memcpy(d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 8);
 	}
 	else
-		memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
+		memcpy(d_ctx_b + thread * 4, ctx_b, 4 * 4);
 
-	memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 );
-	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
-	memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
+	memcpy(d_ctx_key1 + thread * 40, ctx_key1, 40 * 4);
+	memcpy(d_ctx_key2 + thread * 40, ctx_key2, 40 * 4);
+	memcpy(d_ctx_state + thread * 50, ctx_state, 50 * 4);
 
 	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 
-		for(int i=0; i < 16; i++)
+		for(int i = 0; i < 16; i++)
 		{
 			for(size_t t = 4; t < 12; ++t)
 			{
-				cn_aes_pseudo_round_mut( sharedMemory, ctx_state + 4u * t, ctx_key1 );
+				cn_aes_pseudo_round_mut(sharedMemory, ctx_state + 4u * t, ctx_key1);
 			}
 			// scipt first 4 * 128bit blocks = 4 * 4 uint32_t values
 			mix_and_propagate(ctx_state + 4 * 4);
 		}
 		// double buffer to move manipulated state into phase1
-		memcpy( d_ctx_state2 + thread * 50, ctx_state, 50 * 4 );
+		memcpy(d_ctx_state2 + thread * 50, ctx_state, 50 * 4);
 	}
 }
 
-template<xmrstak_algo_id ALGO>
-__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
+template <xmrstak_algo_id ALGO>
+__global__ void cryptonight_extra_gpu_final(int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t* __restrict__ d_res_nonce, uint32_t* __restrict__ d_ctx_state, uint32_t* __restrict__ d_ctx_key2)
 {
 	const int thread = blockDim.x * blockIdx.x + threadIdx.x;
 
@@ -181,19 +178,19 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
 		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
-		cn_aes_gpu_init( sharedMemory );
-		__syncthreads( );
+		cn_aes_gpu_init(sharedMemory);
+		__syncthreads();
 	}
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
 	int i;
-	uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50;
+	uint32_t* __restrict__ ctx_state = d_ctx_state + thread * 50;
 	uint64_t hash[4];
 	uint32_t state[50];
 
-	#pragma unroll
-	for ( i = 0; i < 50; i++ )
+#pragma unroll
+	for(i = 0; i < 50; i++)
 		state[i] = ctx_state[i];
 
 	if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
@@ -202,25 +199,25 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 		uint32_t key[40];
 
 		// load keys
-		MEMCPY8( key, d_ctx_key2 + thread * 40, 20 );
+		MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
 
-		for(int i=0; i < 16; i++)
+		for(int i = 0; i < 16; i++)
 		{
 			for(size_t t = 4; t < 12; ++t)
 			{
-				cn_aes_pseudo_round_mut( sharedMemory, state + 4u * t, key );
+				cn_aes_pseudo_round_mut(sharedMemory, state + 4u * t, key);
 			}
 			// scipt first 4 * 128bit blocks = 4 * 4 uint32_t values
 			mix_and_propagate(state + 4 * 4);
 		}
 	}
-	cn_keccakf2( (uint64_t *) state );
+	cn_keccakf2((uint64_t*)state);
 
 	if(ALGO == cryptonight_gpu)
 	{
-		if ( ((uint64_t*)state)[3] < target )
+		if(((uint64_t*)state)[3] < target)
 		{
-			uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF );
+			uint32_t idx = atomicInc(d_res_count, 0xFFFFFFFF);
 
 			if(idx < 10)
 				d_res_nonce[idx] = thread;
@@ -228,19 +225,19 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	}
 	else
 	{
-		switch ( ( (uint8_t *) state )[0] & 0x03 )
+		switch(((uint8_t*)state)[0] & 0x03)
 		{
 		case 0:
-			cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash );
+			cn_blake((const uint8_t*)state, 200, (uint8_t*)hash);
 			break;
 		case 1:
-			cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash );
+			cn_groestl((const BitSequence*)state, 200, (BitSequence*)hash);
 			break;
 		case 2:
-			cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash );
+			cn_jh((const BitSequence*)state, 200, (BitSequence*)hash);
 			break;
 		case 3:
-			cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash );
+			cn_skein((const BitSequence*)state, 200, (BitSequence*)hash);
 			break;
 		default:
 			break;
@@ -249,9 +246,9 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 		// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
 		// and expect an accurate result for target > 32-bit without implementing carries
 
-		if ( hash[3] < target )
+		if(hash[3] < target)
 		{
-			uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF );
+			uint32_t idx = atomicInc(d_res_count, 0xFFFFFFFF);
 
 			if(idx < 10)
 				d_res_nonce[idx] = thread;
@@ -259,10 +256,10 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	}
 }
 
-extern "C" void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len )
+extern "C" void cryptonight_extra_cpu_set_data(nvid_ctx* ctx, const void* data, uint32_t len)
 {
 	ctx->inputlen = len;
-	CUDA_CHECK(ctx->device_id, cudaMemcpy( ctx->d_input, data, len, cudaMemcpyHostToDevice ));
+	CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, data, len, cudaMemcpyHostToDevice));
 }
 
 extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
@@ -290,7 +287,6 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	case 3:
 		CUDA_CHECK(ctx->device_id, cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
 		break;
-
 	};
 
 	// prefer shared memory over L1 cache
@@ -314,8 +310,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() ||
 		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() ||
 		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() ||
-		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end()
-	)
+		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end())
 	{
 		// extent ctx_b to hold the state of idx0
 		ctx_b_size += sizeof(uint32_t) * wsize;
@@ -326,16 +321,14 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	{
 		ctx_b_size += sizeof(uint32_t) * 4 * wsize;
 	}
-	else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end())
-		|| (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end()))
+	else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) || (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end()))
 	{
 		// bx0 (16byte), bx1 (16byte), division_result (8byte) and sqrt_result (8byte), padding (16byte)
 		ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize;
 	}
 	else if(
 		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r) != neededAlgorithms.end() ||
-		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end()
-	)
+		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end())
 	{
 		// bx0 (16byte), bx1 (16byte), and [r0, r1, r2, r3] (a 8byte)
 		ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize;
@@ -349,9 +342,9 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, ctx_b_size));
 	// POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format
-	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 32 * sizeof (uint32_t ) ));
-	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) ));
-	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) ));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 32 * sizeof(uint32_t)));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof(uint32_t)));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof(uint32_t)));
 	CUDA_CHECK_MSG(
 		ctx->device_id,
 		"\n**suggestion: Try to reduce the value of the attribute 'threads' in the NVIDIA config file.**",
@@ -364,106 +357,102 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 	int threadsperblock = 128;
 	uint32_t wsize = ctx->device_blocks * ctx->device_threads;
 
-	dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock );
-	dim3 block( threadsperblock );
+	dim3 grid((wsize + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
 
 	if(miner_algo == cryptonight_heavy)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_heavy><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_heavy><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_haven)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_haven><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_haven><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_superfast)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_superfast><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_superfast><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_monero_v8)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_monero_v8><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_monero_v8><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_gpu)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_gpu><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_gpu><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_r)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_r><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_r><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_r_wow)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_r_wow><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_r_wow><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_v8_reversewaltz)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_v8_reversewaltz><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_v8_reversewaltz><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else
 	{
 		/* pass two times d_ctx_state because the second state is used later in phase1,
 		 * the first is used than in phase3
 		 */
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<invalid_algo><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state, ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<invalid_algo><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 }
 
-extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo)
+extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t* resnonce, const xmrstak_algo& miner_algo)
 {
 	int threadsperblock = 128;
 	uint32_t wsize = ctx->device_blocks * ctx->device_threads;
 
-	dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock );
-	dim3 block( threadsperblock );
+	dim3 grid((wsize + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
 
-	CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) ));
-	CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) ));
+	CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_nonce, 0xFF, 10 * sizeof(uint32_t)));
+	CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_count, 0, sizeof(uint32_t)));
 
 	if(miner_algo == cryptonight_heavy)
 	{
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<cryptonight_heavy><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<cryptonight_heavy><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_haven)
 	{
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<cryptonight_haven><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<cryptonight_haven><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_superfast)
 	{
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<cryptonight_superfast><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<cryptonight_superfast><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<cryptonight_bittube2><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<cryptonight_bittube2><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_gpu)
 	{
@@ -471,8 +460,7 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<cryptonight_gpu><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<cryptonight_gpu><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 	else
 	{
@@ -480,16 +468,14 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<invalid_algo><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<invalid_algo><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 
-	CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
+	CUDA_CHECK(ctx->device_id, cudaMemcpy(rescount, ctx->d_result_count, sizeof(uint32_t), cudaMemcpyDeviceToHost));
 	CUDA_CHECK_MSG(
 		ctx->device_id,
 		"\n**suggestion: Try to increase the attribute 'bfactor' in the NVIDIA config file.**",
-		cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost )
-	);
+		cudaMemcpy(resnonce, ctx->d_result_nonce, 10 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
 	/* There is only a 32bit limit for the counter on the device side
 	 * therefore this value can be greater than 10, in that case limit rescount
@@ -497,11 +483,11 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 	 */
 	if(*rescount > 10)
 		*rescount = 10;
-	for(int i=0; i < *rescount; i++)
+	for(int i = 0; i < *rescount; i++)
 		resnonce[i] += startNonce;
 }
 
-extern "C" int cuda_get_devicecount( int* deviceCount)
+extern "C" int cuda_get_devicecount(int* deviceCount)
 {
 	cudaError_t err;
 	*deviceCount = 0;
@@ -574,12 +560,13 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	ctx->device_mpcount = props.multiProcessorCount;
 	ctx->device_arch[0] = props.major;
 	ctx->device_arch[1] = props.minor;
+	ctx->device_maxThreadsPerBlock = props.maxThreadsPerBlock;
 
 	const int gpuArch = ctx->device_arch[0] * 10 + ctx->device_arch[1];
 
 	ctx->name = std::string(props.name);
 
-	printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"... ",
+	printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"...\n",
 		version / 1000, (version % 1000 / 10),
 		CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10,
 		ctx->device_id, gpuArch, ctx->device_name);
@@ -587,17 +574,17 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	std::vector<int> arch;
 #define XMRSTAK_PP_TOSTRING1(str) #str
 #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str)
-	char const * archStringList = XMRSTAK_PP_TOSTRING(XMRSTAK_CUDA_ARCH_LIST);
+	char const* archStringList = XMRSTAK_PP_TOSTRING(XMRSTAK_CUDA_ARCH_LIST);
 #undef XMRSTAK_PP_TOSTRING
 #undef XMRSTAK_PP_TOSTRING1
 	std::stringstream ss(archStringList);
 
 	//transform string list separated with `+` into a vector of integers
 	int tmpArch;
-	while ( ss >> tmpArch )
-		arch.push_back( tmpArch );
+	while(ss >> tmpArch)
+		arch.push_back(tmpArch);
 
-	#define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n"
+#define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n"
 	if(gpuArch >= 20 && gpuArch < 30)
 	{
 		// compiled binary must support sm_20 for fermi
@@ -618,7 +605,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		 *   with a sm_20 only compiled binary
 		 */
 		for(int i = 0; i < arch.size(); ++i)
-			if(arch[i] >= 30  && (minSupportedArch == 0 || arch[i] < minSupportedArch))
+			if(arch[i] >= 30 && (minSupportedArch == 0 || arch[i] < minSupportedArch))
 				minSupportedArch = arch[i];
 		if(minSupportedArch < 30 || gpuArch < minSupportedArch)
 		{
@@ -630,7 +617,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
 	bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end();
 
-
 	// set all device option those marked as auto (-1) to a valid value
 	if(ctx->device_blocks == -1)
 	{
@@ -648,6 +634,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		if(props.multiProcessorCount <= 6)
 			ctx->device_bfactor += 2;
 	}
+
+	// for most algorithms we are using 8 threads per hash
+	uint32_t threadsPerHash = 8;
+
 	if(ctx->device_threads == -1)
 	{
 		/* sm_20 devices can only run 512 threads per cuda block
@@ -656,9 +646,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		 */
 		const uint32_t maxThreadsPerBlock = props.major < 3 ? 512 : 1024;
 
-		// for the most algorithms we are using 8 threads per hash
-		uint32_t threadsPerHash = 8;
-
 		// phase2_gpu uses 16 threads per hash
 		if(useCryptonight_gpu)
 			threadsPerHash = 16;
@@ -700,7 +687,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 
 		int* tmp;
 		cudaError_t err;
-		#define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n"
+#define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n"
 		// a device must be selected to get the right memory usage later on
 		err = cudaSetDevice(ctx->device_id);
 		if(err != cudaSuccess)
@@ -716,7 +703,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			return 3;
 		}
 
-
 		size_t freeMemory = 0;
 		size_t totalMemory = 0;
 		CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -746,7 +732,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		size_t usedMem = totalMemory - freeMemory;
 		if(usedMem >= maxMemUsage)
 		{
-			printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem/byteToMiB).c_str());
+			printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem / byteToMiB).c_str());
 			return 4;
 		}
 		else
@@ -764,8 +750,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() ||
 			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() ||
 			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() ||
-			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end()
-		)
+			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end())
 			perThread += 50 * 4; // state double buffer
 
 		size_t max_intensity = limitedMemory / perThread;
@@ -805,22 +790,30 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			// 8 is chosen by checking the occupancy calculator
 			size_t blockOptimal = 8 * ctx->device_mpcount;
 
+			if(gpuArch == 30)
+				blockOptimal = 8 * ctx->device_mpcount;
 			// the following values are calculated with CUDA10 and the occupancy calculator
-			if(gpuArch == 35 || gpuArch/10 == 5 || gpuArch/10 == 6)
-				blockOptimal = 7 *  ctx->device_mpcount;
+			if(gpuArch == 35 || gpuArch / 10 == 5 || gpuArch / 10 == 6)
+				blockOptimal = 7 * ctx->device_mpcount;
 			if(gpuArch == 37)
-				blockOptimal = 14 *  ctx->device_mpcount;
+				blockOptimal = 14 * ctx->device_mpcount;
 			if(gpuArch >= 70)
-				blockOptimal = 6 *  ctx->device_mpcount;
+				blockOptimal = 6 * ctx->device_mpcount;
 
 			if(blockOptimal * threads * hashMemSize < limitedMemory)
-			{
-				ctx->device_threads = threads;
 				ctx->device_blocks = blockOptimal;
-			}
-
+			else
+				ctx->device_blocks = limitedMemory / hashMemSize / threads; // round to a memory fitting value
+			ctx->device_threads = threads;
 		}
 	}
+
+	if(ctx->device_threads * threadsPerHash > ctx->device_maxThreadsPerBlock)
+	{
+		// some cryptonight CUDA kernels use threadsPerHash (8 by default) threads per hash; clamp 'threads' to fit one block
+		ctx->device_threads = ctx->device_maxThreadsPerBlock / threadsPerHash;
+		printf("WARNING: 'threads' configuration to large, value adjusted to %i\n", ctx->device_threads);
+	}
 	printf("device init succeeded\n");
 
 	return 0;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
index 4d369f843..ec7e3e0a4 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
@@ -2,31 +2,13 @@
 
 #include "xmrstak/backend/cryptonight.hpp"
 
-#ifdef __INTELLISENSE__
-#define __CUDA_ARCH__ 520
-/* avoid red underlining */
-
-struct uint3
-{
-	unsigned int x, y, z;
-};
-
-struct uint3  threadIdx;
-struct uint3  blockIdx;
-struct uint3  blockDim;
-#define __funnelshift_r(a,b,c) 1
-#define __syncthreads()
-#define asm(x)
-#define __shfl(a,b,c) 1
-#endif
-
-#define AES_BLOCK_SIZE  16
-#define AES_KEY_SIZE    32
-#define INIT_SIZE_BLK   8
+#define AES_BLOCK_SIZE 16
+#define AES_KEY_SIZE 32
+#define INIT_SIZE_BLK 8
 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B
 
-#define C32(x)    ((uint32_t)(x ## U))
-#define T32(x) ((x) & C32(0xFFFFFFFF))
+#define C32(x) ((uint32_t)(x##U))
+#define T32(x) ((x)&C32(0xFFFFFFFF))
 
 #if __CUDA_ARCH__ >= 350
 __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset)
@@ -34,71 +16,112 @@ __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int
 	uint2 result;
 	if(offset >= 32)
 	{
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.x)
+			: "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.y)
+			: "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
 	}
 	else
 	{
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.x)
+			: "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.y)
+			: "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
 	}
-	return  __double_as_longlong(__hiloint2double(result.y, result.x));
+	return __double_as_longlong(__hiloint2double(result.y, result.x));
 }
-#define ROTL64(x, n) (cuda_ROTL64(x, n))
+
+#	define ROTL64(x, n) (cuda_ROTL64(x, n))
 #else
-#define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
+#	define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
 #endif
 
 #if __CUDA_ARCH__ < 350
 #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
 #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
 #else
-#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
+#define ROTL32(x, n) __funnelshift_l((x), (x), (n))
+#define ROTR32(x, n) __funnelshift_r((x), (x), (n))
 #endif
 
-#define MEMSET8(dst,what,cnt) { \
-	int i_memset8; \
-	uint64_t *out_memset8 = (uint64_t *)(dst); \
-	for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \
-		out_memset8[i_memset8] = (what); }
-
-#define MEMSET4(dst,what,cnt) { \
-	int i_memset4; \
-	uint32_t *out_memset4 = (uint32_t *)(dst); \
-	for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \
-		out_memset4[i_memset4] = (what); }
-
-#define MEMCPY8(dst,src,cnt) { \
-	int i_memcpy8; \
-	uint64_t *in_memcpy8 = (uint64_t *)(src); \
-	uint64_t *out_memcpy8 = (uint64_t *)(dst); \
-	for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \
-		out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; }
-
-#define MEMCPY4(dst,src,cnt) { \
-	int i_memcpy4; \
-	uint32_t *in_memcpy4 = (uint32_t *)(src); \
-	uint32_t *out_memcpy4 = (uint32_t *)(dst); \
-	for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \
-		out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; }
-
-#define XOR_BLOCKS(a,b) { \
-	((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \
-	((uint64_t *)a)[1] ^= ((uint64_t *)b)[1]; }
-
-#define XOR_BLOCKS_DST(x,y,z) { \
-	((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \
-	((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; }
-
-#define MUL_SUM_XOR_DST(a,c,dst) { \
-	const uint64_t dst0 = ((uint64_t *)dst)[0]; \
-	uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], dst0, &hi) + ((uint64_t *)c)[1]; \
-	hi += ((uint64_t *)c)[0]; \
-	((uint64_t *)c)[0] = dst0 ^ hi; \
-	((uint64_t *)dst)[0] = hi; \
-	((uint64_t *)c)[1] = atomicExch(((unsigned long long int *)dst) + 1, (unsigned long long int)lo) ^ lo; \
+#if __CUDA_ARCH__ >= 500
+#	define BYTE_0(x) __byte_perm(x, 0u, 0x4440)
+#	define BYTE_1(x) __byte_perm(x, 0u, 0x4441)
+#	define BYTE_2(x) __byte_perm(x, 0u, 0x4442)
+#	define BYTE_3(x) __byte_perm(x, 0u, 0x4443)
+
+#	define ROTL32_8(x) __byte_perm(x, x, 0x2103)
+#	define ROTL32_16(x) __byte_perm(x, x, 0x1032)
+#	define ROTL32_24(x) __byte_perm(x, x, 0x0321)
+#else
+#	define BYTE_0(x) (((x)      ) & 0xff)
+#	define BYTE_1(x) (((x) >>  8) & 0xff)
+#	define BYTE_2(x) (((x) >> 16) & 0xff)
+#	define BYTE_3(x) (((x) >> 24))
+
+#	define ROTL32_8(x)  ROTL32(x, 8)
+#	define ROTL32_16(x) ROTL32(x, 16)
+#	define ROTL32_24(x) ROTL32(x, 24)
+#endif
+
+#define MEMSET8(dst, what, cnt)                          \
+	{                                                    \
+		int i_memset8;                                   \
+		uint64_t* out_memset8 = (uint64_t*)(dst);        \
+		for(i_memset8 = 0; i_memset8 < cnt; i_memset8++) \
+			out_memset8[i_memset8] = (what);             \
 	}
 
-#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff)))
+#define MEMSET4(dst, what, cnt)                          \
+	{                                                    \
+		int i_memset4;                                   \
+		uint32_t* out_memset4 = (uint32_t*)(dst);        \
+		for(i_memset4 = 0; i_memset4 < cnt; i_memset4++) \
+			out_memset4[i_memset4] = (what);             \
+	}
 
+#define MEMCPY8(dst, src, cnt)                              \
+	{                                                       \
+		int i_memcpy8;                                      \
+		uint64_t* in_memcpy8 = (uint64_t*)(src);            \
+		uint64_t* out_memcpy8 = (uint64_t*)(dst);           \
+		for(i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++)    \
+			out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; \
+	}
+
+#define MEMCPY4(dst, src, cnt)                              \
+	{                                                       \
+		int i_memcpy4;                                      \
+		uint32_t* in_memcpy4 = (uint32_t*)(src);            \
+		uint32_t* out_memcpy4 = (uint32_t*)(dst);           \
+		for(i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++)    \
+			out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; \
+	}
+
+#define XOR_BLOCKS(a, b)                        \
+	{                                           \
+		((uint64_t*)a)[0] ^= ((uint64_t*)b)[0]; \
+		((uint64_t*)a)[1] ^= ((uint64_t*)b)[1]; \
+	}
+
+#define XOR_BLOCKS_DST(x, y, z)                                        \
+	{                                                                  \
+		((uint64_t*)z)[0] = ((uint64_t*)(x))[0] ^ ((uint64_t*)(y))[0]; \
+		((uint64_t*)z)[1] = ((uint64_t*)(x))[1] ^ ((uint64_t*)(y))[1]; \
+	}
+
+#define MUL_SUM_XOR_DST(a, c, dst)                                                                           \
+	{                                                                                                        \
+		const uint64_t dst0 = ((uint64_t*)dst)[0];                                                           \
+		uint64_t hi, lo = cuda_mul128(((uint64_t*)a)[0], dst0, &hi) + ((uint64_t*)c)[1];                     \
+		hi += ((uint64_t*)c)[0];                                                                             \
+		((uint64_t*)c)[0] = dst0 ^ hi;                                                                       \
+		((uint64_t*)dst)[0] = hi;                                                                            \
+		((uint64_t*)c)[1] = atomicExch(((unsigned long long int*)dst) + 1, (unsigned long long int)lo) ^ lo; \
+	}
+
+#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff)))
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
index 555ccbef2..a8dd1fcb2 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
@@ -2,7 +2,6 @@
 
 #include <stdint.h>
 
-
 __device__ __forceinline__ int64_t fast_div_heavy(int64_t _a, int _b)
 {
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
index 0d54f1436..1fc85b2d0 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -18,19 +18,19 @@ __device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b)
 {
 	const uint32_t r = get_reciprocal(b);
 	const uint32_t a1 = ((uint32_t*)&a)[1];
-	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * a1) + a;
+	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r)*a1) + a;
 
 	const uint32_t q = ((uint32_t*)&k)[1];
-	int64_t tmp = a - ((uint64_t)(q) * b);
+	int64_t tmp = a - ((uint64_t)(q)*b);
 	((int32_t*)(&tmp))[1] -= q < a1 ? b : 0;
-	
+
 	const int overshoot = ((int*)(&tmp))[1] >> 31;
 	const int64_t tmp_u = (uint32_t)(b - 1) - tmp;
 	const int undershoot = ((int*)&tmp_u)[1] >> 31;
 
 	uint64_t result;
 	((uint32_t*)&result)[0] = q + overshoot - undershoot;
-	((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot) & b) - ((uint32_t)(undershoot) & b);
+	((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot)&b) - ((uint32_t)(undershoot)&b);
 
 	return result;
 }
@@ -39,14 +39,18 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
 {
 	float x = __uint_as_float((((uint32_t*)&n1)[1] >> 9) + ((64U + 127U) << 23));
 	float x1;
-	asm("rsqrt.approx.f32 %0, %1;" : "=f"(x1) : "f"(x));
-	asm("sqrt.approx.f32 %0, %1;" : "=f"(x) : "f"(x));
+	asm("rsqrt.approx.f32 %0, %1;"
+		: "=f"(x1)
+		: "f"(x));
+	asm("sqrt.approx.f32 %0, %1;"
+		: "=f"(x)
+		: "f"(x));
 
 	// The following line does x1 *= 4294967296.0f;
 	x1 = __uint_as_float(__float_as_uint(x1) + (32U << 23));
 
 	const uint32_t x0 = __float_as_uint(x) - (158U << 23);
-	const int64_t delta0 = n1 - (((int64_t)(x0) * x0) << 18);
+	const int64_t delta0 = n1 - (((int64_t)(x0)*x0) << 18);
 	const float delta = __int2float_rn(((int32_t*)&delta0)[1]) * x1;
 
 	uint32_t result = (x0 << 10) + __float2int_rn(delta);
@@ -56,6 +60,6 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
 	const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1;
 	const int32_t overshoot = ((int64_t)(x2 + b) > 0) ? -1 : 0;
 	const int32_t undershoot = ((int64_t)(x2 + 0x100000000UL + s) < 0) ? 1 : 0;
-	result += (overshoot+undershoot);
+	result += (overshoot + undershoot);
 	return result;
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp
index d5a98b7da..3bec5b1a2 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp
@@ -4,173 +4,142 @@
 #define GROESTL_LENGTHFIELDLEN GROESTL_ROWS
 #define GROESTL_COLS512 8
 
-#define GROESTL_SIZE512 (GROESTL_ROWS*GROESTL_COLS512)
+#define GROESTL_SIZE512 (GROESTL_ROWS * GROESTL_COLS512)
 
 #define GROESTL_ROUNDS512 10
 #define GROESTL_HASH_BIT_LEN 256
 
 #define GROESTL_ROTL32(v, n) ROTL32(v, n)
 
-
 #define li_32(h) 0x##h##u
-#define GROESTL_EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n)))
+#define GROESTL_EXT_BYTE(var, n) ((uint8_t)((uint32_t)(var) >> (8 * n)))
 
-#define u32BIG(a)	\
-	((GROESTL_ROTL32(a,8) & li_32(00FF00FF)) | (GROESTL_ROTL32(a,24) & li_32(FF00FF00)))
+#define u32BIG(a) \
+	((GROESTL_ROTL32(a, 8) & li_32(00FF00FF)) | (GROESTL_ROTL32(a, 24) & li_32(FF00FF00)))
 
-typedef struct {
-	uint32_t chaining[GROESTL_SIZE512/sizeof(uint32_t)];            /* actual state */
+typedef struct /* working state passed to cn_groestl_init / _update / _final */
+{
+	uint32_t chaining[GROESTL_SIZE512 / sizeof(uint32_t)]; /* chaining value (hash state): GROESTL_SIZE512 bytes held as 32-bit words */
 	uint32_t block_counter1,
-	block_counter2;         /* message block counter(s) */
-	BitSequence buffer[GROESTL_SIZE512];      /* data buffer */
-	int buf_ptr;              /* data buffer pointer */
-	int bits_in_last_byte;    /* no. of message bits in last byte of data buffer */
+		block_counter2;					 /* processed-block counter: block_counter2 is bumped each time block_counter1 wraps to 0 */
+	BitSequence buffer[GROESTL_SIZE512]; /* staging buffer for one GROESTL_SIZE512-byte block (partial input and final padding) */
+	int buf_ptr;						 /* number of bytes currently in buffer, i.e. index of the next free byte */
+	int bits_in_last_byte;				 /* no. of message bits in last byte of data buffer (0 when the message ends on a byte boundary) */
 } groestlHashState;
 
-
 __constant__ uint32_t d_groestl_T[512] =
-{
-  0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc
-, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5
-, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d
-, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded
-, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1
-, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441
-, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4
-, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba
-, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616
-, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2
-, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c
-, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de
-, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7
-, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e
-, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c
-, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7
-, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b
-, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4
-, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e
-, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a
-, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37
-, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86
-, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b
-, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028
-, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3
-, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94
-, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836
-, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0
-, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2
-, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e
-, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3
-, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e
-};
-
-#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) \
-	{ temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
-		v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
-		v1 = temp_var; }
-
-#define GROESTL_COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \
-	tu = d_groestl_T[2*(uint32_t)x[4*c0+0]];	\
-	tl = d_groestl_T[2*(uint32_t)x[4*c0+0]+1];	\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c1+1]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c1+1]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t)		\
-	tu ^= tv1;									\
-	tl ^= tv2;									\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c2+2]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c2+2]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t)		\
-	tu ^= tv1;									\
-	tl ^= tv2;   								\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c3+3]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c3+3]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t)		\
-	tu ^= tv1;									\
-	tl ^= tv2;									\
-	tl ^= d_groestl_T[2*(uint32_t)x[4*c4+0]];	\
-	tu ^= d_groestl_T[2*(uint32_t)x[4*c4+0]+1];	\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c5+1]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c5+1]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t)		\
-	tl ^= tv1;									\
-	tu ^= tv2;									\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c6+2]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c6+2]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t)		\
-	tl ^= tv1;									\
-	tu ^= tv2;   								\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c7+3]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c7+3]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t)		\
-	tl ^= tv1;									\
-	tu ^= tv2;									\
-	y[i] = tu;									\
-	y[i+1] = tl;
-
-__device__ void cn_groestl_RND512P(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r)
+	{
+		0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e};
+
+#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var)                \
+	{                                                                             \
+		temp_var = (v1 << (8 * amount_bytes)) | (v2 >> (8 * (4 - amount_bytes))); \
+		v2 = (v2 << (8 * amount_bytes)) | (v1 >> (8 * (4 - amount_bytes)));       \
+		v1 = temp_var;                                                            \
+	}
+
+#define GROESTL_COLUMN(x, y, i, c0, c1, c2, c3, c4, c5, c6, c7, tv1, tv2, tu, tl, t) \
+	tu = d_groestl_T[2 * (uint32_t)x[4 * c0 + 0]];                                   \
+	tl = d_groestl_T[2 * (uint32_t)x[4 * c0 + 0] + 1];                               \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c1 + 1]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c1 + 1] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 1, t)                                       \
+	tu ^= tv1;                                                                       \
+	tl ^= tv2;                                                                       \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c2 + 2]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c2 + 2] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 2, t)                                       \
+	tu ^= tv1;                                                                       \
+	tl ^= tv2;                                                                       \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c3 + 3]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c3 + 3] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 3, t)                                       \
+	tu ^= tv1;                                                                       \
+	tl ^= tv2;                                                                       \
+	tl ^= d_groestl_T[2 * (uint32_t)x[4 * c4 + 0]];                                  \
+	tu ^= d_groestl_T[2 * (uint32_t)x[4 * c4 + 0] + 1];                              \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c5 + 1]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c5 + 1] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 1, t)                                       \
+	tl ^= tv1;                                                                       \
+	tu ^= tv2;                                                                       \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c6 + 2]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c6 + 2] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 2, t)                                       \
+	tl ^= tv1;                                                                       \
+	tu ^= tv2;                                                                       \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c7 + 3]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c7 + 3] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 3, t)                                       \
+	tl ^= tv1;                                                                       \
+	tu ^= tv2;                                                                       \
+	y[i] = tu;                                                                       \
+	y[i + 1] = tl;
+
+__device__ void cn_groestl_RND512P(uint8_t* __restrict__ x, uint32_t* __restrict__ y, uint32_t r) // XORs round-dependent constants into x in place, then fills y[0..15] via GROESTL_COLUMN
 {
 	uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
 	uint32_t* x32 = (uint32_t*)x;
-	x32[ 0] ^= 0x00000000^r;
-	x32[ 2] ^= 0x00000010^r;
-	x32[ 4] ^= 0x00000020^r;
-	x32[ 6] ^= 0x00000030^r;
-	x32[ 8] ^= 0x00000040^r;
-	x32[10] ^= 0x00000050^r;
-	x32[12] ^= 0x00000060^r;
-	x32[14] ^= 0x00000070^r;
-	GROESTL_COLUMN(x,y, 0,  0,  2,  4,  6,  9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 2,  2,  4,  6,  8, 11, 13, 15,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 4,  4,  6,  8, 10, 13, 15,  1,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 6,  6,  8, 10, 12, 15,  1,  3,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 8,  8, 10, 12, 14,  1,  3,  5,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,10, 10, 12, 14,  0,  3,  5,  7,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,12, 12, 14,  0,  2,  5,  7,  9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,14, 14,  0,  2,  4,  7,  9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	x32[0] ^= 0x00000000 ^ r; // only even words are touched: word 2k gets (k << 4) ^ r
+	x32[2] ^= 0x00000010 ^ r;
+	x32[4] ^= 0x00000020 ^ r;
+	x32[6] ^= 0x00000030 ^ r;
+	x32[8] ^= 0x00000040 ^ r;
+	x32[10] ^= 0x00000050 ^ r;
+	x32[12] ^= 0x00000060 ^ r;
+	x32[14] ^= 0x00000070 ^ r;
+	GROESTL_COLUMN(x, y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); // each call writes y[i] and y[i + 1]; the eight indices after i pick the 32-bit words of x whose bytes feed the table lookups
+	GROESTL_COLUMN(x, y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
 }
 
-__device__ void cn_groestl_RND512Q(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r)
+__device__ void cn_groestl_RND512Q(uint8_t* __restrict__ x, uint32_t* __restrict__ y, uint32_t r) // inverts/XORs constants into all 16 words of x in place, then fills y[0..15] via GROESTL_COLUMN
 {
 	uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
 	uint32_t* x32 = (uint32_t*)x;
-	x32[ 0] = ~x32[ 0];
-	x32[ 1] ^= 0xffffffff^r;
-	x32[ 2] = ~x32[ 2];
-	x32[ 3] ^= 0xefffffff^r;
-	x32[ 4] = ~x32[ 4];
-	x32[ 5] ^= 0xdfffffff^r;
-	x32[ 6] = ~x32[ 6];
-	x32[ 7] ^= 0xcfffffff^r;
-	x32[ 8] = ~x32[ 8];
-	x32[ 9] ^= 0xbfffffff^r;
+	x32[0] = ~x32[0]; // every even word is bitwise inverted
+	x32[1] ^= 0xffffffff ^ r; // every odd word is XORed with r and a mask whose top nibble steps down f, e, d, ... 8
+	x32[2] = ~x32[2];
+	x32[3] ^= 0xefffffff ^ r;
+	x32[4] = ~x32[4];
+	x32[5] ^= 0xdfffffff ^ r;
+	x32[6] = ~x32[6];
+	x32[7] ^= 0xcfffffff ^ r;
+	x32[8] = ~x32[8];
+	x32[9] ^= 0xbfffffff ^ r;
 	x32[10] = ~x32[10];
-	x32[11] ^= 0xafffffff^r;
+	x32[11] ^= 0xafffffff ^ r;
 	x32[12] = ~x32[12];
-	x32[13] ^= 0x9fffffff^r;
+	x32[13] ^= 0x9fffffff ^ r;
 	x32[14] = ~x32[14];
-	x32[15] ^= 0x8fffffff^r;
-	GROESTL_COLUMN(x,y, 0,  2,  6, 10, 14,  1,  5,  9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 2,  4,  8, 12,  0,  3,  7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 4,  6, 10, 14,  2,  5,  9, 13,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 6,  8, 12,  0,  4,  7, 11, 15,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 8, 10, 14,  2,  6,  9, 13,  1,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,10, 12,  0,  4,  8, 11, 15,  3,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,12, 14,  2,  6, 10, 13,  1,  5,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,14,  0,  4,  8, 12, 15,  3,  7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	x32[15] ^= 0x8fffffff ^ r;
+	GROESTL_COLUMN(x, y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); // each call writes y[i] and y[i + 1]; the word-selection pattern differs from cn_groestl_RND512P
+	GROESTL_COLUMN(x, y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
 }
 
-__device__ void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __restrict__ m)
+__device__ void cn_groestl_F512(uint32_t* __restrict__ h, const uint32_t* __restrict__ m)
 {
 	int i;
-	uint32_t Ptmp[2*GROESTL_COLS512];
-	uint32_t Qtmp[2*GROESTL_COLS512];
-	uint32_t y[2*GROESTL_COLS512];
-	uint32_t z[2*GROESTL_COLS512];
+	uint32_t Ptmp[2 * GROESTL_COLS512];
+	uint32_t Qtmp[2 * GROESTL_COLS512];
+	uint32_t y[2 * GROESTL_COLS512];
+	uint32_t z[2 * GROESTL_COLS512];
 
-	for (i = 0; i < 2*GROESTL_COLS512; i++)
+	for(i = 0; i < 2 * GROESTL_COLS512; i++)
 	{
 		z[i] = m[i];
-		Ptmp[i] = h[i]^m[i];
+		Ptmp[i] = h[i] ^ m[i];
 	}
 
 	cn_groestl_RND512Q((uint8_t*)z, y, 0x00000000);
@@ -195,18 +164,18 @@ __device__ void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __re
 	cn_groestl_RND512P((uint8_t*)z, y, 0x00000008);
 	cn_groestl_RND512P((uint8_t*)y, Ptmp, 0x00000009);
 
-	for (i = 0; i < 2*GROESTL_COLS512; i++)
-		h[i] ^= Ptmp[i]^Qtmp[i];
+	for(i = 0; i < 2 * GROESTL_COLS512; i++)
+		h[i] ^= Ptmp[i] ^ Qtmp[i];
 }
 
-__device__ void cn_groestl_outputtransformation(groestlHashState *ctx)
+__device__ void cn_groestl_outputtransformation(groestlHashState* ctx)
 {
 	int j;
-	uint32_t temp[2*GROESTL_COLS512];
-	uint32_t y[2*GROESTL_COLS512];
-	uint32_t z[2*GROESTL_COLS512];
+	uint32_t temp[2 * GROESTL_COLS512];
+	uint32_t y[2 * GROESTL_COLS512];
+	uint32_t z[2 * GROESTL_COLS512];
 
-	for (j = 0; j < 2*GROESTL_COLS512; j++)
+	for(j = 0; j < 2 * GROESTL_COLS512; j++)
 		temp[j] = ctx->chaining[j];
 
 	cn_groestl_RND512P((uint8_t*)temp, y, 0x00000000);
@@ -220,33 +189,33 @@ __device__ void cn_groestl_outputtransformation(groestlHashState *ctx)
 	cn_groestl_RND512P((uint8_t*)z, y, 0x00000008);
 	cn_groestl_RND512P((uint8_t*)y, temp, 0x00000009);
 
-	for (j = 0; j < 2*GROESTL_COLS512; j++)
+	for(j = 0; j < 2 * GROESTL_COLS512; j++)
 		ctx->chaining[j] ^= temp[j];
 }
 
-__device__ void cn_groestl_transform(groestlHashState * __restrict__ ctx,
-	const uint8_t * __restrict__ input, int msglen)
+__device__ void cn_groestl_transform(groestlHashState* __restrict__ ctx, // folds every whole GROESTL_SIZE512-byte block of input into ctx->chaining
+	const uint8_t* __restrict__ input, int msglen) // msglen is in bytes; a trailing partial block is left unprocessed
 {
-	for (; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512)
+	for(; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512) // one iteration per full block
 	{
-		cn_groestl_F512(ctx->chaining,(uint32_t*)input);
+		cn_groestl_F512(ctx->chaining, (uint32_t*)input); // NOTE(review): input is read as 32-bit words; presumably callers guarantee suitable alignment — confirm
 		ctx->block_counter1++;
 
-		if (ctx->block_counter1 == 0)
+		if(ctx->block_counter1 == 0) // block_counter1 wrapped around: carry into block_counter2
 			ctx->block_counter2++;
 	}
 }
 
-__device__ void cn_groestl_final(groestlHashState*  __restrict__ ctx,
-	BitSequence* __restrict__  output)
+__device__ void cn_groestl_final(groestlHashState* __restrict__ ctx,
+	BitSequence* __restrict__ output)
 {
-	int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN/8;
-	uint8_t *s = (BitSequence*)ctx->chaining;
+	int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN / 8;
+	uint8_t* s = (BitSequence*)ctx->chaining;
 
-	if (ctx->bits_in_last_byte)
+	if(ctx->bits_in_last_byte)
 	{
-		ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<ctx->bits_in_last_byte)-1)<<(8-ctx->bits_in_last_byte);
-		ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-ctx->bits_in_last_byte);
+		ctx->buffer[(int)ctx->buf_ptr - 1] &= ((1 << ctx->bits_in_last_byte) - 1) << (8 - ctx->bits_in_last_byte);
+		ctx->buffer[(int)ctx->buf_ptr - 1] ^= 0x1 << (7 - ctx->bits_in_last_byte);
 		ctx->bits_in_last_byte = 0;
 	}
 	else
@@ -254,29 +223,29 @@ __device__ void cn_groestl_final(groestlHashState*  __restrict__ ctx,
 		ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
 	}
 
-	if (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN)
+	if(ctx->buf_ptr > GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN)
 	{
-		while (ctx->buf_ptr < GROESTL_SIZE512)
+		while(ctx->buf_ptr < GROESTL_SIZE512)
 			ctx->buffer[(int)ctx->buf_ptr++] = 0;
 
 		cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
 		ctx->buf_ptr = 0;
 	}
 
-	while (ctx->buf_ptr < GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN)
+	while(ctx->buf_ptr < GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN)
 		ctx->buffer[(int)ctx->buf_ptr++] = 0;
 
 	ctx->block_counter1++;
-	if (ctx->block_counter1 == 0)
+	if(ctx->block_counter1 == 0)
 		ctx->block_counter2++;
 	ctx->buf_ptr = GROESTL_SIZE512;
 
-	while (ctx->buf_ptr > GROESTL_SIZE512-(int)sizeof(uint32_t))
+	while(ctx->buf_ptr > GROESTL_SIZE512 - (int)sizeof(uint32_t))
 	{
 		ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1;
 		ctx->block_counter1 >>= 8;
 	}
-	while (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN)
+	while(ctx->buf_ptr > GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN)
 	{
 		ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2;
 		ctx->block_counter2 >>= 8;
@@ -284,12 +253,12 @@ __device__ void cn_groestl_final(groestlHashState*  __restrict__ ctx,
 	cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
 	cn_groestl_outputtransformation(ctx);
 
-	for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++)
+	for(i = GROESTL_SIZE512 - hashbytelen; i < GROESTL_SIZE512; i++, j++)
 		output[j] = s[i];
 
-	for (i = 0; i < GROESTL_COLS512; i++)
+	for(i = 0; i < GROESTL_COLS512; i++)
 		ctx->chaining[i] = 0;
-	for (i = 0; i < GROESTL_SIZE512; i++)
+	for(i = 0; i < GROESTL_SIZE512; i++)
 		ctx->buffer[i] = 0;
 }
 
@@ -297,17 +266,17 @@ __device__ void cn_groestl_update(groestlHashState* __restrict__ ctx,
 	const BitSequence* __restrict__ input, DataLength databitlen)
 {
 	int index = 0;
-	int msglen = (int)(databitlen/8);
-	int rem = (int)(databitlen%8);
+	int msglen = (int)(databitlen / 8);
+	int rem = (int)(databitlen % 8);
 
-	if (ctx->buf_ptr)
+	if(ctx->buf_ptr)
 	{
-		while (ctx->buf_ptr < GROESTL_SIZE512 && index < msglen)
+		while(ctx->buf_ptr < GROESTL_SIZE512 && index < msglen)
 			ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
 
-		if (ctx->buf_ptr < GROESTL_SIZE512)
+		if(ctx->buf_ptr < GROESTL_SIZE512)
 		{
-			if (rem)
+			if(rem)
 			{
 				ctx->bits_in_last_byte = rem;
 				ctx->buffer[(int)ctx->buf_ptr++] = input[index];
@@ -319,13 +288,13 @@ __device__ void cn_groestl_update(groestlHashState* __restrict__ ctx,
 		cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
 	}
 
-	cn_groestl_transform(ctx, input+index, msglen-index);
-	index += ((msglen-index)/GROESTL_SIZE512)*GROESTL_SIZE512;
+	cn_groestl_transform(ctx, input + index, msglen - index);
+	index += ((msglen - index) / GROESTL_SIZE512) * GROESTL_SIZE512;
 
-	while (index < msglen)
+	while(index < msglen)
 		ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
 
-	if (rem)
+	if(rem)
 	{
 		ctx->bits_in_last_byte = rem;
 		ctx->buffer[(int)ctx->buf_ptr++] = input[index];
@@ -336,17 +305,17 @@ __device__ void cn_groestl_init(groestlHashState* ctx)
 {
 	int i = 0;
 
-	for(;i<(GROESTL_SIZE512/sizeof(uint32_t));i++)
+	for(; i < (GROESTL_SIZE512 / sizeof(uint32_t)); i++) // clear every 32-bit word of the chaining value
 		ctx->chaining[i] = 0;
 
-	ctx->chaining[2*GROESTL_COLS512-1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN);
+	ctx->chaining[2 * GROESTL_COLS512 - 1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN); // last word holds the hash bit length; u32BIG reverses its byte order (assuming ROTL32 is a 32-bit left rotate)
 	ctx->buf_ptr = 0;
 	ctx->block_counter1 = 0;
 	ctx->block_counter2 = 0;
 	ctx->bits_in_last_byte = 0;
 }
 
-__device__ void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+__device__ void cn_groestl(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval)
 {
 	DataLength databitlen = len << 3;
 	groestlHashState context;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp
index 284039ff4..1019a9b9c 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp
@@ -1,6 +1,7 @@
 #include <stdint.h>
 
-typedef struct {
+typedef struct
+{
 	int hashbitlen;
 	unsigned long long databitlen;
 	unsigned long long datasize_in_buffer;
@@ -9,159 +10,175 @@ typedef struct {
 } jhHashState;
 
 __constant__ unsigned char d_JH256_H0[512] =
-{
-	0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1,
-	0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3,
-	0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77,
-	0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8,
-	0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62,
-	0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c,
-	0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf,
-	0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69
-};
+	{
+		0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1,
+		0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3,
+		0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77,
+		0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8,
+		0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62,
+		0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c,
+		0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf,
+		0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69};
 
 __constant__ unsigned char d_E8_rc[42][32] =
-{
-	{0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40},
-	{0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31},
-	{0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc},
-	{0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3},
-	{0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23},
-	{0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97},
-	{0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14},
-	{0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4},
-	{0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36},
-	{0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f},
-	{0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b},
-	{0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62},
-	{0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5},
-	{0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f},
-	{0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a},
-	{0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf},
-	{0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0},
-	{0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a},
-	{0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6},
-	{0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67},
-	{0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18},
-	{0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e},
-	{0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1},
-	{0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83},
-	{0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef},
-	{0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65},
-	{0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c},
-	{0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71},
-	{0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0},
-	{0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f},
-	{0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad},
-	{0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6},
-	{0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63},
-	{0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f},
-	{0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a},
-	{0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5},
-	{0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48},
-	{0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e},
-	{0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7},
-	{0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde},
-	{0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a},
-	{0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}
-};
-
-#define JH_SWAP1(x)   (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1));
-#define JH_SWAP2(x)   (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2));
-#define JH_SWAP4(x)   (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4));
-#define JH_SWAP8(x)   (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8));
-#define JH_SWAP16(x)  (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16));
-#define JH_SWAP32(x)  (x) = (((x) << 32) | ((x) >> 32));
-
-#define JH_L(m0,m1,m2,m3,m4,m5,m6,m7) \
-	(m4) ^= (m1);                \
-	(m5) ^= (m2);                \
-	(m6) ^= (m0) ^ (m3);         \
-	(m7) ^= (m0);                \
-	(m0) ^= (m5);                \
-	(m1) ^= (m6);                \
-	(m2) ^= (m4) ^ (m7);         \
+	{
+		{0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40},
+		{0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31},
+		{0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc},
+		{0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3},
+		{0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23},
+		{0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97},
+		{0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14},
+		{0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4},
+		{0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36},
+		{0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f},
+		{0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b},
+		{0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62},
+		{0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5},
+		{0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f},
+		{0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a},
+		{0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf},
+		{0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0},
+		{0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a},
+		{0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6},
+		{0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67},
+		{0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18},
+		{0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e},
+		{0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1},
+		{0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83},
+		{0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef},
+		{0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65},
+		{0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c},
+		{0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71},
+		{0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0},
+		{0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f},
+		{0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad},
+		{0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6},
+		{0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63},
+		{0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f},
+		{0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a},
+		{0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5},
+		{0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48},
+		{0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e},
+		{0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7},
+		{0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde},
+		{0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a},
+		{0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}};
+
+#define JH_SWAP1(x) (x) = ((((x)&0x5555555555555555ULL) << 1) | (((x)&0xaaaaaaaaaaaaaaaaULL) >> 1));
+#define JH_SWAP2(x) (x) = ((((x)&0x3333333333333333ULL) << 2) | (((x)&0xccccccccccccccccULL) >> 2));
+#define JH_SWAP4(x) (x) = ((((x)&0x0f0f0f0f0f0f0f0fULL) << 4) | (((x)&0xf0f0f0f0f0f0f0f0ULL) >> 4));
+#define JH_SWAP8(x) (x) = ((((x)&0x00ff00ff00ff00ffULL) << 8) | (((x)&0xff00ff00ff00ff00ULL) >> 8));
+#define JH_SWAP16(x) (x) = ((((x)&0x0000ffff0000ffffULL) << 16) | (((x)&0xffff0000ffff0000ULL) >> 16));
+#define JH_SWAP32(x) (x) = (((x) << 32) | ((x) >> 32));
+
+#define JH_L(m0, m1, m2, m3, m4, m5, m6, m7) \
+	(m4) ^= (m1);                            \
+	(m5) ^= (m2);                            \
+	(m6) ^= (m0) ^ (m3);                     \
+	(m7) ^= (m0);                            \
+	(m0) ^= (m5);                            \
+	(m1) ^= (m6);                            \
+	(m2) ^= (m4) ^ (m7);                     \
 	(m3) ^= (m4);
 
-#define JH_SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1)   \
-	m3  = ~(m3);                  \
-	m7  = ~(m7);                  \
-	m0 ^= ((~(m2)) & (cc0));      \
-	m4 ^= ((~(m6)) & (cc1));      \
-	temp0 = (cc0) ^ ((m0) & (m1));\
-	temp1 = (cc1) ^ ((m4) & (m5));\
-	m0 ^= ((m2) & (m3));          \
-	m4 ^= ((m6) & (m7));          \
-	m3 ^= ((~(m1)) & (m2));       \
-	m7 ^= ((~(m5)) & (m6));       \
-	m1 ^= ((m0) & (m2));          \
-	m5 ^= ((m4) & (m6));          \
-	m2 ^= ((m0) & (~(m3)));       \
-	m6 ^= ((m4) & (~(m7)));       \
-	m0 ^= ((m1) | (m3));          \
-	m4 ^= ((m5) | (m7));          \
-	m3 ^= ((m1) & (m2));          \
-	m7 ^= ((m5) & (m6));          \
-	m1 ^= (temp0 & (m0));         \
-	m5 ^= (temp1 & (m4));         \
-	m2 ^= temp0;                  \
+#define JH_SS(m0, m1, m2, m3, m4, m5, m6, m7, cc0, cc1) \
+	m3 = ~(m3);                                         \
+	m7 = ~(m7);                                         \
+	m0 ^= ((~(m2)) & (cc0));                            \
+	m4 ^= ((~(m6)) & (cc1));                            \
+	temp0 = (cc0) ^ ((m0) & (m1));                      \
+	temp1 = (cc1) ^ ((m4) & (m5));                      \
+	m0 ^= ((m2) & (m3));                                \
+	m4 ^= ((m6) & (m7));                                \
+	m3 ^= ((~(m1)) & (m2));                             \
+	m7 ^= ((~(m5)) & (m6));                             \
+	m1 ^= ((m0) & (m2));                                \
+	m5 ^= ((m4) & (m6));                                \
+	m2 ^= ((m0) & (~(m3)));                             \
+	m6 ^= ((m4) & (~(m7)));                             \
+	m0 ^= ((m1) | (m3));                                \
+	m4 ^= ((m5) | (m7));                                \
+	m3 ^= ((m1) & (m2));                                \
+	m7 ^= ((m5) & (m6));                                \
+	m1 ^= (temp0 & (m0));                               \
+	m5 ^= (temp1 & (m4));                               \
+	m2 ^= temp0;                                        \
 	m6 ^= temp1;
 
-__device__ void cn_jh_E8(jhHashState *state)
+__device__ void cn_jh_E8(jhHashState* state)
 {
-	uint64_t i,roundnumber,temp0,temp1;
+	uint64_t i, roundnumber, temp0, temp1;
 
-	for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7)
+	for(roundnumber = 0; roundnumber < 42; roundnumber = roundnumber + 7)
 	{
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+0])[i],((uint64_t *)d_E8_rc[roundnumber+0])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP1(state->x[1][i]); JH_SWAP1(state->x[3][i]); JH_SWAP1(state->x[5][i]); JH_SWAP1(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 0])[i], ((uint64_t*)d_E8_rc[roundnumber + 0])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP1(state->x[1][i]);
+			JH_SWAP1(state->x[3][i]);
+			JH_SWAP1(state->x[5][i]);
+			JH_SWAP1(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+1])[i],((uint64_t *)d_E8_rc[roundnumber+1])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP2(state->x[1][i]); JH_SWAP2(state->x[3][i]); JH_SWAP2(state->x[5][i]); JH_SWAP2(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 1])[i], ((uint64_t*)d_E8_rc[roundnumber + 1])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP2(state->x[1][i]);
+			JH_SWAP2(state->x[3][i]);
+			JH_SWAP2(state->x[5][i]);
+			JH_SWAP2(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+2])[i],((uint64_t *)d_E8_rc[roundnumber+2])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP4(state->x[1][i]); JH_SWAP4(state->x[3][i]); JH_SWAP4(state->x[5][i]); JH_SWAP4(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 2])[i], ((uint64_t*)d_E8_rc[roundnumber + 2])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP4(state->x[1][i]);
+			JH_SWAP4(state->x[3][i]);
+			JH_SWAP4(state->x[5][i]);
+			JH_SWAP4(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+3])[i],((uint64_t *)d_E8_rc[roundnumber+3])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP8(state->x[1][i]); JH_SWAP8(state->x[3][i]); JH_SWAP8(state->x[5][i]); JH_SWAP8(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 3])[i], ((uint64_t*)d_E8_rc[roundnumber + 3])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP8(state->x[1][i]);
+			JH_SWAP8(state->x[3][i]);
+			JH_SWAP8(state->x[5][i]);
+			JH_SWAP8(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+4])[i],((uint64_t *)d_E8_rc[roundnumber+4])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP16(state->x[1][i]); JH_SWAP16(state->x[3][i]); JH_SWAP16(state->x[5][i]); JH_SWAP16(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 4])[i], ((uint64_t*)d_E8_rc[roundnumber + 4])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP16(state->x[1][i]);
+			JH_SWAP16(state->x[3][i]);
+			JH_SWAP16(state->x[5][i]);
+			JH_SWAP16(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+5])[i],((uint64_t *)d_E8_rc[roundnumber+5])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP32(state->x[1][i]); JH_SWAP32(state->x[3][i]); JH_SWAP32(state->x[5][i]); JH_SWAP32(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 5])[i], ((uint64_t*)d_E8_rc[roundnumber + 5])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP32(state->x[1][i]);
+			JH_SWAP32(state->x[3][i]);
+			JH_SWAP32(state->x[5][i]);
+			JH_SWAP32(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+6])[i],((uint64_t *)d_E8_rc[roundnumber+6])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 6])[i], ((uint64_t*)d_E8_rc[roundnumber + 6])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
 		}
 
-		for (i = 1; i < 8; i = i+2)
+		for(i = 1; i < 8; i = i + 2)
 		{
 			temp0 = state->x[i][0];
 			state->x[i][0] = state->x[i][1];
@@ -170,75 +187,75 @@ __device__ void cn_jh_E8(jhHashState *state)
 	}
 }
 
-__device__ void cn_jh_F8(jhHashState *state)
+__device__ void cn_jh_F8(jhHashState* state)
 {
 	uint64_t i;
 
-	for (i = 0; i < 8; i++)
-		state->x[i >> 1][i & 1] ^= ((uint64_t *)state->buffer)[i];
+	for(i = 0; i < 8; i++)
+		state->x[i >> 1][i & 1] ^= ((uint64_t*)state->buffer)[i];
 
 	cn_jh_E8(state);
 
-	for (i = 0; i < 8; i++)
-		state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64_t *)state->buffer)[i];
+	for(i = 0; i < 8; i++)
+		state->x[(8 + i) >> 1][(8 + i) & 1] ^= ((uint64_t*)state->buffer)[i];
 }
 
-__device__ void cn_jh_update(jhHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
+__device__ void cn_jh_update(jhHashState* __restrict__ state, const BitSequence* __restrict__ data, DataLength databitlen)
 {
 	DataLength index;
 
 	state->databitlen += databitlen;
 	index = 0;
 
-	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512)  )
+	if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) < 512))
 	{
-		if ( (databitlen & 7) == 0 )
-			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3));
+		if((databitlen & 7) == 0)
+			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3));
 		else
-			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1);
+			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3) + 1);
 		state->datasize_in_buffer += databitlen;
 		databitlen = 0;
 	}
 
-	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512)  )
+	if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) >= 512))
 	{
-		memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) );
-		index = 64-(state->datasize_in_buffer >> 3);
+		memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3));
+		index = 64 - (state->datasize_in_buffer >> 3);
 		databitlen = databitlen - (512 - state->datasize_in_buffer);
 		cn_jh_F8(state);
 		state->datasize_in_buffer = 0;
 	}
 
-	for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512)
+	for(; databitlen >= 512; index = index + 64, databitlen = databitlen - 512)
 	{
-		memcpy(state->buffer, data+index, 64);
+		memcpy(state->buffer, data + index, 64);
 		cn_jh_F8(state);
 	}
 
-	if ( databitlen > 0)
+	if(databitlen > 0)
 	{
-		if ((databitlen & 7) == 0)
-			memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3);
+		if((databitlen & 7) == 0)
+			memcpy(state->buffer, data + index, (databitlen & 0x1ff) >> 3);
 		else
-			memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1);
+			memcpy(state->buffer, data + index, ((databitlen & 0x1ff) >> 3) + 1);
 		state->datasize_in_buffer = databitlen;
 	}
 }
 
 /*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/
-__device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __restrict__ hashval)
+__device__ void cn_jh_final(jhHashState* __restrict__ state, BitSequence* __restrict__ hashval)
 {
 	unsigned int i;
 	//uint32_t *bufptr = (uint32_t *)state->buffer;
 
-	if ( (state->databitlen & 0x1ff) == 0 )
+	if((state->databitlen & 0x1ff) == 0)
 	{
 		/*pad the message when databitlen is multiple of 512 bits, then process the padded block*/
 		memset(state->buffer, 0, 64);
 		//for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000;
-		state->buffer[0]  = 0x80;
+		state->buffer[0] = 0x80;
 		state->buffer[63] = state->databitlen & 0xff;
-		state->buffer[62] = (state->databitlen >> 8)  & 0xff;
+		state->buffer[62] = (state->databitlen >> 8) & 0xff;
 		state->buffer[61] = (state->databitlen >> 16) & 0xff;
 		state->buffer[60] = (state->databitlen >> 24) & 0xff;
 		state->buffer[59] = (state->databitlen >> 32) & 0xff;
@@ -250,19 +267,19 @@ __device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __re
 	else
 	{
 		/*set the rest of the bytes in the buffer to 0*/
-		if ( (state->datasize_in_buffer & 7) == 0)
+		if((state->datasize_in_buffer & 7) == 0)
 		{
-			for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++)
+			for(i = (state->databitlen & 0x1ff) >> 3; i < 64; i++)
 				state->buffer[i] = 0;
 		}
 		else
 		{
-			for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++)
+			for(i = ((state->databitlen & 0x1ff) >> 3) + 1; i < 64; i++)
 				state->buffer[i] = 0;
 		}
 
 		/*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/
-		state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7));
+		state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7 - (state->databitlen & 7));
 
 		cn_jh_F8(state);
 		memset(state->buffer, 0, 64);
@@ -278,10 +295,10 @@ __device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __re
 		cn_jh_F8(state);
 	}
 
-	memcpy(hashval,(unsigned char*)state->x+64+32,32);
+	memcpy(hashval, (unsigned char*)state->x + 64 + 32, 32);
 }
 
-__device__ void cn_jh_init(jhHashState *state, int hashbitlen)
+__device__ void cn_jh_init(jhHashState* state, int hashbitlen)
 {
 	state->databitlen = 0;
 	state->datasize_in_buffer = 0;
@@ -289,7 +306,7 @@ __device__ void cn_jh_init(jhHashState *state, int hashbitlen)
 	memcpy(state->x, d_JH256_H0, 128);
 }
 
-__device__ void cn_jh(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+__device__ void cn_jh(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval)
 {
 	int hashbitlen = 256;
 	DataLength databitlen = len << 3;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
index 3f535631d..5bbc787e3 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
@@ -7,46 +7,61 @@ __constant__
 #else
 const
 #endif
-uint64_t keccakf_rndc[24] ={
-	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
-	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
-	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
-	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
-	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
-	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
-	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
-	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
-};
+	uint64_t keccakf_rndc[24] = {
+		0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+		0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+		0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+		0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+		0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+		0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+		0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+		0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
 
 #if __CUDA_ARCH__ >= 350
-	__forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset)
-	{
-		uint2 result;
-		if(offset >= 32)
-		{
-			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		}
-		else
-		{
-			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		}
-		return  __double_as_longlong(__hiloint2double(result.y, result.x));
-	}
-	#define rotl64_1(x, y) (cuda_rotl64((x), (y)))
+/** Rotate left by `offset` bits the 64-bit value whose first 32-bit word is v0 and second is v1 (as passed by rotl64_1).
+ *  @param offset must be < 32 (shf.l.wrap only uses the shift amount modulo 32) */
+__forceinline__ __device__ uint64_t cuda_rotl64(const uint32_t v0, const uint32_t v1, const int offset)
+{
+	uint2 result;
+
+	asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+		: "=r"(result.y) // second word of the result: (v1 << offset) | (v0 >> (32 - offset))
+		: "r"(v0), "r"(v1), "r"(offset));
+	asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+		: "=r"(result.x) // first word of the result: (v0 << offset) | (v1 >> (32 - offset))
+		: "r"(v1), "r"(v0), "r"(offset));
+
+	return *((uint64_t*)&result); // reinterpret the {x, y} pair as one 64-bit word
+}
+__device__ __forceinline__ uint64_t rotl64_1(const uint64_t x, const int y) // rotate x left by y bits; y must be < 32
+{
+	return cuda_rotl64(((uint32_t*)&x)[0], ((uint32_t*)&x)[1], (y)); // words of x passed in memory order
+}
+
+__device__ __forceinline__ uint64_t rotl64_2(const uint64_t x, const int y) // rotate x left by 32 + y bits; y must be < 32
+{
+	return cuda_rotl64(((uint32_t*)&x)[1], ((uint32_t*)&x)[0], (y)); // swapped words supply the extra 32-bit rotation
+}
+
 #else
-	#define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y))))
+
+#define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y))))
+__device__ __forceinline__ uint64_t rotl64_2(const uint64_t x, const int y) // swap the 32-bit halves of x, then rotate left by y
+{
+	uint64_t tmp;
+	((uint32_t*)&tmp)[0] = ((uint32_t*)&x)[1]; // first word of tmp <- second word of x
+	((uint32_t*)&tmp)[1] = ((uint32_t*)&x)[0]; // second word of tmp <- first word of x
+
+	return rotl64_1(tmp, (y)); // rotl64_1 is the shift-based macro in this preprocessor branch
+}
 #endif
 
-#define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y))
+
 #define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a))))
 
-__device__ __forceinline__ void cn_keccakf2(uint64_t *s)
+__device__ __forceinline__ void cn_keccakf2(uint64_t* s)
 {
-	uint8_t i;
-
-	for(i = 0; i < 24; ++i)
+	for(int16_t i = 0; i < 24; ++i)
 	{
 		uint64_t bc[5], tmpxor[5], tmp1, tmp2;
 
@@ -90,16 +105,46 @@ __device__ __forceinline__ void cn_keccakf2(uint64_t *s)
 		s[7] = rotl64_1(s[10] ^ bc[4], 3);
 		s[10] = rotl64_1(tmp1, 1);
 
-		tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
-		tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
-		tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
-		tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
-		tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
+		tmp1 = s[0];
+		tmp2 = s[1];
+		s[0] = bitselect(s[0] ^ s[2], s[0], s[1]);
+		s[1] = bitselect(s[1] ^ s[3], s[1], s[2]);
+		s[2] = bitselect(s[2] ^ s[4], s[2], s[3]);
+		s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]);
+		s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
+		tmp1 = s[5];
+		tmp2 = s[6];
+		s[5] = bitselect(s[5] ^ s[7], s[5], s[6]);
+		s[6] = bitselect(s[6] ^ s[8], s[6], s[7]);
+		s[7] = bitselect(s[7] ^ s[9], s[7], s[8]);
+		s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]);
+		s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
+		tmp1 = s[10];
+		tmp2 = s[11];
+		s[10] = bitselect(s[10] ^ s[12], s[10], s[11]);
+		s[11] = bitselect(s[11] ^ s[13], s[11], s[12]);
+		s[12] = bitselect(s[12] ^ s[14], s[12], s[13]);
+		s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]);
+		s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
+		tmp1 = s[15];
+		tmp2 = s[16];
+		s[15] = bitselect(s[15] ^ s[17], s[15], s[16]);
+		s[16] = bitselect(s[16] ^ s[18], s[16], s[17]);
+		s[17] = bitselect(s[17] ^ s[19], s[17], s[18]);
+		s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]);
+		s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
+		tmp1 = s[20];
+		tmp2 = s[21];
+		s[20] = bitselect(s[20] ^ s[22], s[20], s[21]);
+		s[21] = bitselect(s[21] ^ s[23], s[21], s[22]);
+		s[22] = bitselect(s[22] ^ s[24], s[22], s[23]);
+		s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]);
+		s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
 		s[0] ^= keccakf_rndc[i];
 	}
 }
 
-__device__ __forceinline__ void cn_keccakf(uint64_t *s)
+__device__ __forceinline__ void cn_keccakf(uint64_t* s)
 {
 	uint64_t bc[5], tmpxor[5], tmp1, tmp2;
 
@@ -145,16 +190,46 @@ __device__ __forceinline__ void cn_keccakf(uint64_t *s)
 		s[7] = rotl64_1(s[10] ^ bc[4], 3);
 		s[10] = rotl64_1(tmp1, 1);
 
-		tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
-		tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
-		tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
-		tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
-		tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
+		tmp1 = s[0];
+		tmp2 = s[1];
+		s[0] = bitselect(s[0] ^ s[2], s[0], s[1]);
+		s[1] = bitselect(s[1] ^ s[3], s[1], s[2]);
+		s[2] = bitselect(s[2] ^ s[4], s[2], s[3]);
+		s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]);
+		s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
+		tmp1 = s[5];
+		tmp2 = s[6];
+		s[5] = bitselect(s[5] ^ s[7], s[5], s[6]);
+		s[6] = bitselect(s[6] ^ s[8], s[6], s[7]);
+		s[7] = bitselect(s[7] ^ s[9], s[7], s[8]);
+		s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]);
+		s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
+		tmp1 = s[10];
+		tmp2 = s[11];
+		s[10] = bitselect(s[10] ^ s[12], s[10], s[11]);
+		s[11] = bitselect(s[11] ^ s[13], s[11], s[12]);
+		s[12] = bitselect(s[12] ^ s[14], s[12], s[13]);
+		s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]);
+		s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
+		tmp1 = s[15];
+		tmp2 = s[16];
+		s[15] = bitselect(s[15] ^ s[17], s[15], s[16]);
+		s[16] = bitselect(s[16] ^ s[18], s[16], s[17]);
+		s[17] = bitselect(s[17] ^ s[19], s[17], s[18]);
+		s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]);
+		s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
+		tmp1 = s[20];
+		tmp2 = s[21];
+		s[20] = bitselect(s[20] ^ s[22], s[20], s[21]);
+		s[21] = bitselect(s[21] ^ s[23], s[21], s[22]);
+		s[22] = bitselect(s[22] ^ s[24], s[22], s[23]);
+		s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]);
+		s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
 		s[0] ^= keccakf_rndc[i];
 	}
 }
 
-__device__ __forceinline__ void cn_keccak(const uint8_t * __restrict__ in, uint32_t len, uint8_t * __restrict__ md)
+__device__ __forceinline__ void cn_keccak(const uint8_t* __restrict__ in, uint32_t len, uint8_t* __restrict__ md)
 {
 	uint64_t st[25];
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp
index fc45db1ae..b8073f03b 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp
@@ -1,124 +1,146 @@
 #pragma once
 
-typedef unsigned int    uint_t;             /* native unsigned integer */
+typedef unsigned int uint_t; /* native unsigned integer */
 
-#define SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */
+#define SKEIN_MODIFIER_WORDS (2) /* number of modifier (tweak) words */
 
-#define SKEIN_256_STATE_WORDS ( 4)
-#define SKEIN_512_STATE_WORDS ( 8)
+#define SKEIN_256_STATE_WORDS (4)
+#define SKEIN_512_STATE_WORDS (8)
 #define SKEIN1024_STATE_WORDS (16)
 
-#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
-#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_STATE_BYTES (8 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BYTES (8 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BYTES (8 * SKEIN1024_STATE_WORDS)
 
-#define SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)
-#define SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_STATE_BITS (64 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BITS (64 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BITS (64 * SKEIN1024_STATE_WORDS)
 
-#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
-#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_BLOCK_BYTES (8 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_BLOCK_BYTES (8 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_BLOCK_BYTES (8 * SKEIN1024_STATE_WORDS)
 
-#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((uint64_t) (hi32)) << 32))
-#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
+#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((uint64_t)(hi32)) << 32))
+#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22)
 
-#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
+#define SKEIN_T1_BIT(BIT) ((BIT)-64) /* offset 64 because it's the second word  */
 
-#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
-#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
-#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
-#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
+#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126)	/* bits 126     : first block flag         */
+#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119)  /* bit  119     : partial final input byte */
+#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127)	/* bit  127     : final block flag         */
+#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field               */
 
-#define SKEIN_T1_FLAG_FIRST     (((uint64_t)  1 ) << SKEIN_T1_POS_FIRST)
-#define SKEIN_T1_FLAG_BIT_PAD   (((uint64_t)  1 ) << SKEIN_T1_POS_BIT_PAD)
-#define SKEIN_T1_FLAG_FINAL     (((uint64_t)  1 ) << SKEIN_T1_POS_FINAL)
+#define SKEIN_T1_FLAG_FIRST (((uint64_t)1) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t)1) << SKEIN_T1_POS_BIT_PAD)
+#define SKEIN_T1_FLAG_FINAL (((uint64_t)1) << SKEIN_T1_POS_FINAL)
 
-#define SKEIN_BLK_TYPE_MSG      (48)                    /* message processing */
-#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */
+#define SKEIN_BLK_TYPE_MSG (48) /* message processing */
+#define SKEIN_BLK_TYPE_OUT (63) /* output stage */
 
-#define SKEIN_T1_BLK_TYPE(T)   (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE(T) (((uint64_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
 
-#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing */
-#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
+#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */
 
-#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
 
-#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
-
-#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
-#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
-
-#define Skein_Set_T0_T1(ctxPtr,T0,T1) { \
-  Skein_Set_T0(ctxPtr,(T0)); \
-  Skein_Set_T1(ctxPtr,(T1)); }
-
-#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)   \
-{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
-
-#define Skein_Set_Bit_Pad_Flag(hdr)      { (hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;     }
+#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \
+	{                                          \
+		(ctxPtr)->h.T[TWK_NUM] = (tVal);       \
+	}
 
-#define KW_TWK_BASE     (0)
-#define KW_KEY_BASE     (3)
-#define ks              (kw + KW_KEY_BASE)
-#define ts              (kw + KW_TWK_BASE)
+#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0)
+#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1)
 
-#define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \
-	X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \
-	X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \
-	X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \
-	X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6;
+#define Skein_Set_T0_T1(ctxPtr, T0, T1) \
+	{                                   \
+		Skein_Set_T0(ctxPtr, (T0));     \
+		Skein_Set_T1(ctxPtr, (T1));     \
+	}
 
-#define I512(R) \
-	X0   += ks[((R)+1) % 9]; \
-	X1   += ks[((R)+2) % 9]; \
-	X2   += ks[((R)+3) % 9]; \
-	X3   += ks[((R)+4) % 9]; \
-	X4   += ks[((R)+5) % 9]; \
-	X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \
-	X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \
-	X7   += ks[((R)+8) % 9] + (R)+1;
+#define Skein_Start_New_Type(ctxPtr, BLK_TYPE)                                          \
+	{                                                                                   \
+		Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); \
+		(ctxPtr)->h.bCnt = 0;                                                           \
+	}
 
+#define Skein_Set_Bit_Pad_Flag(hdr)          \
+	{                                        \
+		(hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \
+	}
 
-#define R512_8_rounds(R) \
-	R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \
-	R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \
-	R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \
-	R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \
-	I512(2*(R)); \
-	R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \
-	R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \
-	R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \
-	R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \
-	I512(2*(R)+1);
+#define KW_TWK_BASE (0) /* index in kw[] where the tweak words start */
+#define KW_KEY_BASE (3) /* index in kw[] where the key schedule starts */
+#define ks (kw + KW_KEY_BASE) /* key schedule view of the local kw[] array */
+#define ts (kw + KW_TWK_BASE) /* tweak schedule view of the local kw[] array */
+
+#define R512(p0, p1, p2, p3, p4, p5, p6, p7, R512ROT, rNum) \
+	X##p0 += X##p1;                                         \
+	X##p1 = ROTL64(X##p1, R512ROT##_0);                     \
+	X##p1 ^= X##p0;                                         \
+	X##p2 += X##p3;                                         \
+	X##p3 = ROTL64(X##p3, R512ROT##_1);                     \
+	X##p3 ^= X##p2;                                         \
+	X##p4 += X##p5;                                         \
+	X##p5 = ROTL64(X##p5, R512ROT##_2);                     \
+	X##p5 ^= X##p4;                                         \
+	X##p6 += X##p7;                                         \
+	X##p7 = ROTL64(X##p7, R512ROT##_3);                     \
+	X##p7 ^= X##p6;
+
+#define I512(R)                                  \
+	X0 += ks[((R) + 1) % 9];                     \
+	X1 += ks[((R) + 2) % 9];                     \
+	X2 += ks[((R) + 3) % 9];                     \
+	X3 += ks[((R) + 4) % 9];                     \
+	X4 += ks[((R) + 5) % 9];                     \
+	X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \
+	X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \
+	X7 += ks[((R) + 8) % 9] + (R) + 1;
+
+#define R512_8_rounds(R)                                \
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \
+	I512(2 * (R));                                      \
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \
+	I512(2 * (R) + 1);
 
 typedef struct
 {
-	size_t  hashBitLen;
-	size_t  bCnt;
-	uint64_t  T[SKEIN_MODIFIER_WORDS];
+	size_t hashBitLen; /* requested digest length in bits */
+	size_t bCnt; /* number of bytes currently held in the context's b[] buffer */
+	uint64_t T[SKEIN_MODIFIER_WORDS]; /* tweak words T[0] and T[1] */
 } Skein_Ctxt_Hdr_t;
 
-typedef struct {
+typedef struct
+{
 	Skein_Ctxt_Hdr_t h;
-	uint64_t  X[SKEIN_256_STATE_WORDS];
-	uint8_t  b[SKEIN_256_BLOCK_BYTES];
+	uint64_t X[SKEIN_256_STATE_WORDS]; /* 4 state words */
+	uint8_t b[SKEIN_256_BLOCK_BYTES]; /* 32-byte block buffer */
 } Skein_256_Ctxt_t;
 
-typedef struct {
+typedef struct
+{
 	Skein_Ctxt_Hdr_t h;
-	uint64_t  X[SKEIN_512_STATE_WORDS];
-	uint8_t  b[SKEIN_512_BLOCK_BYTES];
+	uint64_t X[SKEIN_512_STATE_WORDS]; /* 8 chaining/state words */
+	uint8_t b[SKEIN_512_BLOCK_BYTES]; /* 64-byte buffer for a partially filled block */
 } Skein_512_Ctxt_t;
 
-typedef struct {
+typedef struct
+{
 	Skein_Ctxt_Hdr_t h;
-	uint64_t  X[SKEIN1024_STATE_WORDS];
-	uint8_t  b[SKEIN1024_BLOCK_BYTES];
+	uint64_t X[SKEIN1024_STATE_WORDS]; /* 16 state words */
+	uint8_t b[SKEIN1024_BLOCK_BYTES]; /* 128-byte block buffer */
 } Skein1024_Ctxt_t;
 
-typedef struct {
-	uint_t  statebits;
+typedef struct
+{
+	uint_t statebits;
 	union {
 		Skein_Ctxt_Hdr_t h;
 		Skein_256_Ctxt_t ctx_256;
@@ -127,21 +149,20 @@ typedef struct {
 	} u;
 } skeinHashState;
 
-__device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen)
+__device__ void cn_skein_init(skeinHashState* state, size_t hashBitLen)
 {
 	const uint64_t SKEIN_512_IV_256[] =
-	{
-		SKEIN_MK_64(0xCCD044A1,0x2FDB3E13),
-		SKEIN_MK_64(0xE8359030,0x1A79A9EB),
-		SKEIN_MK_64(0x55AEA061,0x4F816E6F),
-		SKEIN_MK_64(0x2A2767A4,0xAE9B94DB),
-		SKEIN_MK_64(0xEC06025E,0x74DD7683),
-		SKEIN_MK_64(0xE7A436CD,0xC4746251),
-		SKEIN_MK_64(0xC36FBAF9,0x393AD185),
-		SKEIN_MK_64(0x3EEDBA18,0x33EDFC13)
-	};
+		{
+			SKEIN_MK_64(0xCCD044A1, 0x2FDB3E13),
+			SKEIN_MK_64(0xE8359030, 0x1A79A9EB),
+			SKEIN_MK_64(0x55AEA061, 0x4F816E6F),
+			SKEIN_MK_64(0x2A2767A4, 0xAE9B94DB),
+			SKEIN_MK_64(0xEC06025E, 0x74DD7683),
+			SKEIN_MK_64(0xE7A436CD, 0xC4746251),
+			SKEIN_MK_64(0xC36FBAF9, 0x393AD185),
+			SKEIN_MK_64(0x3EEDBA18, 0x33EDFC13)};
 
-	Skein_512_Ctxt_t *ctx = &state->u.ctx_512;
+	Skein_512_Ctxt_t* ctx = &state->u.ctx_512;
 
 	ctx->h.hashBitLen = hashBitLen;
 
@@ -150,22 +171,47 @@ __device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen)
 	Skein_Start_New_Type(ctx, MSG);
 }
 
-__device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd)
+__device__ void cn_skein512_processblock(Skein_512_Ctxt_t* __restrict__ ctx, const uint8_t* __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd)
 {
-	enum {
-		R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
-		R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
-		R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
-		R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
-		R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
-		R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
-		R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
-		R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22
+	enum
+	{
+		R_512_0_0 = 46,
+		R_512_0_1 = 36,
+		R_512_0_2 = 19,
+		R_512_0_3 = 37,
+		R_512_1_0 = 33,
+		R_512_1_1 = 27,
+		R_512_1_2 = 14,
+		R_512_1_3 = 42,
+		R_512_2_0 = 17,
+		R_512_2_1 = 49,
+		R_512_2_2 = 36,
+		R_512_2_3 = 39,
+		R_512_3_0 = 44,
+		R_512_3_1 = 9,
+		R_512_3_2 = 54,
+		R_512_3_3 = 56,
+		R_512_4_0 = 39,
+		R_512_4_1 = 30,
+		R_512_4_2 = 34,
+		R_512_4_3 = 24,
+		R_512_5_0 = 13,
+		R_512_5_1 = 50,
+		R_512_5_2 = 10,
+		R_512_5_3 = 17,
+		R_512_6_0 = 25,
+		R_512_6_1 = 29,
+		R_512_6_2 = 39,
+		R_512_6_3 = 43,
+		R_512_7_0 = 8,
+		R_512_7_1 = 35,
+		R_512_7_2 = 56,
+		R_512_7_3 = 22
 	};
 
-	uint64_t X0,X1,X2,X3,X4,X5,X6,X7;
+	uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
 	uint64_t w[SKEIN_512_STATE_WORDS];
-	uint64_t kw[SKEIN_512_STATE_WORDS+4];
+	uint64_t kw[SKEIN_512_STATE_WORDS + 4];
 
 	ts[0] = ctx->h.T[0];
 	ts[1] = ctx->h.T[1];
@@ -184,7 +230,7 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co
 		ks[6] = ctx->X[6];
 		ks[7] = ctx->X[7];
 		ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
-		ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
+				ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
 
 		ts[2] = ts[0] ^ ts[1];
 
@@ -201,15 +247,15 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co
 
 		blkPtr += SKEIN_512_BLOCK_BYTES;
 
-		R512_8_rounds( 0);
-		R512_8_rounds( 1);
-		R512_8_rounds( 2);
-		R512_8_rounds( 3);
-		R512_8_rounds( 4);
-		R512_8_rounds( 5);
-		R512_8_rounds( 6);
-		R512_8_rounds( 7);
-		R512_8_rounds( 8);
+		R512_8_rounds(0);
+		R512_8_rounds(1);
+		R512_8_rounds(2);
+		R512_8_rounds(3);
+		R512_8_rounds(4);
+		R512_8_rounds(5);
+		R512_8_rounds(6);
+		R512_8_rounds(7);
+		R512_8_rounds(8);
 
 		ctx->X[0] = X0 ^ w[0];
 		ctx->X[1] = X1 ^ w[1];
@@ -221,125 +267,124 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co
 		ctx->X[7] = X7 ^ w[7];
 
 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
-	}
-	while (--blkCnt);
+	} while(--blkCnt);
 
 	ctx->h.T[0] = ts[0];
 	ctx->h.T[1] = ts[1];
 }
 
-__device__ void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
+__device__ void cn_skein_final(skeinHashState* __restrict__ state, uint8_t* __restrict__ hashVal) // process the buffered last block, then write (hashBitLen + 7) / 8 digest bytes to hashVal
 {
-	size_t i,n,byteCnt;
+	size_t i, n, byteCnt;
 	uint64_t X[SKEIN_512_STATE_WORDS];
-	Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
+	Skein_512_Ctxt_t* ctx = (Skein_512_Ctxt_t*)&state->u.ctx_512;
 	//size_t tmp;
 	//uint8_t *p8;
 	//uint64_t *p64;
 
 	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
 
-	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+	if(ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) // zero-fill the unused tail of a partially filled block
 	{
-		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+		memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
 		//p8 = &ctx->b[ctx->h.bCnt];
 		//tmp = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
 		//for( i = 0; i < tmp; i++ ) *(p8+i) = 0;
 	}
 
-	cn_skein512_processblock(ctx,ctx->b,1,ctx->h.bCnt);
+	cn_skein512_processblock(ctx, ctx->b, 1, ctx->h.bCnt); // one block, with bCnt passed as byteCntAdd
 
 	byteCnt = (ctx->h.hashBitLen + 7) >> 3;
 
 	//uint8_t  b[SKEIN_512_BLOCK_BYTES] == 64
-	memset(ctx->b,0,sizeof(ctx->b));
+	memset(ctx->b, 0, sizeof(ctx->b));
 	//p64 = (uint64_t *)ctx->b;
 	//for( i = 0; i < 8; i++ ) *(p64+i) = 0;
 
-	memcpy(X,ctx->X,sizeof(X));
+	memcpy(X, ctx->X, sizeof(X)); // keep a copy of the state; it is restored after every output block
 
-	for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)
+	for(i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) // one iteration per SKEIN_512_BLOCK_BYTES of digest
 	{
-		((uint64_t *)ctx->b)[0]= (uint64_t)i;
-		Skein_Start_New_Type(ctx,OUT_FINAL);
-		cn_skein512_processblock(ctx,ctx->b,1,sizeof(uint64_t));
-		n = byteCnt - i*SKEIN_512_BLOCK_BYTES;
-		if (n >= SKEIN_512_BLOCK_BYTES)
-		n  = SKEIN_512_BLOCK_BYTES;
-		memcpy(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);
-		memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+		((uint64_t*)ctx->b)[0] = (uint64_t)i; // the block input is the output-block counter
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		cn_skein512_processblock(ctx, ctx->b, 1, sizeof(uint64_t));
+		n = byteCnt - i * SKEIN_512_BLOCK_BYTES; // digest bytes still to be written
+		if(n >= SKEIN_512_BLOCK_BYTES)
+			n = SKEIN_512_BLOCK_BYTES; // copy at most one block per iteration
+		memcpy(hashVal + i * SKEIN_512_BLOCK_BYTES, ctx->X, n);
+		memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */
 	}
 }
 
-__device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt)
+__device__ void cn_skein512_update(Skein_512_Ctxt_t* __restrict__ ctx, const uint8_t* __restrict__ msg, size_t msgByteCnt) // absorb msgByteCnt bytes of msg, buffering the tail in ctx->b
 {
 	size_t n;
 
-	if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+	if(msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) // buffered + new data exceed one block
 	{
 
-		if (ctx->h.bCnt)
+		if(ctx->h.bCnt) // top up the partially filled buffer and process it first
 		{
 
 			n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
 
-			if (n)
+			if(n)
 			{
-				memcpy(&ctx->b[ctx->h.bCnt],msg,n);
-				msgByteCnt  -= n;
-				msg         += n;
+				memcpy(&ctx->b[ctx->h.bCnt], msg, n);
+				msgByteCnt -= n;
+				msg += n;
 				ctx->h.bCnt += n;
 			}
 
-			cn_skein512_processblock(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+			cn_skein512_processblock(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES);
 			ctx->h.bCnt = 0;
 		}
 
-		if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
+		if(msgByteCnt > SKEIN_512_BLOCK_BYTES) // process whole blocks directly from msg
 		{
-			n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;
-			cn_skein512_processblock(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
+			n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES; // the -1 leaves between 1 and 64 bytes unprocessed
+			cn_skein512_processblock(ctx, msg, n, SKEIN_512_BLOCK_BYTES);
 			msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
-			msg        += n * SKEIN_512_BLOCK_BYTES;
+			msg += n * SKEIN_512_BLOCK_BYTES;
 		}
 	}
 
-	if (msgByteCnt)
+	if(msgByteCnt) // append whatever is left to the buffer
 	{
-		memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+		memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt);
 		ctx->h.bCnt += msgByteCnt;
 	}
 }
 
-__device__ void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
+__device__ void cn_skein_update(skeinHashState* __restrict__ state, const BitSequence* __restrict__ data, DataLength databitlen) // absorb databitlen bits of data into the 512-bit context
 {
-	if ((databitlen & 7) == 0)
+	if((databitlen & 7) == 0) // length is a whole number of bytes
 	{
-		cn_skein512_update(&state->u.ctx_512,data,databitlen >> 3);
+		cn_skein512_update(&state->u.ctx_512, data, databitlen >> 3);
 	}
 	else
 	{
 
 		size_t bCnt = (databitlen >> 3) + 1;
-		uint8_t b,mask;
+		uint8_t b, mask;
 
-		mask = (uint8_t) (1u << (7 - (databitlen & 7)));
-		b    = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask);
+		mask = (uint8_t)(1u << (7 - (databitlen & 7))); // bit directly below the last message bit of the final byte
+		b = (uint8_t)((data[bCnt - 1] & (0 - mask)) | mask); // keep the bits above mask, set the mask bit, clear the bits below
 
-		cn_skein512_update(&state->u.ctx_512,data,bCnt-1);
-		cn_skein512_update(&state->u.ctx_512,&b  ,  1   );
+		cn_skein512_update(&state->u.ctx_512, data, bCnt - 1); // all complete bytes
+		cn_skein512_update(&state->u.ctx_512, &b, 1); // then the padded final byte
 
 		Skein_Set_Bit_Pad_Flag(state->u.h);
 	}
 }
 
-__device__ void cn_skein(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+__device__ void cn_skein(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval)
 {
 	int hashbitlen = 256;
 	DataLength databitlen = len << 3;
 	skeinHashState state;
 
-	state.statebits = 64*SKEIN_512_STATE_WORDS;
+	state.statebits = 64 * SKEIN_512_STATE_WORDS;
 
 	cn_skein_init(&state, hashbitlen);
 	cn_skein_update(&state, data, databitlen);
diff --git a/xmrstak/backend/plugin.hpp b/xmrstak/backend/plugin.hpp
index 5c7dfe16a..560507691 100644
--- a/xmrstak/backend/plugin.hpp
+++ b/xmrstak/backend/plugin.hpp
@@ -3,22 +3,22 @@
 #include "xmrstak/misc/environment.hpp"
 #include "xmrstak/params.hpp"
 
-#include <thread>
-#include <atomic>
-#include <vector>
-#include <string>
 #include "iBackend.hpp"
+#include <atomic>
 #include <iostream>
+#include <string>
+#include <thread>
+#include <vector>
 
 #ifndef USE_PRECOMPILED_HEADERS
-#	ifdef WIN32
-#		include <direct.h>
-#		include <windows.h>
-#	else
-#		include <sys/types.h>
-#		include <dlfcn.h>
-#	endif
-#	include <iostream>
+#ifdef WIN32
+#include <direct.h>
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#include <sys/types.h>
+#endif
+#include <iostream>
 #endif
 
 namespace xmrstak
@@ -36,41 +36,41 @@ struct plugin
 		libBackend = LoadLibrary(TEXT((libName + ".dll").c_str()));
 		if(!libBackend)
 		{
-			std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << (libName + ".dll") << std::endl;
+			std::cerr << "WARNING: " << m_backendName << " cannot load backend library: " << (libName + ".dll") << std::endl;
 			return;
 		}
 #else
 		// `.so` linux file extention for dynamic libraries
 		std::string fileExtension = ".so";
-#	if defined(__APPLE__)
+#if defined(__APPLE__)
 		// `.dylib` Mac OS X file extention for dynamic libraries
 		fileExtension = ".dylib";
-#	endif
+#endif
 		// search library in working directory
-		libBackend = dlopen(("./lib" + libName + fileExtension).c_str(), RTLD_LAZY);
+		libBackend = dlopen(("./lib" + libName + fileExtension).c_str(), RTLD_NOW | RTLD_GLOBAL); // RTLD_NOW and RTLD_LAZY are mutually exclusive
 		// fallback to binary directory
 		if(!libBackend)
-			libBackend = dlopen((params::inst().executablePrefix + "lib" + libName + fileExtension).c_str(), RTLD_LAZY);
+			libBackend = dlopen((params::inst().executablePrefix + "lib" + libName + fileExtension).c_str(), RTLD_NOW | RTLD_GLOBAL);
 		// try use LD_LIBRARY_PATH
 		if(!libBackend)
-			libBackend = dlopen(("lib" + libName + fileExtension).c_str(), RTLD_LAZY);
+			libBackend = dlopen(("lib" + libName + fileExtension).c_str(), RTLD_NOW | RTLD_GLOBAL);
 		if(!libBackend)
 		{
-			std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << dlerror() << std::endl;
+			std::cerr << "WARNING: " << m_backendName << " cannot load backend library: " << dlerror() << std::endl;
 			return;
 		}
 #endif
 
 #ifdef WIN32
-		fn_startBackend = (startBackend_t) GetProcAddress(libBackend, "xmrstak_start_backend");
-		if (!fn_startBackend)
+		fn_startBackend = (startBackend_t)GetProcAddress(libBackend, "xmrstak_start_backend");
+		if(!fn_startBackend)
 		{
-			std::cerr << "WARNING: backend plugin " << libName << " contains no entry 'xmrstak_start_backend': " <<GetLastError()<< std::endl;
+			std::cerr << "WARNING: backend plugin " << libName << " contains no entry 'xmrstak_start_backend': " << GetLastError() << std::endl;
 		}
 #else
 		// reset last error
 		dlerror();
-		fn_startBackend = (startBackend_t) dlsym(libBackend, "xmrstak_start_backend");
+		fn_startBackend = (startBackend_t)dlsym(libBackend, "xmrstak_start_backend");
 		const char* dlsym_error = dlerror();
 		if(dlsym_error)
 		{
@@ -112,7 +112,7 @@ struct plugin
 #ifdef WIN32
 	HINSTANCE libBackend;
 #else
-	void *libBackend = nullptr;
+	void* libBackend = nullptr;
 #endif
 };
 
diff --git a/xmrstak/backend/pool_data.hpp b/xmrstak/backend/pool_data.hpp
index 4e92359ec..632fc40ec 100644
--- a/xmrstak/backend/pool_data.hpp
+++ b/xmrstak/backend/pool_data.hpp
@@ -11,9 +11,11 @@ namespace xmrstak
 struct pool_data
 {
 	uint32_t iSavedNonce;
-	size_t   pool_id;
+	size_t pool_id;
 
-	pool_data() : iSavedNonce(0), pool_id(invalid_pool_id)
+	pool_data() :
+		iSavedNonce(0),
+		pool_id(invalid_pool_id)
 	{
 	}
 };
diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index 7b974f669..607e863e1 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -1,4 +1,4 @@
- /*
+/*
   * This program is free software: you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation, either version 3 of the License, or
@@ -21,38 +21,37 @@
   *
   */
 
-#include "xmrstak/misc/executor.hpp"
-#include "xmrstak/backend/miner_work.hpp"
-#include "xmrstak/backend/globalStates.hpp"
 #include "xmrstak/backend/backendConnector.hpp"
+#include "xmrstak/backend/globalStates.hpp"
+#include "xmrstak/backend/miner_work.hpp"
+#include "xmrstak/donate-level.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/configEditor.hpp"
 #include "xmrstak/misc/console.hpp"
-#include "xmrstak/donate-level.hpp"
+#include "xmrstak/misc/executor.hpp"
+#include "xmrstak/misc/utility.hpp"
 #include "xmrstak/params.hpp"
-#include "xmrstak/misc/configEditor.hpp"
 #include "xmrstak/version.hpp"
-#include "xmrstak/misc/utility.hpp"
 
 #ifndef CONF_NO_HTTPD
-#	include "xmrstak/http/httpd.hpp"
+#include "xmrstak/http/httpd.hpp"
 #endif
 
-#include <stdlib.h>
+#include <iostream>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string>
-#include <iostream>
 #include <time.h>
-#include <iostream>
 
 #ifndef CONF_NO_TLS
-#include <openssl/ssl.h>
 #include <openssl/err.h>
+#include <openssl/ssl.h>
 #endif
 
 #ifdef _WIN32
-#	define strcasecmp _stricmp
-#	include <windows.h>
-#	include "xmrstak/misc/uac.hpp"
+#define strcasecmp _stricmp
+#include "xmrstak/misc/uac.hpp"
+#include <windows.h>
 #endif // _WIN32
 
 int do_benchmark(int block_version, int wait_sec, int work_sec);
@@ -62,72 +61,79 @@ void help()
 	using namespace std;
 	using namespace xmrstak;
 
-	cout<<"Usage: "<<params::inst().binaryName<<" [OPTION]..."<<endl;
-	cout<<" "<<endl;
-	cout<<"  -h, --help                 show this help"<<endl;
-	cout<<"  -v, --version              show version number"<<endl;
-	cout<<"  -V, --version-long         show long version number"<<endl;
-	cout<<"  -c, --config FILE          common miner configuration file"<<endl;
-	cout<<"  -C, --poolconf FILE        pool configuration file"<<endl;
+	cout << "Usage: " << params::inst().binaryName << " [OPTION]..." << endl;
+	cout << " " << endl;
+	cout << "  -h, --help                 show this help" << endl;
+	cout << "  -v, --version              show version number" << endl;
+	cout << "  -V, --version-long         show long version number" << endl;
+	cout << "  -c, --config FILE          common miner configuration file" << endl;
+	cout << "  -C, --poolconf FILE        pool configuration file" << endl;
 #ifdef _WIN32
-	cout<<"  --noUAC                    disable the UAC dialog"<<endl;
+	cout << "  --noUAC                    disable the UAC dialog" << endl;
 #endif
-	cout<<"  --benchmark BLOCKVERSION   ONLY do a benchmark and exit"<<endl;
-	cout<<"  --benchwait WAIT_SEC             ... benchmark wait time"<<endl;
-	cout<<"  --benchwork WORK_SEC             ... benchmark work time"<<endl;
+	cout << "  --benchmark BLOCKVERSION   ONLY do a benchmark and exit" << endl;
+	cout << "  --benchwait WAIT_SEC             ... benchmark wait time" << endl;
+	cout << "  --benchwork WORK_SEC             ... benchmark work time" << endl;
 #ifndef CONF_NO_CPU
-	cout<<"  --noCPU                    disable the CPU miner backend"<<endl;
-	cout<<"  --cpu FILE                 CPU backend miner config file"<<endl;
+	cout << "  --noCPU                    disable the CPU miner backend" << endl;
+	cout << "  --cpu FILE                 CPU backend miner config file" << endl;
 #endif
 #ifndef CONF_NO_OPENCL
-	cout<<"  --noAMD                    disable the AMD miner backend"<<endl;
-	cout<<"  --noAMDCache               disable the AMD(OpenCL) cache for precompiled binaries"<<endl;
-	cout<<"  --openCLVendor VENDOR      use OpenCL driver of VENDOR and devices [AMD,NVIDIA]"<<endl;
-	cout<<"                             default: AMD"<<endl;
-	cout<<"  --amdCacheDir DIRECTORY    directory to store AMD binary files"<<endl;
-	cout<<"  --amd FILE                 AMD backend miner config file"<<endl;
+	cout << "  --noAMD                    disable the AMD miner backend" << endl;
+	cout << "  --amdGpus GPUS             indices of AMD GPUs to use. Example: 0,2,3" << endl;
+	cout << "  --noAMDCache               disable the AMD(OpenCL) cache for precompiled binaries" << endl;
+	cout << "  --openCLVendor VENDOR      use OpenCL driver of VENDOR and devices [AMD,NVIDIA]" << endl;
+	cout << "                             default: AMD" << endl;
+	cout << "  --amdCacheDir DIRECTORY    directory to store AMD binary files" << endl;
+	cout << "  --amd FILE                 AMD backend miner config file" << endl;
 #endif
 #ifndef CONF_NO_CUDA
-	cout<<"  --noNVIDIA                 disable the NVIDIA miner backend"<<endl;
-	cout<<"  --nvidia FILE              NVIDIA backend miner config file"<<endl;
+	cout << "  --noNVIDIA                 disable the NVIDIA miner backend" << endl;
+	cout << "  --nvidiaGpus GPUS          indices of NVIDIA GPUs to use. Example: 0,2,3" << endl;
+	cout << "  --nvidia FILE              NVIDIA backend miner config file" << endl;
 #endif
+	cout << "  --log FILE                 miner output file" << endl;
+	cout << "  --h-print-time SEC         interval for printing hashrate, in seconds" << endl;
 #ifndef CONF_NO_HTTPD
-	cout<<"  -i --httpd HTTP_PORT       HTTP interface port"<<endl;
+	cout << "  -i --httpd HTTP_PORT       HTTP interface port" << endl;
 #endif
-	cout<<" "<<endl;
-	cout<<"The following options can be used for automatic start without a guided config,"<<endl;
-	cout<<"If config exists then this pool will be top priority."<<endl;
-	cout<<"  -o, --url URL              pool url and port, e.g. pool.usxmrpool.com:3333"<<endl;
-	cout<<"  -O, --tls-url URL          TLS pool url and port, e.g. pool.usxmrpool.com:10443"<<endl;
-	cout<<"  -u, --user USERNAME        pool user name or wallet address"<<endl;
-	cout<<"  -r, --rigid RIGID          rig identifier for pool-side statistics (needs pool support)"<<endl;
-	cout<<"  -p, --pass PASSWD          pool password, in the most cases x or empty \"\""<<endl;
-	cout<<"  --use-nicehash             the pool should run in nicehash mode"<<endl;
-	cout<<"  --currency NAME            currency to mine"<<endl;
-	cout<< endl;
+	cout << " " << endl;
+	cout << "The following options can be used for automatic start without a guided config," << endl;
+	cout << "If config exists then this pool will be top priority." << endl;
+	cout << "  -o, --url URL              pool url and port, e.g. pool.usxmrpool.com:3333" << endl;
+	cout << "  -O, --tls-url URL          TLS pool url and port, e.g. pool.usxmrpool.com:10443" << endl;
+	cout << "  -u, --user USERNAME        pool user name or wallet address" << endl;
+	cout << "  -r, --rigid RIGID          rig identifier for pool-side statistics (needs pool support)" << endl;
+	cout << "  -p, --pass PASSWD          pool password, in the most cases x or empty \"\"" << endl;
+	cout << "  --use-nicehash             the pool should run in nicehash mode" << endl;
+	cout << "  --currency NAME            currency to mine" << endl;
+	cout << endl;
 #ifdef _WIN32
-	cout<<"Environment variables:\n"<<endl;
-	cout<<"  XMRSTAK_NOWAIT             disable the dialog `Press any key to exit."<<std::endl;
-	cout<<"                	            for non UAC execution"<<endl;
-	cout<< endl;
+	cout << "Environment variables:\n"
+		 << endl;
+	cout << "  XMRSTAK_NOWAIT             disable the dialog `Press any key to exit." << std::endl;
+	cout << "                	            for non UAC execution" << endl;
+	cout << endl;
 #endif
 	std::string algos;
 	jconf::GetAlgoList(algos);
-	cout<< "Supported coin options: " << endl << algos << endl;
-	cout<< "Version: " << get_version_str_short() << endl;
-	cout<<"Brought to by fireice_uk and psychocrypt under GPLv3."<<endl;
+	cout << "Supported coin options: " << endl
+		 << algos << endl;
+	cout << "Version: " << get_version_str_short() << endl;
+	cout << "Brought to by fireice_uk and psychocrypt under GPLv3." << endl;
 }
 
-bool read_yes_no(const char* str)
+bool read_yes_no(const char* str, std::string default_value = "")
 {
 	std::string tmp;
 	do
 	{
 		std::cout << str << std::endl;
-		std::cin >> tmp;
+		getline(std::cin, tmp);
+		if(tmp.empty())
+			tmp = default_value;
 		std::transform(tmp.begin(), tmp.end(), tmp.begin(), ::tolower);
-	}
-	while(tmp != "y" && tmp != "n" && tmp != "yes" && tmp != "no");
+	} while(tmp != "y" && tmp != "n" && tmp != "yes" && tmp != "no");
 
 	return tmp == "y" || tmp == "yes";
 }
@@ -139,34 +145,37 @@ inline const char* bool_to_str(bool v)
 
 std::string get_multipool_entry(bool& final)
 {
-	std::cout<<std::endl<<"- Next Pool:"<<std::endl<<std::endl;
+	std::cout << std::endl
+			  << "- Next Pool:" << std::endl
+			  << std::endl;
 
 	std::string pool;
-	std::cout<<"- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl;
+	std::cout << "- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl;
 	std::cin >> pool;
 
 	std::string userName;
-	std::cout<<"- Username (wallet address or pool login):"<<std::endl;
+	std::cout << "- Username (wallet address or pool login):" << std::endl;
 	std::cin >> userName;
 
 	std::string passwd;
-	std::cin.clear(); std::cin.ignore(INT_MAX,'\n');
-	std::cout<<"- Password (mostly empty or x):"<<std::endl;
+	std::cin.clear();
+	std::cin.ignore(INT_MAX, '\n');
+	std::cout << "- Password (mostly empty or x):" << std::endl;
 	getline(std::cin, passwd);
 
 	std::string rigid;
-	std::cout<<"- Rig identifier for pool-side statistics (needs pool support). Can be empty:"<<std::endl;
+	std::cout << "- Rig identifier for pool-side statistics (needs pool support). Can be empty:" << std::endl;
 	getline(std::cin, rigid);
 
 #ifdef CONF_NO_TLS
 	bool tls = false;
 #else
-	bool tls = read_yes_no("- Does this pool port support TLS/SSL? Use no if unknown. (y/N)");
+	bool tls = read_yes_no("- Does this pool port support TLS/SSL? Use no if unknown. (y/N)", "N");
 #endif
-	bool nicehash = read_yes_no("- Do you want to use nicehash on this pool? (y/n)");
+	bool nicehash = read_yes_no("- Do you want to use nicehash on this pool? (y/N)", "N");
 
 	int64_t pool_weight;
-	std::cout << "- Please enter a weight for this pool: "<<std::endl;
+	std::cout << "- Please enter a weight for this pool: " << std::endl;
 	while(!(std::cin >> pool_weight) || pool_weight <= 0)
 	{
 		std::cin.clear();
@@ -174,36 +183,46 @@ std::string get_multipool_entry(bool& final)
 		std::cout << "Invalid weight.  Try 1, 10, 100, etc:" << std::endl;
 	}
 
-	final = !read_yes_no("- Do you want to add another pool? (y/n)");
+	// `std::cin >> pool_weight` leaves the rest of the line (at least the newline) in the stream;
+	// drop it so the getline() in read_yes_no() waits for an answer instead of taking the default
+	std::cin.ignore(INT_MAX, '\n');
+	final = !read_yes_no("- Do you want to add another pool? (y/N)", "N");
 
-	return "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid +
-		"\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " +
-		bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
+	return "\t{\"pool_address\" : \"" + pool + "\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid +
+		   "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " +
+		   bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
 }
 
 inline void prompt_once(bool& prompted)
 {
 	if(!prompted)
 	{
-		std::cout<<"Please enter:"<<std::endl;
+		std::cout << "Please enter:" << std::endl;
 		prompted = true;
 	}
 }
 
+inline bool use_simple_start()
+{
+	// ask this question only once
+	static bool simple_start = read_yes_no("\nUse simple setup method? (Y/n)", "Y");
+	return simple_start;
+}
+
 void do_guided_pool_config()
 {
 	using namespace xmrstak;
 
 	// load the template of the backend config into a char variable
-	const char *tpl =
-		#include "../pools.tpl"
-	;
+	const char* tpl =
+#include "../pools.tpl"
+		;
 
 	configEditor configTpl{};
 	configTpl.set(std::string(tpl));
 	bool prompted = false;
 
-	auto& currency = params::inst().currency;
+	auto currency = params::inst().currency;
 	if(currency.empty() || !jconf::IsOnAlgoList(currency))
 	{
 		prompt_once(prompted);
@@ -213,97 +229,101 @@ void do_guided_pool_config()
 		{
 			std::string list;
 			jconf::GetAlgoList(list);
-			std::cout << "- Please enter the currency that you want to mine: "<<std::endl;
+			std::cout << "- Please enter the currency that you want to mine: " << std::endl;
 			std::cout << list << std::endl;
 			std::cin >> tmp;
 		}
 		currency = tmp;
 	}
 
-	auto& pool = params::inst().poolURL;
+	auto pool = params::inst().poolURL;
 	bool userSetPool = true;
 	if(pool.empty())
 	{
 		prompt_once(prompted);
 
 		userSetPool = false;
-		std::cout<<"- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl;
+		std::cout << "- Pool address: e.g. " << jconf::GetDefaultPool(currency.c_str()) << std::endl; // local copy holds the currency chosen above
 		std::cin >> pool;
 	}
 
-	auto& userName = params::inst().poolUsername;
+	auto userName = params::inst().poolUsername;
 	if(userName.empty())
 	{
 		prompt_once(prompted);
 
-		std::cout<<"- Username (wallet address or pool login):"<<std::endl;
+		std::cout << "- Username (wallet address or pool login):" << std::endl;
 		std::cin >> userName;
 	}
 
 	bool stdin_flushed = false;
-	auto& passwd = params::inst().poolPasswd;
+	auto passwd = params::inst().poolPasswd;
 	if(passwd.empty() && !params::inst().userSetPwd)
 	{
 		prompt_once(prompted);
 
 		// clear everything from stdin to allow an empty password
-		std::cin.clear(); std::cin.ignore(INT_MAX,'\n');
+		std::cin.clear();
+		std::cin.ignore(INT_MAX, '\n');
 		stdin_flushed = true;
 
-		std::cout<<"- Password (mostly empty or x):"<<std::endl;
+		std::cout << "- Password (mostly empty or x):" << std::endl;
 		getline(std::cin, passwd);
 	}
 
-	auto& rigid = params::inst().poolRigid;
+	auto rigid = params::inst().poolRigid;
 	if(rigid.empty() && !params::inst().userSetRigid)
 	{
-		prompt_once(prompted);
-
-		if(!stdin_flushed)
+		if(!use_simple_start())
 		{
-			// clear everything from stdin to allow an empty rigid
-			std::cin.clear(); std::cin.ignore(INT_MAX,'\n');
-		}
+			prompt_once(prompted);
 
-		std::cout<<"- Rig identifier for pool-side statistics (needs pool support). Can be empty:"<<std::endl;
-		getline(std::cin, rigid);
+			if(!stdin_flushed)
+			{
+				// clear everything from stdin to allow an empty rigid
+				std::cin.clear();
+				std::cin.ignore(INT_MAX, '\n');
+			}
+
+			std::cout << "- Rig identifier for pool-side statistics (needs pool support). Can be empty:" << std::endl;
+			getline(std::cin, rigid);
+		}
 	}
 
-	bool tls;
+	bool tls = params::inst().poolUseTls;
 #ifdef CONF_NO_TLS
 	tls = false;
 #else
 	if(!userSetPool)
 	{
 		prompt_once(prompted);
-		tls = read_yes_no("- Does this pool port support TLS/SSL? Use no if unknown. (y/N)");
+		tls = read_yes_no("- Does this pool port support TLS/SSL? Use no if unknown. (y/N)", "N");
 	}
-	else
-		tls = params::inst().poolUseTls;
+
 #endif
 
-	bool nicehash;
+	bool nicehash = params::inst().nicehashMode;
 	if(!userSetPool)
 	{
-		prompt_once(prompted);
-		nicehash = read_yes_no("- Do you want to use nicehash on this pool? (y/n)");
+		if(!use_simple_start())
+		{
+			prompt_once(prompted);
+			nicehash = read_yes_no("- Do you want to use nicehash on this pool? (y/N)", "N");
+		}
 	}
-	else
-		nicehash = params::inst().nicehashMode;
 
-	bool multipool;
+	bool multipool = false;
 	if(!userSetPool)
-		multipool = read_yes_no("- Do you want to use multiple pools? (y/n)");
-	else
-		multipool = false;
+		if(!use_simple_start())
+			multipool = read_yes_no("- Do you want to use multiple pools? (y/N)", "N");
 
-	int64_t pool_weight;
+	int64_t pool_weight = 1;
 	if(multipool)
 	{
 		std::cout << "Pool weight is a number telling the miner how important the pool is." << std::endl;
 		std::cout << "Miner will mine mostly at the pool with the highest weight, unless the pool fails." << std::endl;
 		std::cout << "Weight must be an integer larger than 0." << std::endl;
-		std::cout << "- Please enter a weight for this pool: "<<std::endl;
+		std::cout << "- Please enter a weight for this pool: " << std::endl;
 
 		while(!(std::cin >> pool_weight) || pool_weight <= 0)
 		{
@@ -312,13 +332,11 @@ void do_guided_pool_config()
 			std::cout << "Invalid weight.  Try 1, 10, 100, etc:" << std::endl;
 		}
 	}
-	else
-		pool_weight = 1;
 
 	std::string pool_table;
-	pool_table += "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName +  "\", \"rig_id\" : \"" + rigid +
-		"\", \"pool_password\" : \"" +  passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " +
-		bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
+	pool_table += "\t{\"pool_address\" : \"" + pool + "\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid +
+				  "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " +
+				  bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
 
 	if(multipool)
 	{
@@ -326,14 +344,13 @@ void do_guided_pool_config()
 		do
 		{
 			pool_table += get_multipool_entry(final);
-		}
-		while(!final);
+		} while(!final);
 	}
 
 	configTpl.replace("CURRENCY", currency);
 	configTpl.replace("POOLCONF", pool_table);
 	configTpl.write(params::inst().configFilePools);
-	std::cout<<"Pool configuration stored in file '"<<params::inst().configFilePools<<"'"<<std::endl;
+	std::cout << "Pool configuration stored in file '" << params::inst().configFilePools << "'" << std::endl;
 }
 
 void do_guided_config()
@@ -341,44 +358,47 @@ void do_guided_config()
 	using namespace xmrstak;
 
 	// load the template of the backend config into a char variable
-	const char *tpl =
-		#include "../config.tpl"
-	;
+	const char* tpl =
+#include "../config.tpl"
+		;
 
 	configEditor configTpl{};
 	configTpl.set(std::string(tpl));
 	bool prompted = false;
 
-	auto& http_port = params::inst().httpd_port;
+	auto http_port = params::inst().httpd_port;
 	if(http_port == params::httpd_port_unset)
 	{
-#if defined(CONF_NO_HTTPD)
 		http_port = params::httpd_port_disabled;
-#else
-		prompt_once(prompted);
+#ifndef CONF_NO_HTTPD
+		if(!use_simple_start())
+		{
+			prompt_once(prompted);
 
-		std::cout<<"- Do you want to use the HTTP interface?" <<std::endl;
-		std::cout<<"Unlike the screen display, browser interface is not affected by the GPU lag." <<std::endl;
-		std::cout<<"If you don't want to use it, please enter 0, otherwise enter port number that the miner should listen on" <<std::endl;
+			std::cout << "- Do you want to use the HTTP interface?" << std::endl;
+			std::cout << "Unlike the screen display, browser interface is not affected by the GPU lag." << std::endl;
+			std::cout << "If you don't want to use it, please enter 0, otherwise enter port number that the miner should listen on" << std::endl;
 
-		int32_t port;
-		while(!(std::cin >> port) || port < 0 || port > 65535)
-		{
-			std::cin.clear();
-			std::cin.ignore(INT_MAX, '\n');
-			std::cout << "Invalid port number. Please enter a number between 0 and 65535." << std::endl;
+			int32_t port;
+			while(!(std::cin >> port) || port < 0 || port > 65535)
+			{
+				std::cin.clear();
+				std::cin.ignore(INT_MAX, '\n');
+				std::cout << "Invalid port number. Please enter a number between 0 and 65535." << std::endl;
+			}
+			http_port = port;
 		}
-
-		http_port = port;
 #endif
 	}
 
 	configTpl.replace("HTTP_PORT", std::to_string(http_port));
+	configTpl.replace("OUTPUT_FILE", params::inst().outputFile);
+	configTpl.replace("H_PRINT_TIME", std::to_string(params::inst().h_print_time > 0 ? params::inst().h_print_time : 300));
 	configTpl.write(params::inst().configFile);
-	std::cout<<"Configuration stored in file '"<<params::inst().configFile<<"'"<<std::endl;
+	std::cout << "Configuration stored in file '" << params::inst().configFile << "'" << std::endl;
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
 #ifndef CONF_NO_TLS
 	SSL_library_init();
@@ -419,7 +439,7 @@ int main(int argc, char *argv[])
 	}
 
 	bool pool_url_set = false;
-	for(size_t i = 1; i < argc-1; i++)
+	for(size_t i = 1; i < argc - 1; i++)
 	{
 		std::string opName(argv[i]);
 		if(opName == "-o" || opName == "-O" || opName == "--url" || opName == "--tls-url")
@@ -437,13 +457,13 @@ int main(int argc, char *argv[])
 		}
 		if(opName.compare("-v") == 0 || opName.compare("--version") == 0)
 		{
-			std::cout<< "Version: " << get_version_str_short() << std::endl;
+			std::cout << "Version: " << get_version_str_short() << std::endl;
 			win_exit();
 			return 0;
 		}
 		else if(opName.compare("-V") == 0 || opName.compare("--version-long") == 0)
 		{
-			std::cout<< "Version: " << get_version_str() << std::endl;
+			std::cout << "Version: " << get_version_str() << std::endl;
 			win_exit();
 			return 0;
 		}
@@ -455,10 +475,21 @@ int main(int argc, char *argv[])
 		{
 			params::inst().useAMD = false;
 		}
+		else if(opName.compare("--amdGpus") == 0)
+		{
+			++i; // the GPU index list is the next command line argument
+			if(i >= argc)
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '--amdGpus' given");
+				win_exit();
+				return 1;
+			}
+			params::inst().amdGpus = argv[i];
+		}
 		else if(opName.compare("--openCLVendor") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--openCLVendor' given");
 				win_exit();
@@ -481,10 +512,21 @@ int main(int argc, char *argv[])
 		{
 			params::inst().useNVIDIA = false;
 		}
+		else if(opName.compare("--nvidiaGpus") == 0)
+		{
+			++i; // the GPU index list is the next command line argument
+			if(i >= argc)
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '--nvidiaGpus' given");
+				win_exit();
+				return 1;
+			}
+			params::inst().nvidiaGpus = argv[i];
+		}
 		else if(opName.compare("--cpu") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--cpu' given");
 				win_exit();
@@ -495,7 +537,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--amd") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--amd' given");
 				win_exit();
@@ -506,7 +548,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--amdCacheDir") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--amdCacheDir' given");
 				win_exit();
@@ -517,7 +559,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--nvidia") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--nvidia' given");
 				win_exit();
@@ -528,7 +570,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--currency") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--currency' given");
 				win_exit();
@@ -539,7 +581,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("-o") == 0 || opName.compare("--url") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-o/--url' given");
 				win_exit();
@@ -551,7 +593,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("-O") == 0 || opName.compare("--tls-url") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-O/--tls-url' given");
 				win_exit();
@@ -570,7 +612,7 @@ int main(int argc, char *argv[])
 			}
 
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-u/--user' given");
 				win_exit();
@@ -588,7 +630,7 @@ int main(int argc, char *argv[])
 			}
 
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-p/--pass' given");
 				win_exit();
@@ -607,7 +649,7 @@ int main(int argc, char *argv[])
 			}
 
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-r/--rigid' given");
 				win_exit();
@@ -624,7 +666,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("-c") == 0 || opName.compare("--config") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-c/--config' given");
 				win_exit();
@@ -635,7 +677,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("-C") == 0 || opName.compare("--poolconf") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-C/--poolconf' given");
 				win_exit();
@@ -643,10 +685,40 @@ int main(int argc, char *argv[])
 			}
 			params::inst().configFilePools = argv[i];
 		}
+		else if(opName.compare("--log") == 0)
+		{
+			++i;
+			if(i >= argc)
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '--log' given");
+				win_exit();
+				return 1;
+			}
+			params::inst().outputFile = argv[i];
+		}
+		else if(opName.compare("--h-print-time") == 0)
+		{
+			++i;
+			if(i >= argc)
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '--h-print-time' given");
+				win_exit();
+				return 1;
+			}
+			// non-numeric input makes strtol return 0, which the range check below rejects
+			long int seconds = strtol(argv[i], nullptr, 10);
+			if(seconds <= 0)
+			{
+				printer::inst()->print_msg(L0, "Hashrate print time must be > 0");
+				win_exit(); // same exit sequence as the other argument errors in this loop
+				return 1;
+			}
+			params::inst().h_print_time = seconds;
+		}
 		else if(opName.compare("-i") == 0 || opName.compare("--httpd") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-i/--httpd' given");
 				win_exit();
@@ -672,7 +744,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--benchmark") == 0)
 		{
 			++i;
-			if( i >= argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--benchmark' given");
 				win_exit();
@@ -691,7 +763,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--benchwait") == 0)
 		{
 			++i;
-			if( i >= argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--benchwait' given");
 				win_exit();
@@ -710,7 +782,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--benchwork") == 0)
 		{
 			++i;
-			if( i >= argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--benchwork' given");
 				win_exit();
@@ -728,17 +800,20 @@ int main(int argc, char *argv[])
 		}
 		else
 		{
-			printer::inst()->print_msg(L0, "Parameter unknown '%s'",argv[i]);
+			printer::inst()->print_msg(L0, "Parameter unknown '%s'", argv[i]);
 			win_exit();
 			return 1;
 		}
 	}
 
+	bool hasConfigFile = configEditor::file_exist(params::inst().configFile);
+	bool hasPoolConfig = configEditor::file_exist(params::inst().configFilePools);
+
 	// check if we need a guided start
-	if(!configEditor::file_exist(params::inst().configFile))
+	if(!hasConfigFile)
 		do_guided_config();
 
-	if(!configEditor::file_exist(params::inst().configFilePools))
+	if(!hasPoolConfig)
 		do_guided_pool_config();
 
 	if(!jconf::inst()->parse_config(params::inst().configFile.c_str(), params::inst().configFilePools.c_str()))
@@ -759,7 +834,7 @@ int main(int argc, char *argv[])
 	if(strlen(jconf::inst()->GetOutputFile()) != 0)
 		printer::inst()->open_logfile(jconf::inst()->GetOutputFile());
 
-	if (!BackendConnector::self_test())
+	if(!BackendConnector::self_test())
 	{
 		printer::inst()->print_msg(L0, "Self test not passed!");
 		win_exit();
@@ -773,7 +848,7 @@ int main(int argc, char *argv[])
 		win_exit();
 		return 1;
 #else
-		if (!httpd::inst()->start_daemon())
+		if(!httpd::inst()->start_daemon())
 		{
 			win_exit();
 			return 1;
@@ -847,7 +922,7 @@ int main(int argc, char *argv[])
 		uint64_t currentTime = get_timestamp_ms();
 
 		/* Hard guard to make sure we never get called more than twice per second */
-		if( currentTime - lastTime < 500)
+		if(currentTime - lastTime < 500)
 			std::this_thread::sleep_for(std::chrono::milliseconds(500 - (currentTime - lastTime)));
 		lastTime = currentTime;
 	}
@@ -862,8 +937,14 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)
 
 	printer::inst()->print_msg(L0, "Prepare benchmark for block version %d", block_version);
 
+	if(block_version <= 0) // reject non-positive versions before the value is narrowed to uint8_t for work[0]
+	{
+		printer::inst()->print_msg(L0, "Block version must be >0, current value is %d.", block_version); // block_version is a signed int, so it is printed with %d
+		return 1;
+	}
+
 	uint8_t work[128];
-	memset(work,0,128);
+	memset(work, 0, 128);
 	work[0] = static_cast<uint8_t>(block_version);
 
 	xmrstak::pool_data dat;
@@ -871,20 +952,20 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)
 	xmrstak::miner_work oWork = xmrstak::miner_work();
 	pvThreads = xmrstak::BackendConnector::thread_starter(oWork);
 
-	printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized",wait_sec);
+	printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized", wait_sec);
 	std::this_thread::sleep_for(std::chrono::seconds(wait_sec));
 
 	/* AMD and NVIDIA is currently only supporting work sizes up to 128byte
 	 */
-	printer::inst()->print_msg(L0, "Start a %d second benchmark...",work_sec);
-	xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat);
+	printer::inst()->print_msg(L0, "Start a %d second benchmark...", work_sec);
+	xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 1, 0), dat);
 	uint64_t iStartStamp = get_timestamp_ms();
 
 	std::this_thread::sleep_for(std::chrono::seconds(work_sec));
 	xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat);
 
 	double fTotalHps = 0.0;
-	for (uint32_t i = 0; i < pvThreads->size(); i++)
+	for(uint32_t i = 0; i < pvThreads->size(); i++)
 	{
 		double fHps = pvThreads->at(i)->iHashCount;
 		fHps /= (pvThreads->at(i)->iTimestamp - iStartStamp) / 1000.0;
@@ -892,7 +973,7 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)
 		auto bType = static_cast<xmrstak::iBackend::BackendType>(pvThreads->at(i)->backendType);
 		std::string name(xmrstak::iBackend::getName(bType));
 
-		printer::inst()->print_msg(L0, "Benchmark Thread %u %s: %.1f H/S", i,name.c_str(), fHps);
+		printer::inst()->print_msg(L0, "Benchmark Thread %u %s: %.1f H/S", i, name.c_str(), fHps);
 		fTotalHps += fHps;
 	}
 
diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl
index d8fd861a7..27b12c52f 100644
--- a/xmrstak/config.tpl
+++ b/xmrstak/config.tpl
@@ -43,7 +43,7 @@ R"===(// generated by XMRSTAK_VERSION
  * h_print_time - How often, in seconds, should we print a hashrate report if verbose_level is set to 4.
  *                This option has no effect if verbose_level is not 4.
  */
-"h_print_time" : 300,
+"h_print_time" : H_PRINT_TIME,
 
 /*
  * Manual hardware AES override
@@ -129,7 +129,7 @@ R"===(// generated by XMRSTAK_VERSION
  * output_file  - This option will log all output to a file.
  *
  */
-"output_file" : "",
+"output_file" : "OUTPUT_FILE",
 
 /*
  * Built-in web server
diff --git a/xmrstak/http/httpd.cpp b/xmrstak/http/httpd.cpp
index ed9abc2bc..b4f0f547e 100644
--- a/xmrstak/http/httpd.cpp
+++ b/xmrstak/http/httpd.cpp
@@ -23,16 +23,15 @@
 
 #ifndef CONF_NO_HTTPD
 
-
 #include "httpd.hpp"
 #include "webdesign.hpp"
-#include "xmrstak/net/msgstruct.hpp"
+#include "xmrstak/jconf.hpp"
 #include "xmrstak/misc/console.hpp"
 #include "xmrstak/misc/executor.hpp"
-#include "xmrstak/jconf.hpp"
+#include "xmrstak/net/msgstruct.hpp"
 
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <string>
 
@@ -45,21 +44,20 @@ httpd* httpd::oInst = nullptr;
 
 httpd::httpd()
 {
-
 }
 
-int httpd::req_handler(void * cls,
-			MHD_Connection* connection,
-			const char* url,
-			const char* method,
-			const char* version,
-			const char* upload_data,
-			size_t* upload_data_size,
-			void ** ptr)
+int httpd::req_handler(void* cls,
+	MHD_Connection* connection,
+	const char* url,
+	const char* method,
+	const char* version,
+	const char* upload_data,
+	size_t* upload_data_size,
+	void** ptr)
 {
-	struct MHD_Response * rsp;
+	struct MHD_Response* rsp;
 
-	if (strcmp(method, "GET") != 0)
+	if(strcmp(method, "GET") != 0)
 		return MHD_NO;
 
 	if(strlen(jconf::inst()->GetHttpUsername()) != 0)
@@ -68,7 +66,7 @@ int httpd::req_handler(void * cls,
 		int ret;
 
 		username = MHD_digest_auth_get_username(connection);
-		if (username == NULL)
+		if(username == NULL)
 		{
 			rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT);
 			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, MHD_NO);
@@ -78,7 +76,7 @@ int httpd::req_handler(void * cls,
 		free(username);
 
 		ret = MHD_digest_auth_check(connection, sHttpAuthRealm, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300);
-		if (ret == MHD_INVALID_NONCE || ret == MHD_NO)
+		if(ret == MHD_INVALID_NONCE || ret == MHD_NO)
 		{
 			rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT);
 			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO);
@@ -174,4 +172,3 @@ bool httpd::start_daemon()
 }
 
 #endif
-
diff --git a/xmrstak/http/httpd.hpp b/xmrstak/http/httpd.hpp
index fe534f038..dfad082ca 100644
--- a/xmrstak/http/httpd.hpp
+++ b/xmrstak/http/httpd.hpp
@@ -7,27 +7,28 @@ struct MHD_Connection;
 
 class httpd
 {
-public:
+  public:
 	static httpd* inst()
 	{
-		if (oInst == nullptr) oInst = new httpd;
+		if(oInst == nullptr)
+			oInst = new httpd;
 		return oInst;
 	};
 
 	bool start_daemon();
 
-private:
+  private:
 	httpd();
 	static httpd* oInst;
 
-	static int req_handler(void * cls,
-			MHD_Connection* connection,
-			const char* url,
-			const char* method,
-			const char* version,
-			const char* upload_data,
-			size_t* upload_data_size,
-			void ** ptr);
+	static int req_handler(void* cls,
+		MHD_Connection* connection,
+		const char* url,
+		const char* method,
+		const char* version,
+		const char* upload_data,
+		size_t* upload_data_size,
+		void** ptr);
 
-	MHD_Daemon *d;
+	MHD_Daemon* d;
 };
diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp
index 8f20078aa..fbd565269 100644
--- a/xmrstak/http/webdesign.cpp
+++ b/xmrstak/http/webdesign.cpp
@@ -1,114 +1,114 @@
 #include <stdlib.h>
 
-extern const char sHtmlCssEtag [] = "00000009";
-extern const char sHtmlCssFile [] =
+extern const char sHtmlCssEtag[] = "00000009";
+extern const char sHtmlCssFile[] =
 	"body {"
-		"font-family: Tahoma, Arial, sans-serif;"
-		"font-size: 80%;"
-		"background-color: rgb(240, 240, 240);"
+	"font-family: Tahoma, Arial, sans-serif;"
+	"font-size: 80%;"
+	"background-color: rgb(240, 240, 240);"
 	"}"
 
 	"a {"
-		"color: rgb(44, 55, 66);"
+	"color: rgb(44, 55, 66);"
 	"}"
 
 	"a:link {"
-		"text-decoration: none;"
+	"text-decoration: none;"
 	"}"
 
 	"a:visited {"
-		"color: rgb(44, 55, 66);"
+	"color: rgb(44, 55, 66);"
 	"}"
 
 	"a:hover {"
-		"color: rgb(255, 153, 0);"
+	"color: rgb(255, 153, 0);"
 	"}"
 
 	"a:active {"
-		"color: rgb(204, 122, 0);"
+	"color: rgb(204, 122, 0);"
 	"}"
 
 	".all {"
-		"max-width:600px;"
-		"margin: auto;"
+	"max-width:600px;"
+	"margin: auto;"
 	"}"
 
 	".header {"
-		"background-color: rgb(30, 30, 30);"
-		"color: white;"
-		"padding: 10px;"
-		"font-weight: bold;"
-		"margin: 0px;"
-		"margin-bottom: 10px;"
+	"background-color: rgb(30, 30, 30);"
+	"color: white;"
+	"padding: 10px;"
+	"font-weight: bold;"
+	"margin: 0px;"
+	"margin-bottom: 10px;"
 	"}"
 
 	".version {"
-		"font-size: 75%;"
-		"text-align: right;"
+	"font-size: 75%;"
+	"text-align: right;"
 	"}"
 
 	".links {"
-		"padding: 7px;"
-		"text-align: center;"
-		"background-color: rgb(215, 215, 215);"
-		"box-shadow: 0px 1px 3px 0px rgba(0, 0, 0, 0.2), 0px 1px 1px 0px rgba(0, 0, 0, 0.14), 0px 2px 1px -1px rgba(0, 0, 0, 0.12);"
+	"padding: 7px;"
+	"text-align: center;"
+	"background-color: rgb(215, 215, 215);"
+	"box-shadow: 0px 1px 3px 0px rgba(0, 0, 0, 0.2), 0px 1px 1px 0px rgba(0, 0, 0, 0.14), 0px 2px 1px -1px rgba(0, 0, 0, 0.12);"
 	"}"
 
 	".data th, td {"
-		"padding: 5px 12px;"
-		"text-align: right;"
-		"border-bottom: 1px solid #ccc;"
+	"padding: 5px 12px;"
+	"text-align: right;"
+	"border-bottom: 1px solid #ccc;"
 	"}"
 
 	".data tr:nth-child(even) {"
-		"background-color: #ddd;"
+	"background-color: #ddd;"
 	"}"
 
 	".data th {"
-		"background-color: #ccc;"
+	"background-color: #ccc;"
 	"}"
 
 	".data table {"
-		"width: 100%;"
-		"max-width: 600px;"
+	"width: 100%;"
+	"max-width: 600px;"
 	"}"
 
 	".letter {"
-		"font-weight: bold;"
+	"font-weight: bold;"
 	"}"
 
 	"h4 {"
-		"background-color: rgb(0, 130, 130);"
-		"color: white;"
-		"padding: 10px;"
-		"margin: 10px 0px;"
+	"background-color: rgb(0, 130, 130);"
+	"color: white;"
+	"padding: 10px;"
+	"margin: 10px 0px;"
 	"}"
 
 	".flex-container {"
-		"display: -webkit-flex;"
-		"display: flex;"
+	"display: -webkit-flex;"
+	"display: flex;"
 	"}"
 
 	".flex-item {"
-		"width: 33%;"
-		"margin: 3px;"
+	"width: 33%;"
+	"margin: 3px;"
 	"}"
 
 	".motd-box {"
-		"background-color: #ccc;"
-		"padding: 0px 10px 5px 10px;"
-		"margin-bottom: 10px;"
+	"background-color: #ccc;"
+	"padding: 0px 10px 5px 10px;"
+	"margin-bottom: 10px;"
 	"}"
 
 	".motd-head {"
-		"border-bottom: 1px solid #000;"
-		"margin-bottom: 0.5em;"
-		"padding: 0.5em 0em;"
-		"font-weight: bold;"
+	"border-bottom: 1px solid #000;"
+	"margin-bottom: 0.5em;"
+	"padding: 0.5em 0em;"
+	"font-weight: bold;"
 	"}"
 
 	".motd-body {"
-		"overflow: hidden;"
+	"overflow: hidden;"
 	"}";
 
 size_t sHtmlCssSize = sizeof(sHtmlCssFile) - 1;
@@ -124,7 +124,7 @@ extern const char sHtmlAccessDenied[] =
 
 size_t sHtmlAccessDeniedSize = sizeof(sHtmlAccessDenied) - 1;
 
-extern const char sHtmlCommonHeader [] =
+extern const char sHtmlCommonHeader[] =
 	"<!DOCTYPE html>"
 	"<html>"
 	"<head><meta name='viewport' content='width=device-width' />"
@@ -135,15 +135,15 @@ extern const char sHtmlCommonHeader [] =
 	"<div class='header'><span style='color: rgb(255, 160, 0)'>XMR</span>-Stak Monero Miner</div>"
 
 	"<div class='flex-container'>"
-		"<div class='links flex-item'>"
-			"<a href='h'><div><span class='letter'>H</span>ashrate</div></a>"
-		"</div>"
-		"<div class='links flex-item'>"
-			"<a href='r'><div><span class='letter'>R</span>esults</div></a>"
-		"</div>"
-		"<div class='links flex-item'>"
-			"<a href='c'><div><span class='letter'>C</span>onnection</div></a>"
-		"</div>"
+	"<div class='links flex-item'>"
+	"<a href='h'><div><span class='letter'>H</span>ashrate</div></a>"
+	"</div>"
+	"<div class='links flex-item'>"
+	"<a href='r'><div><span class='letter'>R</span>esults</div></a>"
+	"</div>"
+	"<div class='links flex-item'>"
+	"<a href='c'><div><span class='letter'>C</span>onnection</div></a>"
+	"</div>"
 	"</div>"
 	"<h4>%s</h4>";
 
@@ -151,61 +151,61 @@ extern const char sHtmlMotdBoxStart[] = "<div class='motd-box'>";
 extern const char sHtmlMotdEntry[] = "<div class='motd-head'>Message from %s</div><div class='motd-body'>%s</div>";
 extern const char sHtmlMotdBoxEnd[] = "</div>";
 
-extern const char sHtmlHashrateBodyHigh [] =
+extern const char sHtmlHashrateBodyHigh[] =
 	"<div class='data'>"
 	"<table>"
-		"<tr><th>Thread ID</th><th>10s</th><th>60s</th><th>15m</th><th rowspan='%u'>H/s</td></tr>";
+	"<tr><th>Thread ID</th><th>10s</th><th>60s</th><th>15m</th><th rowspan='%u'>H/s</th></tr>"; // %u fills the rowspan of the H/s header cell; the cell is closed with </th> to match its opening tag
 
-extern const char sHtmlHashrateTableRow [] =
+extern const char sHtmlHashrateTableRow[] =
 	"<tr><th>%s</th><td>%s</td><td>%s</td><td>%s</td></tr>";
 
-extern const char sHtmlHashrateBodyLow [] =
-		"<tr><th>Totals:</th><td>%s</td><td>%s</td><td>%s</td></tr>"
-		"<tr><th>Highest:</th><td>%s</td><td colspan='2'></td></tr>"
+extern const char sHtmlHashrateBodyLow[] =
+	"<tr><th>Totals:</th><td>%s</td><td>%s</td><td>%s</td></tr>"
+	"<tr><th>Highest:</th><td>%s</td><td colspan='2'></td></tr>"
 	"</table>"
 	"</div></div></body></html>";
 
-extern const char sHtmlConnectionBodyHigh [] =
+extern const char sHtmlConnectionBodyHigh[] =
 	"<div class='data'>"
 	"<table>"
-		"<tr><th>Rig ID</th><td>%s</td></tr>"
-		"<tr><th>Pool address</th><td>%s</td></tr>"
-		"<tr><th>Connected since</th><td>%s</td></tr>"
-		"<tr><th>Pool ping time</th><td>%u ms</td></tr>"
+	"<tr><th>Rig ID</th><td>%s</td></tr>"
+	"<tr><th>Pool address</th><td>%s</td></tr>"
+	"<tr><th>Connected since</th><td>%s</td></tr>"
+	"<tr><th>Pool ping time</th><td>%u ms</td></tr>"
 	"</table>"
 	"<h4>Network error log</h4>"
 	"<table>"
-		"<tr><th style='width: 20%; min-width: 10em;'>Date</th><th>Error</th></tr>";
+	"<tr><th style='width: 20%; min-width: 10em;'>Date</th><th>Error</th></tr>";
 
-extern const char sHtmlConnectionTableRow [] =
+extern const char sHtmlConnectionTableRow[] =
 	"<tr><td>%s</td><td>%s</td></tr>";
 
-extern const char sHtmlConnectionBodyLow [] =
+extern const char sHtmlConnectionBodyLow[] =
 	"</table></div></div></body></html>";
 
-extern const char sHtmlResultBodyHigh [] =
+extern const char sHtmlResultBodyHigh[] =
 	"<div class='data'>"
 	"<table>"
-		"<tr><th>Currency</th><td>%s</td></tr>"
-		"<tr><th>Difficulty</th><td>%u</td></tr>"
-		"<tr><th>Good results</th><td>%u / %u (%.1f %%)</td></tr>"
-		"<tr><th>Avg result time</th><td>%.1f sec</td></tr>"
-		"<tr><th>Pool-side hashes</th><td>%u</td></tr>"
+	"<tr><th>Currency</th><td>%s</td></tr>"
+	"<tr><th>Difficulty</th><td>%u</td></tr>"
+	"<tr><th>Good results</th><td>%u / %u (%.1f %%)</td></tr>"
+	"<tr><th>Avg result time</th><td>%.1f sec</td></tr>"
+	"<tr><th>Pool-side hashes</th><td>%u</td></tr>"
 	"</table>"
 	"<h4>Top 10 best results found</h4>"
 	"<table>"
-		"<tr><th style='width: 2em;'>1</th><td>%llu</td><th style='width: 2em;'>2</th><td>%llu</td></tr>"
-		"<tr><th>3</th><td>%llu</td><th>4</th><td>%llu</td></tr>"
-		"<tr><th>5</th><td>%llu</td><th>6</th><td>%llu</td></tr>"
-		"<tr><th>7</th><td>%llu</td><th>8</th><td>%llu</td></tr>"
-		"<tr><th>9</th><td>%llu</td><th>10</th><td>%llu</td></tr>"
+	"<tr><th style='width: 2em;'>1</th><td>%llu</td><th style='width: 2em;'>2</th><td>%llu</td></tr>"
+	"<tr><th>3</th><td>%llu</td><th>4</th><td>%llu</td></tr>"
+	"<tr><th>5</th><td>%llu</td><th>6</th><td>%llu</td></tr>"
+	"<tr><th>7</th><td>%llu</td><th>8</th><td>%llu</td></tr>"
+	"<tr><th>9</th><td>%llu</td><th>10</th><td>%llu</td></tr>"
 	"</table>"
 	"<h4>Error details</h4>"
 	"<table>"
-		"<tr><th colspan='2'>Error text</th></tr>"
-		"<tr><th style='width: 5em;'>Count</th><th>Last seen</th></tr>";
+	"<tr><th colspan='2'>Error text</th></tr>"
+	"<tr><th style='width: 5em;'>Count</th><th>Last seen</th></tr>";
 
-extern const char sHtmlResultTableRow [] =
+extern const char sHtmlResultTableRow[] =
 	"<tr><td colspan='2'>%s</td></tr><tr><td>%llu</td><td>%s</td></tr>";
 
 extern const char sHtmlResultBodyLow[] =
@@ -220,31 +220,30 @@ extern const char sJsonApiResultError[] =
 extern const char sJsonApiConnectionError[] =
 	"{\"last_seen\":%llu,\"text\":\"%s\"}";
 
-extern const char sJsonApiFormat [] =
-"{"
+extern const char sJsonApiFormat[] =
+	"{"
 	"\"version\":\"%s\","
 
 	"\"hashrate\":{"
-		"\"threads\":[%s],"
-		"\"total\":%s,"
-		"\"highest\":%s"
+	"\"threads\":[%s],"
+	"\"total\":%s,"
+	"\"highest\":%s"
 	"},"
 
 	"\"results\":{"
-		"\"diff_current\":%llu,"
-		"\"shares_good\":%llu,"
-		"\"shares_total\":%llu,"
-		"\"avg_time\":%.1f,"
-		"\"hashes_total\":%llu,"
-		"\"best\":[%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu],"
-		"\"error_log\":[%s]"
+	"\"diff_current\":%llu,"
+	"\"shares_good\":%llu,"
+	"\"shares_total\":%llu,"
+	"\"avg_time\":%.1f,"
+	"\"hashes_total\":%llu,"
+	"\"best\":[%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu],"
+	"\"error_log\":[%s]"
 	"},"
 
 	"\"connection\":{"
-		"\"pool\": \"%s\","
-		"\"uptime\":%llu,"
-		"\"ping\":%llu,"
-		"\"error_log\":[%s]"
+	"\"pool\": \"%s\","
+	"\"uptime\":%llu,"
+	"\"ping\":%llu,"
+	"\"error_log\":[%s]"
 	"}"
-"}";
-
+	"}";
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 5e3384a63..c50211d1e 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -26,16 +26,15 @@
 
 #include "xmrstak/misc/console.hpp"
 #include "xmrstak/misc/jext.hpp"
-#include "xmrstak/misc/console.hpp"
 #include "xmrstak/misc/utility.hpp"
 
+#include <algorithm>
+#include <math.h>
+#include <numeric>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
 #include <vector>
-#include <numeric>
-#include <algorithm>
 
 #ifdef _WIN32
 #define strcasecmp _stricmp
@@ -44,18 +43,34 @@
 #include <cpuid.h>
 #endif
 
-
 using namespace rapidjson;
 
 /*
  * This enum needs to match index in oConfigValues, otherwise we will get a runtime error
  */
-enum configEnum {
-	aPoolList, sCurrency, bTlsSecureAlgo, iCallTimeout, iNetRetry, iGiveUpLimit, iVerboseLevel, bPrintMotd, iAutohashTime,
-	bDaemonMode, sOutputFile, iHttpdPort, sHttpLogin, sHttpPass, bPreferIpv4, bAesOverride, sUseSlowMem
+enum configEnum
+{
+	aPoolList,
+	sCurrency,
+	bTlsSecureAlgo,
+	iCallTimeout,
+	iNetRetry,
+	iGiveUpLimit,
+	iVerboseLevel,
+	bPrintMotd,
+	iAutohashTime,
+	bDaemonMode,
+	sOutputFile,
+	iHttpdPort,
+	sHttpLogin,
+	sHttpPass,
+	bPreferIpv4,
+	bAesOverride,
+	sUseSlowMem
 };
 
-struct configVal {
+struct configVal
+{
 	configEnum iName;
 	const char* sName;
 	Type iType;
@@ -64,68 +79,61 @@ struct configVal {
 // Same order as in configEnum, as per comment above
 // kNullType means any type
 configVal oConfigValues[] = {
-	{ aPoolList, "pool_list", kArrayType },
-	{ sCurrency, "currency", kStringType },
-	{ bTlsSecureAlgo, "tls_secure_algo", kTrueType },
-	{ iCallTimeout, "call_timeout", kNumberType },
-	{ iNetRetry, "retry_time", kNumberType },
-	{ iGiveUpLimit, "giveup_limit", kNumberType },
-	{ iVerboseLevel, "verbose_level", kNumberType },
-	{ bPrintMotd, "print_motd", kTrueType },
-	{ iAutohashTime, "h_print_time", kNumberType },
-	{ bDaemonMode, "daemon_mode", kTrueType },
-	{ sOutputFile, "output_file", kStringType },
-	{ iHttpdPort, "httpd_port", kNumberType },
-	{ sHttpLogin, "http_login", kStringType },
-	{ sHttpPass, "http_pass", kStringType },
-	{ bPreferIpv4, "prefer_ipv4", kTrueType },
-	{ bAesOverride, "aes_override", kNullType },
-	{ sUseSlowMem, "use_slow_memory", kStringType }
-};
-
-constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
+	{aPoolList, "pool_list", kArrayType},
+	{sCurrency, "currency", kStringType},
+	{bTlsSecureAlgo, "tls_secure_algo", kTrueType},
+	{iCallTimeout, "call_timeout", kNumberType},
+	{iNetRetry, "retry_time", kNumberType},
+	{iGiveUpLimit, "giveup_limit", kNumberType},
+	{iVerboseLevel, "verbose_level", kNumberType},
+	{bPrintMotd, "print_motd", kTrueType},
+	{iAutohashTime, "h_print_time", kNumberType},
+	{bDaemonMode, "daemon_mode", kTrueType},
+	{sOutputFile, "output_file", kStringType},
+	{iHttpdPort, "httpd_port", kNumberType},
+	{sHttpLogin, "http_login", kStringType},
+	{sHttpPass, "http_pass", kStringType},
+	{bPreferIpv4, "prefer_ipv4", kTrueType},
+	{bAesOverride, "aes_override", kNullType},
+	{sUseSlowMem, "use_slow_memory", kStringType}};
+
+constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0]));
 
 xmrstak::coin_selection coins[] = {
 	// name, userpool, devpool, default_pool_suggestion
-	{ "aeon7",                   {POW(cryptonight_aeon)},      {POW(cryptonight_aeon)}, "mine.aeon-pool.com:5555" },
-	{ "bbscoin",                 {POW(cryptonight_aeon)},      {POW(cryptonight_aeon)}, nullptr },
-	{ "bittube",                 {POW(cryptonight_bittube2)},  {POW(cryptonight_gpu)}, "mining.bit.tube:13333" },
-	{ "cryptonight",             {POW(cryptonight)},           {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_bittube2",    {POW(cryptonight_bittube2)},  {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_masari",      {POW(cryptonight_masari)},    {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_haven",       {POW(cryptonight_haven)},     {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_heavy",       {POW(cryptonight_heavy)},     {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_lite",        {POW(cryptonight_lite)},      {POW(cryptonight_aeon)},      nullptr },
-	{ "cryptonight_lite_v7",     {POW(cryptonight_aeon)},      {POW(cryptonight_aeon)},      nullptr },
-	{ "cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)},      {POW(cryptonight_aeon)},      nullptr },
-	{ "cryptonight_r",           {POW(cryptonight_r)},         {POW(cryptonight_r)}, nullptr },
-	{ "cryptonight_superfast",   {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_turtle",      {POW(cryptonight_turtle)},    {POW(cryptonight_turtle)},    nullptr },
-	{ "cryptonight_v7",          {POW(cryptonight_monero)},    {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_v8",          {POW(cryptonight_monero_v8)}, {POW(cryptonight_r)}, nullptr },
-	{ "cryptonight_v8_double",   {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_v8_half",     {POW(cryptonight_v8_half)},   {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_v8_reversewaltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)},{POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_v7_stellite", {POW(cryptonight_stellite)},  {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_gpu",         {POW(cryptonight_gpu)},       {POW(cryptonight_gpu)},       "pool.ryo-currency.com:3333" },
-	{ "cryptonight_conceal",     {POW(cryptonight_conceal)},   {POW(cryptonight_gpu)}, nullptr },
-	{ "freehaven",               {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr },
-	{ "graft",                   {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr },
-	{ "haven",                   {POW(cryptonight_haven)},     {POW(cryptonight_gpu)}, nullptr },
-	{ "lethean",                 {POW(cryptonight_monero)},    {POW(cryptonight_gpu)}, nullptr },
-	{ "masari",                  {POW(cryptonight_v8_half)},   {POW(cryptonight_gpu)}, nullptr },
-	{ "monero",                  {POW(cryptonight_r)},         {POW(cryptonight_r)}, "pool.usxmrpool.com:3333" },
-	{ "qrl",             	     {POW(cryptonight_monero)},    {POW(cryptonight_gpu)}, nullptr },
-	{ "ryo",                     {POW(cryptonight_gpu)},       {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" },
-	{ "stellite",                {POW(cryptonight_v8_half)},   {POW(cryptonight_gpu)}, nullptr },
-	{ "turtlecoin",              {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr },
-	{ "plenteum",			     {POW(cryptonight_turtle)},    {POW(cryptonight_turtle)},    nullptr },
-	{ "zelerius",                {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)},   {POW(cryptonight_gpu)}, nullptr },
-	{ "xcash",                   {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr }
-};
-
-constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0]));
+	{"bbscoin", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr},
+	{"bittube", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, "mining.bit.tube:13333"},
+	{"cryptonight", {POW(cryptonight)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_bittube2", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_masari", {POW(cryptonight_masari)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_heavy", {POW(cryptonight_heavy)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_lite", {POW(cryptonight_lite)}, {POW(cryptonight_aeon)}, nullptr},
+	{"cryptonight_lite_v7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr},
+	{"cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)}, {POW(cryptonight_aeon)}, nullptr},
+	{"cryptonight_r", {POW(cryptonight_r)}, {POW(cryptonight_r)}, nullptr},
+	{"cryptonight_superfast", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_turtle", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr},
+	{"cryptonight_v7", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_v8", {POW(cryptonight_monero_v8)}, {POW(cryptonight_r)}, nullptr},
+	{"cryptonight_v8_double", {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_v8_half", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_v8_reversewaltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"},
+	{"cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_gpu)}, nullptr},
+	{"graft", {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr},
+	{"haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr},
+	{"lethean", {POW(cryptonight_r)}, {POW(cryptonight_r)}, nullptr},
+	{"masari", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr},
+	{"qrl", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr},
+	{"ryo", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"},
+	{"torque", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr},
+	{"plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr},
+	{"zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr}};
+
+constexpr size_t coin_algo_size = (sizeof(coins) / sizeof(coins[0]));
 
 inline bool checkType(Type have, Type want)
 {
@@ -242,7 +250,10 @@ bool jconf::PrintMotd()
 
 uint64_t jconf::GetAutohashTime()
 {
-	return prv->configValues[iAutohashTime]->GetUint64();
+	if(xmrstak::params::inst().h_print_time == -1) // when params holds -1, use the h_print_time value parsed from the config file
+		return prv->configValues[iAutohashTime]->GetUint64();
+	else
+		return uint64_t(xmrstak::params::inst().h_print_time); // otherwise the value stored in params takes precedence
 }
 
 uint16_t jconf::GetHttpdPort()
@@ -270,12 +281,15 @@ bool jconf::DaemonMode()
 
 const char* jconf::GetOutputFile()
 {
-	return prv->configValues[sOutputFile]->GetString();
+	if(xmrstak::params::inst().outputFile.length() > 0)
+		return xmrstak::params::inst().outputFile.c_str();
+	else
+		return prv->configValues[sOutputFile]->GetString();
 }
 
 void jconf::cpuid(uint32_t eax, int32_t ecx, int32_t val[4])
 {
-	memset(val, 0, sizeof(int32_t)*4);
+	memset(val, 0, sizeof(int32_t) * 4);
 
 #ifdef _WIN32
 	__cpuidex(val, eax, ecx);
@@ -326,7 +340,7 @@ std::string jconf::GetMiningCoin()
 void jconf::GetAlgoList(std::string& list)
 {
 	list.reserve(256);
-	for(size_t i=0; i < coin_algo_size; i++)
+	for(size_t i = 0; i < coin_algo_size; i++)
 	{
 		list += "\t- ";
 		list += coins[i].coin_name;
@@ -338,7 +352,7 @@ bool jconf::IsOnAlgoList(std::string& needle)
 {
 	std::transform(needle.begin(), needle.end(), needle.begin(), ::tolower);
 
-	for(size_t i=0; i < coin_algo_size; i++)
+	for(size_t i = 0; i < coin_algo_size; i++)
 	{
 		if(needle == coins[i].coin_name)
 			return true;
@@ -350,7 +364,7 @@ const char* jconf::GetDefaultPool(const char* needle)
 {
 	const char* default_example = "pool.example.com:3333";
 
-	for(size_t i=0; i < coin_algo_size; i++)
+	for(size_t i = 0; i < coin_algo_size; i++)
 	{
 		if(strcmp(needle, coins[i].coin_name) == 0)
 		{
@@ -366,22 +380,22 @@ const char* jconf::GetDefaultPool(const char* needle)
 
 bool jconf::parse_file(const char* sFilename, bool main_conf)
 {
-	FILE * pFile;
-	char * buffer;
+	FILE* pFile;
+	char* buffer;
 	size_t flen;
 
 	pFile = fopen(sFilename, "rb");
-	if (pFile == NULL)
+	if(pFile == NULL)
 	{
 		printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename);
 		return false;
 	}
 
-	fseek(pFile,0,SEEK_END);
+	fseek(pFile, 0, SEEK_END);
 	flen = ftell(pFile);
 	rewind(pFile);
 
-	if(flen >= 64*1024)
+	if(flen >= 64 * 1024)
 	{
 		fclose(pFile);
 		printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename);
@@ -396,7 +410,7 @@ bool jconf::parse_file(const char* sFilename, bool main_conf)
 	}
 
 	buffer = (char*)malloc(flen + 3);
-	if(fread(buffer+1, flen, 1, pFile) != 1)
+	if(fread(buffer + 1, flen, 1, pFile) != 1)
 	{
 		free(buffer);
 		fclose(pFile);
@@ -420,7 +434,7 @@ bool jconf::parse_file(const char* sFilename, bool main_conf)
 
 	Document& root = main_conf ? prv->jsonDoc : prv->jsonDocPools;
 
-	root.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2);
+	root.Parse<kParseCommentsFlag | kParseTrailingCommasFlag>(buffer, flen + 2);
 	free(buffer);
 
 	if(root.HasParseError())
@@ -514,11 +528,11 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools)
 	std::vector<size_t> pool_weights;
 	pool_weights.reserve(pool_cnt);
 
-	const char* aPoolValues[] = { "pool_address", "wallet_address", "rig_id", "pool_password", "use_nicehash", "use_tls", "tls_fingerprint", "pool_weight" };
-	Type poolValTypes[] = { kStringType, kStringType, kStringType, kStringType, kTrueType, kTrueType, kStringType, kNumberType };
+	const char* aPoolValues[] = {"pool_address", "wallet_address", "rig_id", "pool_password", "use_nicehash", "use_tls", "tls_fingerprint", "pool_weight"};
+	Type poolValTypes[] = {kStringType, kStringType, kStringType, kStringType, kTrueType, kTrueType, kStringType, kNumberType};
 
-	constexpr size_t pvcnt = sizeof(aPoolValues)/sizeof(aPoolValues[0]);
-	for(uint32_t i=0; i < pool_cnt; i++)
+	constexpr size_t pvcnt = sizeof(aPoolValues) / sizeof(aPoolValues[0]);
+	for(uint32_t i = 0; i < pool_cnt; i++)
 	{
 		const Value& oThdConf = prv->configValues[aPoolList]->GetArray()[i];
 
@@ -528,7 +542,7 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools)
 			return false;
 		}
 
-		for(uint32_t j=0; j < pvcnt; j++)
+		for(uint32_t j = 0; j < pvcnt; j++)
 		{
 			const Value* v;
 			if((v = GetObjectMember(oThdConf, aPoolValues[j])) == nullptr)
@@ -620,7 +634,7 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools)
 		return false;
 	}
 
-	for(size_t i=0; i < coin_algo_size; i++)
+	for(size_t i = 0; i < coin_algo_size; i++)
 	{
 		if(ctmp == coins[i].coin_name)
 		{
diff --git a/xmrstak/jconf.hpp b/xmrstak/jconf.hpp
index 102b70f54..17cbf5b9a 100644
--- a/xmrstak/jconf.hpp
+++ b/xmrstak/jconf.hpp
@@ -1,26 +1,31 @@
 #pragma once
 
-#include "xmrstak/misc/environment.hpp"
-#include "xmrstak/misc/coinDescription.hpp"
 #include "params.hpp"
+#include "xmrstak/misc/coinDescription.hpp"
+#include "xmrstak/misc/environment.hpp"
 
 #include <stdlib.h>
 #include <string>
 
 class jconf
 {
-public:
+  public:
 	static jconf* inst()
 	{
 		auto& env = xmrstak::environment::inst();
 		if(env.pJconfConfig == nullptr)
-			env.pJconfConfig = new jconf;
+		{ // NOTE(review): the enclosing null check runs before the lock is taken - confirm pJconfConfig is safe to read unsynchronized
+			std::unique_lock<std::mutex> lck(env.update); // hold the environment mutex while the instance is created
+			if(env.pJconfConfig == nullptr) // re-check under the lock so the instance is only allocated once
+				env.pJconfConfig = new jconf;
+		}
 		return env.pJconfConfig;
 	};
 
 	bool parse_config(const char* sFilename, const char* sFilenamePools);
 
-	struct pool_cfg {
+	struct pool_cfg
+	{
 		const char* sPoolAddr;
 		const char* sWalletAddr;
 		const char* sRigId;
@@ -38,7 +43,8 @@ class jconf
 	uint64_t GetPoolCount();
 	bool GetPoolConfig(size_t id, pool_cfg& cfg);
 
-	enum slow_mem_cfg {
+	enum slow_mem_cfg
+	{
 		always_use,
 		no_mlck,
 		print_warning,
@@ -80,7 +86,7 @@ class jconf
 
 	slow_mem_cfg GetSlowMemSetting();
 
-private:
+  private:
 	jconf();
 
 	bool parse_file(const char* sFilename, bool main_conf);
diff --git a/xmrstak/misc/coinDescription.hpp b/xmrstak/misc/coinDescription.hpp
index 65dee143c..b3b119226 100644
--- a/xmrstak/misc/coinDescription.hpp
+++ b/xmrstak/misc/coinDescription.hpp
@@ -2,86 +2,88 @@
 
 #include "xmrstak/backend/cryptonight.hpp"
 
+#include <algorithm>
 #include <stdlib.h>
 #include <string>
 #include <vector>
-#include <algorithm>
 
 namespace xmrstak
 {
-	struct coinDescription
-	{
-		xmrstak_algo algo = {xmrstak_algo_id::invalid_algo};
-		uint8_t fork_version = 0u;
-		xmrstak_algo algo_root = {xmrstak_algo_id::invalid_algo};
+struct coinDescription
+{
+	xmrstak_algo algo = {xmrstak_algo_id::invalid_algo};
+	uint8_t fork_version = 0u;
+	xmrstak_algo algo_root = {xmrstak_algo_id::invalid_algo};
 
-		coinDescription() = default;
+	coinDescription() = default;
 
-		coinDescription(
-			const xmrstak_algo in_algo,
-			const uint8_t in_fork_version = 0,
-			xmrstak_algo in_algo_root = xmrstak_algo_id::invalid_algo
-		) :
-			algo(in_algo), algo_root(in_algo_root), fork_version(in_fork_version)
-		{}
+	coinDescription( // build a description from an algorithm, an optional fork version and an optional root algorithm
+		const xmrstak_algo in_algo, // stored in `algo`
+		const uint8_t in_fork_version = 0, // stored in `fork_version`
+		xmrstak_algo in_algo_root = xmrstak_algo_id::invalid_algo) : // stored in `algo_root`
+		algo(in_algo), // initializers are listed in member declaration order, which is the order they run in
+		fork_version(in_fork_version),
+		algo_root(in_algo_root)
+	{
+	}
 
-		inline xmrstak_algo GetMiningAlgo() const { return algo; }
-		inline xmrstak_algo GetMiningAlgoRoot() const { return algo_root; }
-		inline uint8_t GetMiningForkVersion() const { return fork_version; }
-	};
+	inline xmrstak_algo GetMiningAlgo() const { return algo; }
+	inline xmrstak_algo GetMiningAlgoRoot() const { return algo_root; }
+	inline uint8_t GetMiningForkVersion() const { return fork_version; }
+};
 
-	struct coin_selection
-	{
-		const char* coin_name = nullptr;
-		/* [0] -> user pool
+struct coin_selection
+{
+	const char* coin_name = nullptr;
+	/* [0] -> user pool
 		 * [1] -> dev pool
 		 */
-		coinDescription pool_coin[2];
-		const char* default_pool = nullptr;
+	coinDescription pool_coin[2];
+	const char* default_pool = nullptr;
 
-		coin_selection() = default;
+	coin_selection() = default;
 
-		coin_selection(
-			const char* in_coin_name,
-			const coinDescription user_coinDescription,
-			const coinDescription dev_coinDescription,
-			const char* in_default_pool
-		) :
-			coin_name(in_coin_name), default_pool(in_default_pool)
-		{
-			pool_coin[0] = user_coinDescription;
-			pool_coin[1] = dev_coinDescription;
-		}
+	coin_selection(
+		const char* in_coin_name,
+		const coinDescription user_coinDescription,
+		const coinDescription dev_coinDescription,
+		const char* in_default_pool) :
+		coin_name(in_coin_name),
+		default_pool(in_default_pool)
+	{
+		pool_coin[0] = user_coinDescription;
+		pool_coin[1] = dev_coinDescription;
+	}
 
-		/** get coin description for the pool
+	/** get coin description for the pool
 		 *
 		 * @param poolId 0 select dev pool, else the user pool is selected
 		 */
-		inline coinDescription GetDescription(size_t poolId) const {
-			coinDescription tmp = (poolId == 0 ? pool_coin[1] : pool_coin[0]);
-			return tmp;
-		}
+	inline coinDescription GetDescription(size_t poolId) const
+	{
+		coinDescription tmp = (poolId == 0 ? pool_coin[1] : pool_coin[0]);
+		return tmp;
+	}
 
-		/** return all POW algorithm for the current selected currency
+	/** return all POW algorithms for the currently selected currency
 		 *
 		 * @return required POW algorithms without duplicated entries
 		 */
-		inline std::vector<xmrstak_algo> GetAllAlgorithms()
-		{
-			std::vector<xmrstak_algo> allAlgos = {
-				GetDescription(0).GetMiningAlgo(),
-				GetDescription(0).GetMiningAlgoRoot(),
-				GetDescription(1).GetMiningAlgo(),
-				GetDescription(1).GetMiningAlgoRoot()
-			};
+	inline std::vector<xmrstak_algo> GetAllAlgorithms()
+	{
+		std::vector<xmrstak_algo> allAlgos = {
+			GetDescription(0).GetMiningAlgo(),
+			GetDescription(0).GetMiningAlgoRoot(),
+			GetDescription(1).GetMiningAlgo(),
+			GetDescription(1).GetMiningAlgoRoot()};
 
-			std::sort(allAlgos.begin(), allAlgos.end());
-			std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo);
-			auto last = std::unique(allAlgos.begin(), allAlgos.end());
-			// remove duplicated algorithms
-			allAlgos.erase(last, allAlgos.end());
+		// std::remove only shifts the kept elements forward; erase the tail so invalid_algo is really dropped
+		allAlgos.erase(std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo), allAlgos.end());
+		// std::unique only collapses adjacent duplicates, hence the sort before it
+		std::sort(allAlgos.begin(), allAlgos.end());
+		allAlgos.erase(std::unique(allAlgos.begin(), allAlgos.end()), allAlgos.end());
 
-			return allAlgos;
-		}
-	};
+		return allAlgos;
+	}
+};
 } // namespace xmrstak
diff --git a/xmrstak/misc/configEditor.hpp b/xmrstak/misc/configEditor.hpp
index 3f79df44c..ae81f62c5 100644
--- a/xmrstak/misc/configEditor.hpp
+++ b/xmrstak/misc/configEditor.hpp
@@ -1,10 +1,10 @@
 #pragma once
 
 #include <atomic>
-#include <string>
 #include <fstream>
-#include <streambuf>
 #include <regex>
+#include <streambuf>
+#include <string>
 
 #include "../version.hpp"
 
@@ -17,16 +17,15 @@ struct configEditor
 
 	configEditor()
 	{
-
 	}
 
-	static bool file_exist( const std::string filename)
+	static bool file_exist(const std::string filename)
 	{
 		std::ifstream fstream(filename);
 		return fstream.good();
 	}
 
-	void set( const std::string && content)
+	void set(const std::string&& content)
 	{
 		m_fileContent = content;
 	}
@@ -36,8 +35,7 @@ struct configEditor
 		std::ifstream fstream(filename);
 		m_fileContent = std::string(
 			(std::istreambuf_iterator<char>(fstream)),
-			std::istreambuf_iterator<char>()
-		);
+			std::istreambuf_iterator<char>());
 		return fstream.good();
 	}
 
@@ -70,7 +68,6 @@ struct configEditor
 	{
 		m_fileContent = std::regex_replace(m_fileContent, std::regex(search), substring);
 	}
-
 };
 
 } // namespace xmrstak
diff --git a/xmrstak/misc/console.cpp b/xmrstak/misc/console.cpp
index c39237eab..529cc9453 100644
--- a/xmrstak/misc/console.cpp
+++ b/xmrstak/misc/console.cpp
@@ -23,11 +23,11 @@
 
 #include "xmrstak/misc/console.hpp"
 
-#include <time.h>
+#include <cstdlib>
+#include <stdarg.h>
 #include <stdio.h>
 #include <string.h>
-#include <stdarg.h>
-#include <cstdlib>
+#include <time.h>
 
 #ifdef _WIN32
 #include <windows.h>
@@ -37,15 +37,15 @@ int get_key()
 	DWORD mode, rd;
 	HANDLE h;
 
-	if ((h = GetStdHandle(STD_INPUT_HANDLE)) == NULL)
+	if((h = GetStdHandle(STD_INPUT_HANDLE)) == NULL)
 		return -1;
 
-	GetConsoleMode( h, &mode );
-	SetConsoleMode( h, mode & ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT) );
+	GetConsoleMode(h, &mode);
+	SetConsoleMode(h, mode & ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT));
 
 	int c = 0;
-	ReadConsole( h, &c, 1, &rd, NULL );
-	SetConsoleMode( h, mode );
+	ReadConsole(h, &c, 1, &rd, NULL);
+	SetConsoleMode(h, mode);
 
 	return c;
 }
@@ -90,20 +90,20 @@ void reset_colour()
 }
 
 #else
+#include <stdio.h>
 #include <termios.h>
 #include <unistd.h>
-#include <stdio.h>
 
 int get_key()
 {
 	struct termios oldattr, newattr;
 	int ch;
-	tcgetattr( STDIN_FILENO, &oldattr );
+	tcgetattr(STDIN_FILENO, &oldattr); // remember the current terminal attributes of stdin
 	newattr = oldattr;
-	newattr.c_lflag &= ~( ICANON | ECHO );
-	tcsetattr( STDIN_FILENO, TCSANOW, &newattr );
+	newattr.c_lflag &= ~(ICANON | ECHO); // clear canonical (line-buffered) input and echo in the copy
+	tcsetattr(STDIN_FILENO, TCSANOW, &newattr); // apply the modified attributes immediately
 	ch = getchar();
-	tcsetattr( STDIN_FILENO, TCSANOW, &oldattr );
+	tcsetattr(STDIN_FILENO, TCSANOW, &oldattr); // restore the attributes saved at the top
 	return ch;
 }
 
@@ -182,17 +182,17 @@ void printer::print_msg(verbosity verbose, const char* fmt, ...)
 
 	va_list args;
 	va_start(args, fmt);
-	vsnprintf(buf+bpos, sizeof(buf)-bpos, fmt, args);
+	vsnprintf(buf + bpos, sizeof(buf) - bpos, fmt, args);
 	va_end(args);
 	bpos = strlen(buf);
 
-	if(bpos+2 >= sizeof(buf))
+	if(bpos + 2 >= sizeof(buf))
 		return;
 
 	buf[bpos] = '\n';
-	buf[bpos+1] = '\0';
+	buf[bpos + 1] = '\0';
 
-    print_str(buf);
+	print_str(buf);
 }
 
 void printer::print_str(const char* str)
diff --git a/xmrstak/misc/console.hpp b/xmrstak/misc/console.hpp
index 6df6597c6..12efef6c7 100644
--- a/xmrstak/misc/console.hpp
+++ b/xmrstak/misc/console.hpp
@@ -4,8 +4,17 @@
 
 #include <mutex>
 
-
-enum out_colours { K_RED, K_GREEN, K_BLUE, K_YELLOW, K_CYAN, K_MAGENTA, K_WHITE, K_NONE };
+enum out_colours
+{
+	K_RED,
+	K_GREEN,
+	K_BLUE,
+	K_YELLOW,
+	K_CYAN,
+	K_MAGENTA,
+	K_WHITE,
+	K_NONE
+};
 
 // Warning - on Linux get_key will detect control keys, but not on Windows.
 // We will only use it for alphanum keys anyway.
@@ -21,16 +30,29 @@ inline long long unsigned int int_port(size_t i)
 	return i;
 }
 
-enum verbosity : size_t { L0 = 0, L1 = 1, L2 = 2, L3 = 3, L4 = 4, LDEBUG = 10, LINF = 100};
+enum verbosity : size_t
+{
+	L0 = 0,
+	L1 = 1,
+	L2 = 2,
+	L3 = 3,
+	L4 = 4,
+	LDEBUG = 10,
+	LINF = 100
+};
 
 class printer
 {
-public:
+  public:
 	static inline printer* inst()
 	{
 		auto& env = xmrstak::environment::inst();
 		if(env.pPrinter == nullptr)
-			env.pPrinter = new printer;
+		{ // slow path: the pointer was null on the unlocked check above
+			std::unique_lock<std::mutex> lck(env.update); // hold environment::update while creating the instance
+			if(env.pPrinter == nullptr) // re-check under the lock; another thread may have created it meanwhile
+				env.pPrinter = new printer;
+		} // NOTE(review): the first check is done without the lock - confirm the unsynchronized read is acceptable
 		return env.pPrinter;
 	};
 
@@ -39,7 +61,7 @@ class printer
 	void print_str(const char* str);
 	bool open_logfile(const char* file);
 
-private:
+  private:
 	printer();
 
 	std::mutex print_mutex;
diff --git a/xmrstak/misc/environment.cpp b/xmrstak/misc/environment.cpp
new file mode 100644
index 000000000..9f1be511d
--- /dev/null
+++ b/xmrstak/misc/environment.cpp
@@ -0,0 +1,19 @@
+#include "environment.hpp"
+
+#include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/cpu/crypto/cryptonight.h"
+#include "xmrstak/params.hpp"
+#include "xmrstak/misc/executor.hpp"
+#include "xmrstak/jconf.hpp"
+
+namespace xmrstak
+{
+void environment::init_singeltons() // called by environment::inst() right after it allocates a new environment
+{
+	printer::inst(); // creates the printer if env.pPrinter is still null
+	globalStates::inst(); // NOTE(review): accessor not visible here - presumably creates on first use, confirm
+	jconf::inst(); // creates the jconf instance if env.pJconfConfig is still null
+	executor::inst(); // creates the executor if env.pExecutor is still null
+	params::inst(); // NOTE(review): accessor not visible here - presumably creates on first use, confirm
+}
+} // namespace xmrstak
diff --git a/xmrstak/misc/environment.hpp b/xmrstak/misc/environment.hpp
index b67c85874..f37aedd61 100644
--- a/xmrstak/misc/environment.hpp
+++ b/xmrstak/misc/environment.hpp
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <mutex>
+
 class printer;
 class jconf;
 class executor;
@@ -19,7 +21,10 @@ struct environment
 		if(env == nullptr)
 		{
 			if(init == nullptr)
+			{
 				env = new environment;
+				env->init_singeltons();
+			}
 			else
 				env = init;
 		}
@@ -36,6 +41,11 @@ struct environment
 	jconf* pJconfConfig = nullptr;
 	executor* pExecutor = nullptr;
 	params* pParams = nullptr;
+
+	std::mutex update;
+
+private:
+	void init_singeltons();
 };
 
 } // namespace xmrstak
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index 79d4731e6..0266312d1 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -21,31 +21,30 @@
   *
   */
 
-#include "xmrstak/jconf.hpp"
 #include "executor.hpp"
+#include "xmrstak/jconf.hpp"
 #include "xmrstak/net/jpsock.hpp"
 
 #include "telemetry.hpp"
-#include "xmrstak/backend/miner_work.hpp"
-#include "xmrstak/backend/globalStates.hpp"
 #include "xmrstak/backend/backendConnector.hpp"
+#include "xmrstak/backend/globalStates.hpp"
 #include "xmrstak/backend/iBackend.hpp"
+#include "xmrstak/backend/miner_work.hpp"
 
+#include "xmrstak/donate-level.hpp"
+#include "xmrstak/http/webdesign.hpp"
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/misc/console.hpp"
-#include "xmrstak/donate-level.hpp"
 #include "xmrstak/version.hpp"
-#include "xmrstak/http/webdesign.hpp"
 
-#include <thread>
-#include <string>
-#include <cmath>
 #include <algorithm>
-#include <functional>
 #include <assert.h>
+#include <cmath>
+#include <functional>
+#include <string>
+#include <thread>
 #include <time.h>
 
-
 #ifdef _WIN32
 #define strncasecmp _strnicmp
 #endif // _WIN32
@@ -63,7 +62,7 @@ void executor::push_timed_event(ex_event&& ev, size_t sec)
 void executor::ex_clock_thd()
 {
 	size_t tick = 0;
-	while (true)
+	while(true)
 	{
 		std::this_thread::sleep_for(std::chrono::milliseconds(size_t(iTickTime)));
 
@@ -76,7 +75,7 @@ void executor::ex_clock_thd()
 		// Service timed events
 		std::unique_lock<std::mutex> lck(timed_event_mutex);
 		std::list<timed_event>::iterator ev = lTimedEvents.begin();
-		while (ev != lTimedEvents.end())
+		while(ev != lTimedEvents.end())
 		{
 			ev->ticks_left--;
 			if(ev->ticks_left == 0)
@@ -96,7 +95,8 @@ bool executor::get_live_pools(std::vector<jpsock*>& eval_pools, bool is_dev)
 	size_t limit = jconf::inst()->GetGiveUpLimit();
 	size_t wait = jconf::inst()->GetNetRetry();
 
-	if(limit == 0 || is_dev) limit = (-1); //No limit = limit of 2^64-1
+	if(limit == 0 || is_dev)
+		limit = (-1); //No limit = limit of 2^64-1
 
 	size_t pool_count = 0;
 	size_t over_limit = 0;
@@ -330,7 +330,7 @@ void executor::on_sock_ready(size_t pool_id)
 	{
 		if(pool->have_call_error() && !pool->is_dev_pool())
 		{
-			std::string str = "Login error: " +  pool->get_call_error();
+			std::string str = "Login error: " + pool->get_call_error();
 			log_socket_error(pool, std::move(str));
 		}
 
@@ -369,7 +369,8 @@ void executor::on_pool_have_job(size_t pool_id, pool_job& oPoolJob)
 	dat.pool_id = pool_id;
 
 	xmrstak::globalStates::inst().switch_work(xmrstak::miner_work(oPoolJob.sJobID, oPoolJob.bWorkBlob,
-		oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id, oPoolJob.iBlockHeight), dat);
+												  oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id, oPoolJob.iBlockHeight),
+		dat);
 
 	if(dat.pool_id != pool_id)
 	{
@@ -420,12 +421,11 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult)
 		//Ignore errors silently
 		if(pool->is_running() && pool->is_logged_in())
 			pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult, backend_name,
-			backend_hashcount, total_hashcount, oResult.algorithm
-		);
+				backend_hashcount, total_hashcount, oResult.algorithm);
 		return;
 	}
 
-	if (!pool->is_running() || !pool->is_logged_in())
+	if(!pool->is_running() || !pool->is_logged_in())
 	{
 		log_result_error("[NETWORK ERROR]");
 		return;
@@ -433,25 +433,42 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult)
 
 	size_t t_start = get_timestamp_ms();
 	bool bResult = pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult,
-		backend_name, backend_hashcount, total_hashcount, oResult.algorithm
-	);
+		backend_name, backend_hashcount, total_hashcount, oResult.algorithm);
 	size_t t_len = get_timestamp_ms() - t_start;
 
 	if(t_len > 0xFFFF)
 		t_len = 0xFFFF;
 	iPoolCallTimes.push_back((uint16_t)t_len);
 
+	std::string name(backend_name);
+	std::transform(name.begin(), name.end(), name.begin(), ::toupper);
+
 	if(bResult)
 	{
 		uint64_t* targets = (uint64_t*)oResult.bResult;
 		log_result_ok(t64_to_diff(targets[3]));
-		printer::inst()->print_msg(L3, "Result accepted by the pool.");
+
+		if (pvThreads->at(oResult.iThreadId)->backendType == xmrstak::iBackend::BackendType::CPU)
+		{
+			printer::inst()->print_msg(L3, "CPU: Share accepted. Pool: %s", pool->get_pool_addr());
+		}
+		else
+		{
+			printer::inst()->print_msg(L3, "%s GPU %u: Share accepted. Pool: %s", name.c_str(), pvThreads->at(oResult.iThreadId)->iGpuIndex, pool->get_pool_addr());
+		}
 	}
 	else
 	{
 		if(!pool->have_sock_error())
 		{
-			printer::inst()->print_msg(L3, "Result rejected by the pool.");
+			if (pvThreads->at(oResult.iThreadId)->backendType == xmrstak::iBackend::BackendType::CPU)
+			{
+				printer::inst()->print_msg(L3, "CPU: Share rejected. Pool: %s", pool->get_pool_addr());
+			}
+			else
+			{
+				printer::inst()->print_msg(L3, "%s GPU %u: Share rejected. Pool: %s", name.c_str(), pvThreads->at(oResult.iThreadId)->iGpuIndex, pool->get_pool_addr());
+			}
 
 			std::string error = pool->get_call_error();
 
@@ -477,12 +494,14 @@ void disable_sigpipe()
 	memset(&sa, 0, sizeof(sa));
 	sa.sa_handler = SIG_IGN;
 	sa.sa_flags = 0;
-	if (sigaction(SIGPIPE, &sa, 0) == -1)
+	if(sigaction(SIGPIPE, &sa, 0) == -1)
 		printer::inst()->print_msg(L1, "ERROR: Call to sigaction failed!");
 }
 
 #else
-inline void disable_sigpipe() {}
+inline void disable_sigpipe()
+{
+}
 #endif
 
 void executor::ex_main()
@@ -496,7 +515,7 @@ void executor::ex_main()
 	// \todo collect all backend threads
 	pvThreads = xmrstak::BackendConnector::thread_starter(oWork);
 
-	if(pvThreads->size()==0)
+	if(pvThreads->size() == 0)
 	{
 		printer::inst()->print_msg(L1, "ERROR: No miner backend enabled.");
 		win_exit();
@@ -508,11 +527,11 @@ void executor::ex_main()
 	size_t pc = jconf::inst()->GetPoolCount();
 	bool dev_tls = true;
 	bool already_have_cli_pool = false;
-	size_t i=0;
+	size_t i = 0;
 	for(; i < pc; i++)
 	{
 		jconf::pool_cfg cfg;
- 		jconf::inst()->GetPoolConfig(i, cfg);
+		jconf::inst()->GetPoolConfig(i, cfg);
 #ifdef CONF_NO_TLS
 		if(cfg.tls)
 		{
@@ -520,7 +539,8 @@ void executor::ex_main()
 			win_exit();
 		}
 #endif
-		if(!cfg.tls) dev_tls = false;
+		if(!cfg.tls)
+			dev_tls = false;
 
 		if(!xmrstak::params::inst().poolURL.empty() && xmrstak::params::inst().poolURL == cfg.sPoolAddr)
 		{
@@ -531,11 +551,12 @@ void executor::ex_main()
 			const char* rigid = params.userSetRigid ? params.poolRigid.c_str() : cfg.sRigId;
 			const char* pwd = params.userSetPwd ? params.poolPasswd.c_str() : cfg.sPasswd;
 			bool nicehash = cfg.nicehash || params.nicehashMode;
+			bool tls = params.poolUseTls;
 
-			pools.emplace_back(i+1, cfg.sPoolAddr, wallet, rigid, pwd, 9.9, false, params.poolUseTls, cfg.tls_fingerprint, nicehash);
+			pools.emplace_back(i + 1, cfg.sPoolAddr, wallet, rigid, pwd, 9.9, false, tls, cfg.tls_fingerprint, nicehash);
 		}
 		else
-			pools.emplace_back(i+1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sRigId, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash);
+			pools.emplace_back(i + 1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sRigId, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash);
 	}
 
 	if(!xmrstak::params::inst().poolURL.empty() && !already_have_cli_pool)
@@ -547,7 +568,7 @@ void executor::ex_main()
 			win_exit();
 		}
 
-		pools.emplace_back(i+1, params.poolURL.c_str(), params.poolUsername.c_str(), params.poolRigid.c_str(), params.poolPasswd.c_str(), 9.9, false, params.poolUseTls, "", params.nicehashMode);
+		pools.emplace_back(i + 1, params.poolURL.c_str(), params.poolUsername.c_str(), params.poolRigid.c_str(), params.poolPasswd.c_str(), 9.9, false, params.poolUseTls, "", params.nicehashMode);
 	}
 
 	switch(jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo())
@@ -604,10 +625,10 @@ void executor::ex_main()
 		push_timed_event(ex_event(EV_HASHRATE_LOOP), jconf::inst()->GetAutohashTime());
 
 	size_t cnt = 0;
-	while (true)
+	while(true)
 	{
 		ev = oEventQ.pop();
-		switch (ev.iName)
+		switch(ev.iName)
 		{
 		case EV_SOCK_READY:
 			on_sock_ready(ev.iPoolId);
@@ -638,9 +659,9 @@ void executor::ex_main()
 		}
 
 		case EV_PERF_TICK:
-			for (i = 0; i < pvThreads->size(); i++)
+			for(i = 0; i < pvThreads->size(); i++)
 				telem->push_perf_value(i, pvThreads->at(i)->iHashCount.load(std::memory_order_relaxed),
-				pvThreads->at(i)->iTimestamp.load(std::memory_order_relaxed));
+					pvThreads->at(i)->iTimestamp.load(std::memory_order_relaxed));
 
 			if((cnt++ & 0xF) == 0) //Every 16 ticks
 			{
@@ -648,7 +669,7 @@ void executor::ex_main()
 				double fTelem;
 				bool normal = true;
 
-				for (i = 0; i < pvThreads->size(); i++)
+				for(i = 0; i < pvThreads->size(); i++)
 				{
 					fTelem = telem->calc_telemetry_data(10000, i);
 					if(std::isnormal(fTelem))
@@ -709,7 +730,7 @@ bool executor::motd_filter_console(std::string& motd)
 	if(motd.size() > motd_max_length)
 		return false;
 
-	motd.erase(std::remove_if(motd.begin(), motd.end(), [](int chr)->bool { return !((chr >= 0x20 && chr <= 0x7e) || chr == '\n');}), motd.end());
+	motd.erase(std::remove_if(motd.begin(), motd.end(), [](int chr) -> bool { return !((chr >= 0x20 && chr <= 0x7e) || chr == '\n'); }), motd.end());
 	return motd.size() > 0;
 }
 
@@ -721,7 +742,7 @@ bool executor::motd_filter_web(std::string& motd)
 	std::string tmp;
 	tmp.reserve(motd.size() + 128);
 
-	for(size_t i=0; i < motd.size(); i++)
+	for(size_t i = 0; i < motd.size(); i++)
 	{
 		char c = motd[i];
 		switch(c)
@@ -763,7 +784,7 @@ void executor::hashrate_report(std::string& out)
 		std::string motd;
 		for(jpsock& pool : pools)
 		{
-			motd.empty();
+			motd.clear();
 			if(pool.get_pool_motd(motd) && motd_filter_console(motd))
 			{
 				out.append("Message from ").append(pool.get_pool_addr()).append(":\n");
@@ -774,17 +795,15 @@ void executor::hashrate_report(std::string& out)
 	}
 
 	char num[32];
-	double fTotal[3] = { 0.0, 0.0, 0.0};
+	double fTotal[3] = {0.0, 0.0, 0.0};
 
-	for( uint32_t b = 0; b < 4u; ++b)
+	for(uint32_t b = 0; b < 4u; ++b)
 	{
 		std::vector<xmrstak::iBackend*> backEnds;
 		std::copy_if(pvThreads->begin(), pvThreads->end(), std::back_inserter(backEnds),
-			[&](xmrstak::iBackend* backend)
-			{
+			[&](xmrstak::iBackend* backend) {
 				return backend->backendType == b;
-			}
-		);
+			});
 
 		size_t nthd = backEnds.size();
 		if(nthd != 0)
@@ -801,8 +820,8 @@ void executor::hashrate_report(std::string& out)
 			else
 				out.append(1, '\n');
 
-			double fTotalCur[3] = { 0.0, 0.0, 0.0};
-			for (i = 0; i < nthd; i++)
+			double fTotalCur[3] = {0.0, 0.0, 0.0};
+			for(i = 0; i < nthd; i++)
 			{
 				double fHps[3];
 
@@ -883,12 +902,11 @@ void executor::result_report(std::string& out)
 	size_t iGoodRes = vMineResults[0].count, iTotalRes = iGoodRes;
 	size_t ln = vMineResults.size();
 
-	for(size_t i=1; i < ln; i++)
+	for(size_t i = 1; i < ln; i++)
 		iTotalRes += vMineResults[i].count;
 
 	out.append("RESULT REPORT\n");
-	out.append("Currency         : ").
-		append(jconf::inst()->GetMiningCoin()).append("\n");
+	out.append("Currency         : ").append(jconf::inst()->GetMiningCoin()).append("\n");
 	if(iTotalRes == 0)
 	{
 		out.append("You haven't found any results yet.\n");
@@ -904,8 +922,7 @@ void executor::result_report(std::string& out)
 	snprintf(num, sizeof(num), " (%.1f %%)\n", 100.0 * iGoodRes / iTotalRes);
 
 	out.append("Difficulty       : ").append(std::to_string(iPoolDiff)).append(1, '\n');
-	out.append("Good results     : ").append(std::to_string(iGoodRes)).append(" / ").
-		append(std::to_string(iTotalRes)).append(num);
+	out.append("Good results     : ").append(std::to_string(iGoodRes)).append(" / ").append(std::to_string(iTotalRes)).append(num);
 
 	if(iPoolCallTimes.size() != 0)
 	{
@@ -916,10 +933,10 @@ void executor::result_report(std::string& out)
 	out.append("Pool-side hashes : ").append(std::to_string(iPoolHashes)).append(2, '\n');
 	out.append("Top 10 best results found:\n");
 
-	for(size_t i=0; i < 10; i += 2)
+	for(size_t i = 0; i < 10; i += 2)
 	{
 		snprintf(num, sizeof(num), "| %2llu | %16llu | %2llu | %16llu |\n",
-			int_port(i), int_port(iTopDiff[i]), int_port(i+1), int_port(iTopDiff[i+1]));
+			int_port(i), int_port(iTopDiff[i]), int_port(i + 1), int_port(iTopDiff[i + 1]));
 		out.append(num);
 	}
 
@@ -927,7 +944,7 @@ void executor::result_report(std::string& out)
 	if(ln > 1)
 	{
 		out.append("| Count | Error text                       | Last seen           |\n");
-		for(size_t i=1; i < ln; i++)
+		for(size_t i = 1; i < ln; i++)
 		{
 			snprintf(num, sizeof(num), "| %5llu | %-32.32s | %s |\n", int_port(vMineResults[i].count),
 				vMineResults[i].msg.c_str(), time_format(date, sizeof(date), vMineResults[i].time));
@@ -958,11 +975,11 @@ void executor::connection_report(std::string& out)
 		out.append("Connected since : <not connected>\n");
 
 	size_t n_calls = iPoolCallTimes.size();
-	if (n_calls > 1)
+	if(n_calls > 1)
 	{
 		//Not-really-but-good-enough median
-		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end());
-		out.append("Pool ping time  : ").append(std::to_string(iPoolCallTimes[n_calls/2])).append(" ms\n");
+		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end());
+		out.append("Pool ping time  : ").append(std::to_string(iPoolCallTimes[n_calls / 2])).append(" ms\n");
 	}
 	else
 		out.append("Pool ping time  : (n/a)\n");
@@ -972,7 +989,7 @@ void executor::connection_report(std::string& out)
 	if(ln > 0)
 	{
 		out.append("| Date                | Error text                                             |\n");
-		for(size_t i=0; i < ln; i++)
+		for(size_t i = 0; i < ln; i++)
 		{
 			snprintf(num, sizeof(num), "| %s | %-54.54s |\n",
 				time_format(date, sizeof(date), vSocketLog[i].time), vSocketLog[i].msg.c_str());
@@ -1024,7 +1041,7 @@ void executor::http_hashrate_report(std::string& out)
 		std::string motd;
 		for(jpsock& pool : pools)
 		{
-			motd.empty();
+			motd.clear();
 			if(pool.get_pool_motd(motd) && motd_filter_web(motd))
 			{
 				if(!have_motd)
@@ -1045,11 +1062,11 @@ void executor::http_hashrate_report(std::string& out)
 	snprintf(buffer, sizeof(buffer), sHtmlHashrateBodyHigh, (unsigned int)nthd + 3);
 	out.append(buffer);
 
-	double fTotal[3] = { 0.0, 0.0, 0.0};
+	double fTotal[3] = {0.0, 0.0, 0.0};
 	auto bTypePrev = static_cast<xmrstak::iBackend::BackendType>(0);
 	std::string name;
 	size_t j = 0;
-	for(size_t i=0; i < nthd; i++)
+	for(size_t i = 0; i < nthd; i++)
 	{
 		double fHps[3];
 		char csThreadTag[25];
@@ -1065,14 +1082,13 @@ void executor::http_hashrate_report(std::string& out)
 		}
 		snprintf(csThreadTag, sizeof(csThreadTag),
 			(99 < nthd) ? "[%s.%03u]:%03u" : ((9 < nthd) ? "[%s.%02u]:%02u" : "[%s.%u]:%u"),
-			name.c_str(), (unsigned int)(j), (unsigned int)i
-		);
+			name.c_str(), (unsigned int)(j), (unsigned int)i);
 
 		fHps[0] = telem->calc_telemetry_data(10000, i);
 		fHps[1] = telem->calc_telemetry_data(60000, i);
 		fHps[2] = telem->calc_telemetry_data(900000, i);
 
-		num_a[0] = num_b[0] = num_c[0] ='\0';
+		num_a[0] = num_b[0] = num_c[0] = '\0';
 		hps_format(fHps[0], num_a, sizeof(num_a));
 		hps_format(fHps[1], num_b, sizeof(num_b));
 		hps_format(fHps[2], num_c, sizeof(num_c));
@@ -1085,7 +1101,7 @@ void executor::http_hashrate_report(std::string& out)
 		out.append(buffer);
 	}
 
-	num_a[0] = num_b[0] = num_c[0] = num_d[0] ='\0';
+	num_a[0] = num_b[0] = num_c[0] = num_d[0] = '\0';
 	hps_format(fTotal[0], num_a, sizeof(num_a));
 	hps_format(fTotal[1], num_b, sizeof(num_b));
 	hps_format(fTotal[2], num_c, sizeof(num_c));
@@ -1102,13 +1118,13 @@ void executor::http_result_report(std::string& out)
 
 	out.reserve(4096);
 
-	snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Result Report", ver_html,  "Result Report");
+	snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Result Report", ver_html, "Result Report");
 	out.append(buffer);
 
 	size_t iGoodRes = vMineResults[0].count, iTotalRes = iGoodRes;
 	size_t ln = vMineResults.size();
 
-	for(size_t i=1; i < ln; i++)
+	for(size_t i = 1; i < ln; i++)
 		iTotalRes += vMineResults[i].count;
 
 	double fGoodResPrc = 0.0;
@@ -1119,8 +1135,7 @@ void executor::http_result_report(std::string& out)
 	if(iPoolCallTimes.size() > 0)
 	{
 		using namespace std::chrono;
-		fAvgResTime = ((double)duration_cast<seconds>(system_clock::now() - tPoolConnTime).count())
-			/ iPoolCallTimes.size();
+		fAvgResTime = ((double)duration_cast<seconds>(system_clock::now() - tPoolConnTime).count()) / iPoolCallTimes.size();
 	}
 
 	snprintf(buffer, sizeof(buffer), sHtmlResultBodyHigh,
@@ -1132,7 +1147,7 @@ void executor::http_result_report(std::string& out)
 
 	out.append(buffer);
 
-	for(size_t i=1; i < vMineResults.size(); i++)
+	for(size_t i = 1; i < vMineResults.size(); i++)
 	{
 		snprintf(buffer, sizeof(buffer), sHtmlResultTableRow, vMineResults[i].msg.c_str(),
 			int_port(vMineResults[i].count), time_format(date, sizeof(date), vMineResults[i].time));
@@ -1149,7 +1164,7 @@ void executor::http_connection_report(std::string& out)
 
 	out.reserve(4096);
 
-	snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Connection Report", ver_html,  "Connection Report");
+	snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Connection Report", ver_html, "Connection Report");
 	out.append(buffer);
 
 	jpsock* pool = pick_pool_by_id(current_pool_id);
@@ -1157,16 +1172,16 @@ void executor::http_connection_report(std::string& out)
 		pool = pick_pool_by_id(last_usr_pool_id);
 
 	const char* cdate = "not connected";
-	if (pool != nullptr && pool->is_running() && pool->is_logged_in())
+	if(pool != nullptr && pool->is_running() && pool->is_logged_in())
 		cdate = time_format(date, sizeof(date), tPoolConnTime);
 
 	size_t n_calls = iPoolCallTimes.size();
 	unsigned int ping_time = 0;
-	if (n_calls > 1)
+	if(n_calls > 1)
 	{
 		//Not-really-but-good-enough median
-		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end());
-		ping_time = iPoolCallTimes[n_calls/2];
+		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end());
+		ping_time = iPoolCallTimes[n_calls / 2];
 	}
 
 	snprintf(buffer, sizeof(buffer), sHtmlConnectionBodyHigh,
@@ -1175,8 +1190,7 @@ void executor::http_connection_report(std::string& out)
 		cdate, ping_time);
 	out.append(buffer);
 
-
-	for(size_t i=0; i < vSocketLog.size(); i++)
+	for(size_t i = 0; i < vSocketLog.size(); i++)
 	{
 		snprintf(buffer, sizeof(buffer), sHtmlConnectionTableRow,
 			time_format(date, sizeof(date), vSocketLog[i].time), vSocketLog[i].msg.c_str());
@@ -1205,12 +1219,13 @@ void executor::http_json_report(std::string& out)
 	std::string hr_thds, res_error, cn_error;
 
 	size_t nthd = pvThreads->size();
-	double fTotal[3] = { 0.0, 0.0, 0.0};
+	double fTotal[3] = {0.0, 0.0, 0.0};
 	hr_thds.reserve(nthd * 32);
 
-	for(size_t i=0; i < nthd; i++)
+	for(size_t i = 0; i < nthd; i++)
 	{
-		if(i != 0) hr_thds.append(1, ',');
+		if(i != 0)
+			hr_thds.append(1, ',');
 
 		double fHps[3];
 		fHps[0] = telem->calc_telemetry_data(10000, i);
@@ -1238,7 +1253,7 @@ void executor::http_json_report(std::string& out)
 	size_t iGoodRes = vMineResults[0].count, iTotalRes = iGoodRes;
 	size_t ln = vMineResults.size();
 
-	for(size_t i=1; i < ln; i++)
+	for(size_t i = 1; i < ln; i++)
 		iTotalRes += vMineResults[i].count;
 
 	jpsock* pool = pick_pool_by_id(current_pool_id);
@@ -1258,10 +1273,11 @@ void executor::http_json_report(std::string& out)
 
 	char buffer[2048];
 	res_error.reserve((vMineResults.size() - 1) * 128);
-	for(size_t i=1; i < vMineResults.size(); i++)
+	for(size_t i = 1; i < vMineResults.size(); i++)
 	{
 		using namespace std::chrono;
-		if(i != 1) res_error.append(1, ',');
+		if(i != 1)
+			res_error.append(1, ',');
 
 		snprintf(buffer, sizeof(buffer), sJsonApiResultError, int_port(vMineResults[i].count),
 			int_port(duration_cast<seconds>(vMineResults[i].time.time_since_epoch()).count()),
@@ -1271,18 +1287,19 @@ void executor::http_json_report(std::string& out)
 
 	size_t n_calls = iPoolCallTimes.size();
 	size_t iPoolPing = 0;
-	if (n_calls > 1)
+	if(n_calls > 1)
 	{
 		//Not-really-but-good-enough median
-		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end());
-		iPoolPing = iPoolCallTimes[n_calls/2];
+		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end());
+		iPoolPing = iPoolCallTimes[n_calls / 2];
 	}
 
 	cn_error.reserve(vSocketLog.size() * 256);
-	for(size_t i=0; i < vSocketLog.size(); i++)
+	for(size_t i = 0; i < vSocketLog.size(); i++)
 	{
 		using namespace std::chrono;
-		if(i != 0) cn_error.append(1, ',');
+		if(i != 0)
+			cn_error.append(1, ',');
 
 		snprintf(buffer, sizeof(buffer), sJsonApiConnectionError,
 			int_port(duration_cast<seconds>(vSocketLog[i].time.time_since_epoch()).count()),
@@ -1291,7 +1308,7 @@ void executor::http_json_report(std::string& out)
 	}
 
 	size_t bb_size = 2048 + hr_thds.size() + res_error.size() + cn_error.size();
-	std::unique_ptr<char[]> bigbuf( new char[ bb_size ] );
+	std::unique_ptr<char[]> bigbuf(new char[bb_size]);
 
 	int bb_len = snprintf(bigbuf.get(), bb_size, sJsonApiFormat,
 		get_version_str().c_str(), hr_thds.c_str(), hr_buffer, a,
@@ -1338,8 +1355,7 @@ void executor::get_http_report(ex_event_name ev_id, std::string& data)
 	std::lock_guard<std::mutex> lck(httpMutex);
 
 	assert(pHttpString == nullptr);
-	assert(ev_id == EV_HTML_HASHRATE || ev_id == EV_HTML_RESULTS
-		|| ev_id == EV_HTML_CONNSTAT || ev_id == EV_HTML_JSON);
+	assert(ev_id == EV_HTML_HASHRATE || ev_id == EV_HTML_RESULTS || ev_id == EV_HTML_CONNSTAT || ev_id == EV_HTML_JSON);
 
 	pHttpString = &data;
 	httpReady = std::promise<void>();
diff --git a/xmrstak/misc/executor.hpp b/xmrstak/misc/executor.hpp
index be5ee6c2f..385b2f4e3 100644
--- a/xmrstak/misc/executor.hpp
+++ b/xmrstak/misc/executor.hpp
@@ -1,18 +1,18 @@
 #pragma once
 
-#include "thdq.hpp"
 #include "telemetry.hpp"
+#include "thdq.hpp"
 #include "xmrstak/backend/iBackend.hpp"
+#include "xmrstak/donate-level.hpp"
 #include "xmrstak/misc/environment.hpp"
 #include "xmrstak/net/msgstruct.hpp"
-#include "xmrstak/donate-level.hpp"
 
-#include <atomic>
 #include <array>
+#include <atomic>
+#include <chrono>
+#include <future>
 #include <list>
 #include <vector>
-#include <future>
-#include <chrono>
 
 class jpsock;
 
@@ -27,12 +27,16 @@ class minethd;
 
 class executor
 {
-public:
+  public:
 	static executor* inst()
 	{
 		auto& env = xmrstak::environment::inst();
 		if(env.pExecutor == nullptr)
-			env.pExecutor = new executor;
+		{
+			std::unique_lock<std::mutex> lck(env.update);
+			if(env.pExecutor == nullptr)
+				env.pExecutor = new executor;
+		}
 		return env.pExecutor;
 	};
 
@@ -43,13 +47,15 @@ class executor
 	inline void push_event(ex_event&& ev) { oEventQ.push(std::move(ev)); }
 	void push_timed_event(ex_event&& ev, size_t sec);
 
-private:
+  private:
 	struct timed_event
 	{
 		ex_event event;
 		size_t ticks_left;
 
-		timed_event(ex_event&& ev, size_t ticks) : event(std::move(ev)), ticks_left(ticks) {}
+		timed_event(ex_event&& ev, size_t ticks) :
+			event(std::move(ev)),
+			ticks_left(ticks) {}
 	};
 
 	inline void set_timestamp() { dev_timestamp = get_timestamp(); };
@@ -119,7 +125,8 @@ class executor
 		std::chrono::system_clock::time_point time;
 		std::string msg;
 
-		sck_error_log(std::string&& err) : msg(std::move(err))
+		sck_error_log(std::string&& err) :
+			msg(std::move(err))
 		{
 			time = std::chrono::system_clock::now();
 		}
@@ -134,12 +141,16 @@ class executor
 		std::string msg;
 		size_t count;
 
-		result_tally() : msg("[OK]"), count(0)
+		result_tally() :
+			msg("[OK]"),
+			count(0)
 		{
 			time = std::chrono::system_clock::now();
 		}
 
-		result_tally(std::string&& err) : msg(std::move(err)), count(1)
+		result_tally(std::string&& err) :
+			msg(std::move(err)),
+			count(1)
 		{
 			time = std::chrono::system_clock::now();
 		}
@@ -161,7 +172,7 @@ class executor
 	std::vector<result_tally> vMineResults;
 
 	//More result statistics
-	std::array<size_t, 10> iTopDiff { { } }; //Initialize to zero
+	std::array<size_t, 10> iTopDiff{{}}; //Initialize to zero
 
 	std::chrono::system_clock::time_point tPoolConnTime;
 	size_t iPoolHashes = 0;
@@ -195,4 +206,3 @@ class executor
 
 	inline size_t sec_to_ticks(size_t sec) { return sec * (1000 / iTickTime); }
 };
-
diff --git a/xmrstak/misc/home_dir.hpp b/xmrstak/misc/home_dir.hpp
index 8eb0fa4ea..836c7cc4e 100644
--- a/xmrstak/misc/home_dir.hpp
+++ b/xmrstak/misc/home_dir.hpp
@@ -4,39 +4,40 @@
 
 #ifdef _WIN32
 #include <WinSock2.h>
+// this comment keeps clang-format from reordering the includes around it
 #include <Shlobj.h>
 
 namespace
 {
-	inline std::string get_home()
+inline std::string get_home()
+{
+	char path[MAX_PATH + 1];
+	// get folder "appdata\local"
+	if(SHGetSpecialFolderPathA(HWND_DESKTOP, path, CSIDL_LOCAL_APPDATA, FALSE))
 	{
-		char path[MAX_PATH + 1];
-		// get folder "appdata\local"
-		if (SHGetSpecialFolderPathA(HWND_DESKTOP, path, CSIDL_LOCAL_APPDATA, FALSE))
-		{
-			return path;
-		}
-		else
-			return ".";
+		return path;
 	}
-} // namespace anonymous
+	else
+		return ".";
+}
+} // namespace
 
 #else
-#include <unistd.h>
-#include <pwd.h>
 #include <cstdlib>
+#include <pwd.h>
+#include <unistd.h>
 
 namespace
 {
-	inline std::string get_home()
-	{
-		const char *home = ".";
+inline std::string get_home()
+{
+	const char* home = ".";
 
-		if ((home = getenv("HOME")) == nullptr)
-			home = getpwuid(getuid())->pw_dir;
+	if((home = getenv("HOME")) == nullptr)
+		home = getpwuid(getuid())->pw_dir;
 
-		return home;
-	}
-} // namespace anonymous
+	return home;
+}
+} // namespace
 
 #endif // _WIN32
diff --git a/xmrstak/misc/jext.hpp b/xmrstak/misc/jext.hpp
index 9936fa813..421508989 100644
--- a/xmrstak/misc/jext.hpp
+++ b/xmrstak/misc/jext.hpp
@@ -9,7 +9,7 @@ using namespace rapidjson;
 inline const Value* GetObjectMember(const Value& obj, const char* key)
 {
 	Value::ConstMemberIterator itr = obj.FindMember(key);
-	if (itr != obj.MemberEnd())
+	if(itr != obj.MemberEnd())
 		return &itr->value;
 	else
 		return nullptr;
@@ -48,8 +48,8 @@ inline const Value* GetObjectMember(const Value& obj, const char* key)
 
 #elif defined(__NetBSD__)
 
-#include <sys/types.h>
 #include <machine/bswap.h>
+#include <sys/types.h>
 #if defined(__BSWAP_RENAME) && !defined(__bswap_32)
 #define bswap_32(x) bswap32(x)
 #define bswap_64(x) bswap64(x)
diff --git a/xmrstak/misc/telemetry.cpp b/xmrstak/misc/telemetry.cpp
index 47442df09..ea73e6281 100644
--- a/xmrstak/misc/telemetry.cpp
+++ b/xmrstak/misc/telemetry.cpp
@@ -24,9 +24,9 @@
 #include "telemetry.hpp"
 #include "xmrstak/net/msgstruct.hpp"
 
+#include <chrono>
 #include <cmath>
 #include <cstring>
-#include <chrono>
 
 namespace xmrstak
 {
@@ -36,9 +36,8 @@ telemetry::telemetry(size_t iThd)
 	ppHashCounts = new uint64_t*[iThd];
 	ppTimestamps = new uint64_t*[iThd];
 	iBucketTop = new uint32_t[iThd];
-	mtx = new std::mutex[iThd];
 
-	for (size_t i = 0; i < iThd; i++)
+	for(size_t i = 0; i < iThd; i++)
 	{
 		ppHashCounts[i] = new uint64_t[iBucketSize];
 		ppTimestamps[i] = new uint64_t[iBucketSize];
@@ -51,31 +50,29 @@ telemetry::telemetry(size_t iThd)
 double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread)
 {
 
-
 	uint64_t iEarliestHashCnt = 0;
 	uint64_t iEarliestStamp = 0;
 	uint64_t iLatestStamp = 0;
 	uint64_t iLatestHashCnt = 0;
 	bool bHaveFullSet = false;
 
-	std::unique_lock<std::mutex> lk(mtx[iThread]);
 	uint64_t iTimeNow = get_timestamp_ms();
 
 	//Start at 1, buckettop points to next empty
-	for (size_t i = 1; i < iBucketSize; i++)
+	for(size_t i = 1; i < iBucketSize; i++)
 	{
 		size_t idx = (iBucketTop[iThread] - i) & iBucketMask; //overflow expected here
 
-		if (ppTimestamps[iThread][idx] == 0)
+		if(ppTimestamps[iThread][idx] == 0)
 			break; //That means we don't have the data yet
 
-		if (iLatestStamp == 0)
+		if(iLatestStamp == 0)
 		{
 			iLatestStamp = ppTimestamps[iThread][idx];
 			iLatestHashCnt = ppHashCounts[iThread][idx];
 		}
 
-		if (iTimeNow - ppTimestamps[iThread][idx] > iLastMillisec)
+		if(iTimeNow - ppTimestamps[iThread][idx] > iLastMillisec)
 		{
 			bHaveFullSet = true;
 			break; //We are out of the requested time period
@@ -84,13 +81,12 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread)
 		iEarliestStamp = ppTimestamps[iThread][idx];
 		iEarliestHashCnt = ppHashCounts[iThread][idx];
 	}
-	lk.unlock();
 
-	if (!bHaveFullSet || iEarliestStamp == 0 || iLatestStamp == 0)
+	if(!bHaveFullSet || iEarliestStamp == 0 || iLatestStamp == 0)
 		return nan("");
 
 	//Don't think that can happen, but just in case
-	if (iLatestStamp - iEarliestStamp == 0)
+	if(iLatestStamp - iEarliestStamp == 0)
 		return nan("");
 
 	double fHashes, fTime;
@@ -103,7 +99,6 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread)
 
 void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp)
 {
-	std::unique_lock<std::mutex> lk(mtx[iThd]);
 	size_t iTop = iBucketTop[iThd];
 	ppHashCounts[iThd][iTop] = iHashCount;
 	ppTimestamps[iThd][iTop] = iTimestamp;
diff --git a/xmrstak/misc/telemetry.hpp b/xmrstak/misc/telemetry.hpp
index 580565de2..fb87bcd32 100644
--- a/xmrstak/misc/telemetry.hpp
+++ b/xmrstak/misc/telemetry.hpp
@@ -2,20 +2,18 @@
 
 #include <cstdint>
 #include <cstring>
-#include <mutex>
 
 namespace xmrstak
 {
 
 class telemetry
 {
-public:
+  public:
 	telemetry(size_t iThd);
 	void push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp);
 	double calc_telemetry_data(size_t iLastMillisec, size_t iThread);
 
-private:
-	std::mutex* mtx;
+  private:
 	constexpr static size_t iBucketSize = 2 << 11; //Power of 2 to simplify calculations
 	constexpr static size_t iBucketMask = iBucketSize - 1;
 	uint32_t* iBucketTop;
diff --git a/xmrstak/misc/thdq.hpp b/xmrstak/misc/thdq.hpp
index 7a4a5cfe4..2eef30bcf 100644
--- a/xmrstak/misc/thdq.hpp
+++ b/xmrstak/misc/thdq.hpp
@@ -1,31 +1,37 @@
 #pragma once
-
+
+#include <condition_variable>
+#include <mutex>
 #include <queue>
 #include <thread>
-#include <mutex>
-#include <condition_variable>
-
+
 template <typename T>
 class thdq
 {
-public:
+  public:
 	T pop()
 	{
 		std::unique_lock<std::mutex> mlock(mutex_);
-		while (queue_.empty()) { cond_.wait(mlock); }
+		while(queue_.empty())
+		{
+			cond_.wait(mlock);
+		}
 		auto item = std::move(queue_.front());
 		queue_.pop();
 		return item;
 	}
-
+
 	void pop(T& item)
 	{
 		std::unique_lock<std::mutex> mlock(mutex_);
-		while (queue_.empty()) { cond_.wait(mlock); }
+		while(queue_.empty())
+		{
+			cond_.wait(mlock);
+		}
 		item = queue_.front();
 		queue_.pop();
 	}
-
+
 	void push(const T& item)
 	{
 		std::unique_lock<std::mutex> mlock(mutex_);
@@ -33,7 +39,7 @@ class thdq
 		mlock.unlock();
 		cond_.notify_one();
 	}
-
+
 	void push(T&& item)
 	{
 		std::unique_lock<std::mutex> mlock(mutex_);
@@ -41,9 +47,9 @@ class thdq
 		mlock.unlock();
 		cond_.notify_one();
 	}
-
-private:
+
+  private:
 	std::queue<T> queue_;
 	std::mutex mutex_;
 	std::condition_variable cond_;
-};
+};
diff --git a/xmrstak/misc/uac.cpp b/xmrstak/misc/uac.cpp
index 9f940933c..0e4f91c7b 100644
--- a/xmrstak/misc/uac.cpp
+++ b/xmrstak/misc/uac.cpp
@@ -1,6 +1,7 @@
 #ifdef _WIN32
 #include "xmrstak/misc/console.hpp"
 #include "xmrstak/params.hpp"
+#include "xmrstak/jconf.hpp"
 
 #include <string>
 #include <windows.h>
@@ -9,24 +10,24 @@ BOOL IsElevated()
 {
 	BOOL fRet = FALSE;
 	HANDLE hToken = NULL;
-	if (OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken))
+	if(OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken))
 	{
 		TOKEN_ELEVATION Elevation;
 		DWORD cbSize = sizeof(TOKEN_ELEVATION);
-		if (GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize))
+		if(GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize))
 			fRet = Elevation.TokenIsElevated;
 	}
-	if (hToken)
+	if(hToken)
 		CloseHandle(hToken);
 	return fRet;
 }
 
 BOOL SelfElevate(const std::string& my_path, const std::string& params)
 {
-	if (IsElevated())
+	if(IsElevated())
 		return FALSE;
 
-	SHELLEXECUTEINFO shExecInfo = { 0 };
+	SHELLEXECUTEINFO shExecInfo = {0};
 	shExecInfo.cbSize = sizeof(SHELLEXECUTEINFO);
 	shExecInfo.fMask = SEE_MASK_NOCLOSEPROCESS;
 	shExecInfo.hwnd = NULL;
@@ -37,7 +38,7 @@ BOOL SelfElevate(const std::string& my_path, const std::string& params)
 	shExecInfo.nShow = SW_SHOW;
 	shExecInfo.hInstApp = NULL;
 
-	if (!ShellExecuteEx(&shExecInfo))
+	if(!ShellExecuteEx(&shExecInfo))
 		return FALSE;
 
 	// Loiter in the background to make scripting easier
@@ -56,6 +57,9 @@ VOID RequestElevation()
 	if(!xmrstak::params::inst().allowUAC)
 	{
 		printer::inst()->print_msg(L0, "The miner needs to run as administrator, but you passed --noUAC option. Please remove it or set use_slow_memory to always.");
+		if(::jconf::inst()->GetSlowMemSetting() == ::jconf::print_warning)
+			return; // warning-only setting: skip the win_exit() below
+
 		win_exit();
 		return;
 	}
@@ -65,13 +69,13 @@ VOID RequestElevation()
 
 BOOL IsWindows10OrNewer()
 {
-    OSVERSIONINFOEX osvi = { 0 };
-    osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
-    osvi.dwMajorVersion = 10;
-    osvi.dwMinorVersion = 0;
-    DWORDLONG dwlConditionMask = 0;
-    VER_SET_CONDITION(dwlConditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL);
-    VER_SET_CONDITION(dwlConditionMask, VER_MINORVERSION, VER_GREATER_EQUAL);
-    return ::VerifyVersionInfo(&osvi, VER_MAJORVERSION | VER_MINORVERSION, dwlConditionMask);
+	OSVERSIONINFOEX osvi = {0};
+	osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
+	osvi.dwMajorVersion = 10;
+	osvi.dwMinorVersion = 0;
+	DWORDLONG dwlConditionMask = 0;
+	VER_SET_CONDITION(dwlConditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL);
+	VER_SET_CONDITION(dwlConditionMask, VER_MINORVERSION, VER_GREATER_EQUAL);
+	return ::VerifyVersionInfo(&osvi, VER_MAJORVERSION | VER_MINORVERSION, dwlConditionMask);
 }
 #endif
diff --git a/xmrstak/misc/utility.cpp b/xmrstak/misc/utility.cpp
index 5177d14c2..bf665bda3 100644
--- a/xmrstak/misc/utility.cpp
+++ b/xmrstak/misc/utility.cpp
@@ -1,21 +1,15 @@
-#include <string>
 #include <algorithm>
-
+#include <string>
 
 namespace xmrstak
 {
-	bool strcmp_i(const std::string& str1, const std::string& str2)
-	{
-		if(str1.size() != str2.size())
-			return false;
-		else
-		return (str1.empty() | str2.empty()) ?
-				false :
-				std::equal(str1.begin(), str1.end(),str2.begin(),
-					[](char c1, char c2)
-					{
-						return ::tolower(c1) == ::tolower(c2);
-					}
-				);
-	}
+bool strcmp_i(const std::string& str1, const std::string& str2)
+{
+	// Different lengths never match; two empty strings are reported as unequal (existing behaviour).
+	if(str1.size() != str2.size() || str1.empty())
+		return false;
+	return std::equal(str1.begin(), str1.end(), str2.begin(), [](unsigned char c1, unsigned char c2) {
+		return ::tolower(c1) == ::tolower(c2); // unsigned char: tolower() is undefined for negative values
+	});
+}
 } // namespace xmrstak
diff --git a/xmrstak/misc/utility.hpp b/xmrstak/misc/utility.hpp
index 8f2e99fb8..0eb08993d 100644
--- a/xmrstak/misc/utility.hpp
+++ b/xmrstak/misc/utility.hpp
@@ -4,9 +4,9 @@
 
 namespace xmrstak
 {
-	/** case insensitive string compare
+/** case insensitive string compare
 	 *
 	 * @return true if both strings are equal, else false
 	 */
-	bool strcmp_i(const std::string& str1, const std::string& str2);
+bool strcmp_i(const std::string& str1, const std::string& str2);
 } // namespace xmrstak
diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp
index 786b18b4f..f9522962f 100644
--- a/xmrstak/net/jpsock.cpp
+++ b/xmrstak/net/jpsock.cpp
@@ -21,17 +21,17 @@
   *
   */
 
-#include <stdarg.h>
-#include <assert.h>
 #include <algorithm>
+#include <assert.h>
 #include <chrono>
+#include <stdarg.h>
 
 #include "jpsock.hpp"
-#include "socks.hpp"
 #include "socket.hpp"
+#include "socks.hpp"
 
-#include "xmrstak/misc/executor.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/executor.hpp"
 #include "xmrstak/misc/jext.hpp"
 #include "xmrstak/version.hpp"
 
@@ -45,7 +45,9 @@ struct jpsock::call_rsp
 	std::string sCallErr;
 	uint64_t iMessageId;
 
-	call_rsp(Value* val) : pCallData(val), iMessageId(0)
+	call_rsp(Value* val) :
+		pCallData(val),
+		iMessageId(0)
 	{
 		bHaveResponse = false;
 		iCallId = 0;
@@ -70,7 +72,7 @@ typedef GenericDocument<UTF8<>, MemoryPoolAllocator<>, MemoryPoolAllocator<>> Me
 
 struct jpsock::opaque_private
 {
-	Value  oCallValue;
+	Value oCallValue;
 
 	MemoryPoolAllocator<> callAllocator;
 	MemoryPoolAllocator<> recvAllocator;
@@ -91,12 +93,24 @@ struct jpsock::opaque_private
 struct jpsock::opq_json_val
 {
 	const Value* val;
-	opq_json_val(const Value* val) : val(val) {}
+	opq_json_val(const Value* val) :
+		val(val) {}
 };
 
 jpsock::jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sRigId, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash) :
-	net_addr(sAddr), usr_login(sLogin), usr_rigid(sRigId), usr_pass(sPassword), tls_fp(tls_fp), pool_id(id), pool_weight(pool_weight), pool(dev_pool), nicehash(nicehash),
-	connect_time(0), connect_attempts(0), disconnect_time(0), quiet_close(false)
+	net_addr(sAddr),
+	usr_login(sLogin),
+	usr_rigid(sRigId),
+	usr_pass(sPassword),
+	tls_fp(tls_fp),
+	pool_id(id),
+	pool_weight(pool_weight),
+	pool(dev_pool),
+	nicehash(nicehash),
+	connect_time(0),
+	connect_attempts(0),
+	disconnect_time(0),
+	quiet_close(false)
 {
 	sock_init();
 
@@ -245,7 +259,7 @@ bool jpsock::jpsock_thd_main()
 
 	char buf[iSockBufferSize];
 	size_t datalen = 0;
-	while (true)
+	while(true)
 	{
 		int ret = sck->recv(buf + datalen, sizeof(buf) - datalen);
 
@@ -254,7 +268,7 @@ bool jpsock::jpsock_thd_main()
 
 		datalen += ret;
 
-		if (datalen >= sizeof(buf))
+		if(datalen >= sizeof(buf))
 		{
 			sck->close(false);
 			return set_socket_error("RECEIVE error: data overflow");
@@ -262,12 +276,12 @@ bool jpsock::jpsock_thd_main()
 
 		char* lnend;
 		char* lnstart = buf;
-		while ((lnend = (char*)memchr(lnstart, '\n', datalen)) != nullptr)
+		while((lnend = (char*)memchr(lnstart, '\n', datalen)) != nullptr)
 		{
 			lnend++;
 			int lnlen = lnend - lnstart;
 
-			if (!process_line(lnstart, lnlen))
+			if(!process_line(lnstart, lnlen))
 			{
 				sck->close(false);
 				return false;
@@ -278,7 +292,7 @@ bool jpsock::jpsock_thd_main()
 		}
 
 		//Got leftover data? Move it to the front
-		if (datalen > 0 && buf != lnstart)
+		if(datalen > 0 && buf != lnstart)
 			memmove(buf, lnstart, datalen);
 	}
 }
@@ -291,18 +305,18 @@ bool jpsock::process_line(char* line, size_t len)
 	++iMessageCnt;
 
 	/*NULL terminate the line instead of '\n', parsing will add some more NULLs*/
-	line[len-1] = '\0';
+	line[len - 1] = '\0';
 
 	//printf("RECV: %s\n", line);
 
-	if (prv->jsonDoc.ParseInsitu(line).HasParseError())
+	if(prv->jsonDoc.ParseInsitu(line).HasParseError())
 		return set_socket_error("PARSE error: Invalid JSON");
 
-	if (!prv->jsonDoc.IsObject())
+	if(!prv->jsonDoc.IsObject())
 		return set_socket_error("PARSE error: Invalid root");
 
 	const Value* mt;
-	if (prv->jsonDoc.HasMember("method"))
+	if(prv->jsonDoc.HasMember("method"))
 	{
 		mt = GetObjectMember(prv->jsonDoc, "method");
 
@@ -329,7 +343,7 @@ bool jpsock::process_line(char* line, size_t len)
 	{
 		uint64_t iCallId;
 		mt = GetObjectMember(prv->jsonDoc, "id");
-		if (mt == nullptr || !mt->IsUint64())
+		if(mt == nullptr || !mt->IsUint64())
 			return set_socket_error("PARSE error: Protocol error 3");
 
 		iCallId = mt->GetUint64();
@@ -338,10 +352,10 @@ bool jpsock::process_line(char* line, size_t len)
 
 		const char* sError = nullptr;
 		size_t iErrorLen = 0;
-		if (mt == nullptr || mt->IsNull())
+		if(mt == nullptr || mt->IsNull())
 		{
 			/* If there was no error we need a result */
-			if ((mt = GetObjectMember(prv->jsonDoc, "result")) == nullptr)
+			if((mt = GetObjectMember(prv->jsonDoc, "result")) == nullptr)
 				return set_socket_error("PARSE error: Protocol error 7");
 		}
 		else
@@ -359,7 +373,7 @@ bool jpsock::process_line(char* line, size_t len)
 		}
 
 		std::unique_lock<std::mutex> mlock(call_mutex);
-		if (prv->oCallRsp.pCallData == nullptr)
+		if(prv->oCallRsp.pCallData == nullptr)
 		{
 			/*Server sent us a call reply without us making a call*/
 			mlock.unlock();
@@ -400,7 +414,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 
 	mlock.unlock();
 
-	if (!params->val->IsObject())
+	if(!params->val->IsObject())
 		return set_socket_error("PARSE error: Job error 1");
 
 	const Value *blob, *jobid, *target, *motd, *blk_height;
@@ -410,7 +424,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 	motd = GetObjectMember(*params->val, "motd");
 	blk_height = GetObjectMember(*params->val, "height");
 
-	if (jobid == nullptr || blob == nullptr || target == nullptr ||
+	if(jobid == nullptr || blob == nullptr || target == nullptr ||
 		!jobid->IsString() || !blob->IsString() || !target->IsString())
 	{
 		return set_socket_error("PARSE error: Job error 2");
@@ -421,7 +435,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 		std::unique_lock<std::mutex> lck(motd_mutex);
 		if(motd->GetStringLength() > 0)
 		{
-			pool_motd.resize(motd->GetStringLength()/2 + 1);
+			pool_motd.resize(motd->GetStringLength() / 2 + 1);
 			if(!hex2bin(motd->GetString(), motd->GetStringLength(), (unsigned char*)&pool_motd.front()))
 				pool_motd.clear();
 		}
@@ -429,7 +443,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 			pool_motd.clear();
 	}
 
-	if (jobid->GetStringLength() >= sizeof(pool_job::sJobID)) // Note >=
+	if(jobid->GetStringLength() >= sizeof(pool_job::sJobID)) // Note >=
 		return set_socket_error("PARSE error: Job error 3");
 
 	pool_job oPoolJob;
@@ -437,10 +451,10 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 	const uint32_t iWorkLen = blob->GetStringLength() / 2;
 	oPoolJob.iWorkLen = iWorkLen;
 
-	if (iWorkLen > sizeof(pool_job::bWorkBlob))
+	if(iWorkLen > sizeof(pool_job::bWorkBlob))
 		return set_socket_error("PARSE error: Invalid job length. Are you sure you are mining the correct coin?");
 
-	if (!hex2bin(blob->GetString(), iWorkLen * 2, oPoolJob.bWorkBlob))
+	if(!hex2bin(blob->GetString(), iWorkLen * 2, oPoolJob.bWorkBlob))
 		return set_socket_error("PARSE error: Job error 4");
 
 	// lock reading of oCurrentJob
@@ -479,7 +493,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 		return set_socket_error("PARSE error: Job error 5");
 
 	iJobDiff = t64_to_diff(oPoolJob.iTarget);
-	
+
 	if(blk_height != nullptr && blk_height->IsUint64())
 		oPoolJob.iBlockHeight = bswap_64(blk_height->GetUint64());
 
@@ -589,10 +603,10 @@ bool jpsock::cmd_login()
 	uint64_t messageId = 0;
 
 	/*Normal error conditions (failed login etc..) will end here*/
-	if (!cmd_ret_wait(cmd_buffer, oResult, messageId))
+	if(!cmd_ret_wait(cmd_buffer, oResult, messageId))
 		return false;
 
-	if (!oResult.val->IsObject())
+	if(!oResult.val->IsObject())
 	{
 		set_socket_error("PARSE error: Login protocol error 1");
 		disconnect();
@@ -603,14 +617,14 @@ bool jpsock::cmd_login()
 	const Value* job = GetObjectMember(*oResult.val, "job");
 	const Value* ext = GetObjectMember(*oResult.val, "extensions");
 
-	if (id == nullptr || job == nullptr || !id->IsString())
+	if(id == nullptr || job == nullptr || !id->IsString())
 	{
 		set_socket_error("PARSE error: Login protocol error 2");
 		disconnect();
 		return false;
 	}
 
-	if (id->GetStringLength() >= sizeof(sMinerId))
+	if(id->GetStringLength() >= sizeof(sMinerId))
 	{
 		set_socket_error("PARSE error: Login protocol error 3");
 		disconnect();
@@ -622,7 +636,7 @@ bool jpsock::cmd_login()
 
 	if(ext != nullptr && ext->IsArray())
 	{
-		for(size_t i=0; i < ext->Size(); i++)
+		for(size_t i = 0; i < ext->Size(); i++)
 		{
 			const Value& jextname = ext->GetArray()[i];
 
@@ -693,7 +707,7 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 	sResult[64] = '\0';
 
 	snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s%s%s%s%s},\"id\":1}\n",
-		sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations,sMemory, sMemAlignBytes);
+		sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations, sMemory, sMemAlignBytes);
 
 	uint64_t messageId = 0;
 	opq_json_val oResult(nullptr);
@@ -732,13 +746,13 @@ bool jpsock::get_pool_motd(std::string& strin)
 	return false;
 }
 
-inline unsigned char hf_hex2bin(char c, bool &err)
+inline unsigned char hf_hex2bin(char c, bool& err)
 {
-	if (c >= '0' && c <= '9')
+	if(c >= '0' && c <= '9')
 		return c - '0';
-	else if (c >= 'a' && c <= 'f')
+	else if(c >= 'a' && c <= 'f')
 		return c - 'a' + 0xA;
-	else if (c >= 'A' && c <= 'F')
+	else if(c >= 'A' && c <= 'F')
 		return c - 'A' + 0xA;
 
 	err = true;
@@ -748,17 +762,18 @@ inline unsigned char hf_hex2bin(char c, bool &err)
 bool jpsock::hex2bin(const char* in, unsigned int len, unsigned char* out)
 {
 	bool error = false;
-	for (unsigned int i = 0; i < len; i += 2)
+	for(unsigned int i = 0; i < len; i += 2)
 	{
 		out[i / 2] = (hf_hex2bin(in[i], error) << 4) | hf_hex2bin(in[i + 1], error);
-		if (error) return false;
+		if(error)
+			return false;
 	}
 	return true;
 }
 
 inline char hf_bin2hex(unsigned char c)
 {
-	if (c <= 0x9)
+	if(c <= 0x9)
 		return '0' + c;
 	else
 		return 'a' - 0xA + c;
@@ -766,7 +781,7 @@ inline char hf_bin2hex(unsigned char c)
 
 void jpsock::bin2hex(const unsigned char* in, unsigned int len, char* out)
 {
-	for (unsigned int i = 0; i < len; i++)
+	for(unsigned int i = 0; i < len; i++)
 	{
 		out[i * 2] = hf_bin2hex((in[i] & 0xF0) >> 4);
 		out[i * 2 + 1] = hf_bin2hex(in[i] & 0x0F);
diff --git a/xmrstak/net/jpsock.hpp b/xmrstak/net/jpsock.hpp
index 949764813..4ad6ebbbc 100644
--- a/xmrstak/net/jpsock.hpp
+++ b/xmrstak/net/jpsock.hpp
@@ -1,15 +1,14 @@
 #pragma once
 
-#include "xmrstak/backend/iBackend.hpp"
 #include "msgstruct.hpp"
+#include "xmrstak/backend/iBackend.hpp"
 #include "xmrstak/jconf.hpp"
 
-#include <mutex>
 #include <atomic>
 #include <condition_variable>
-#include <thread>
+#include <mutex>
 #include <string>
-
+#include <thread>
 
 /* Our pool can have two kinds of errors:
 	- Parsing or connection error
@@ -27,7 +26,7 @@ class base_socket;
 
 class jpsock
 {
-public:
+  public:
 	jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sRigId, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash);
 	~jpsock();
 
@@ -55,7 +54,12 @@ class jpsock
 	inline bool is_logged_in() { return bLoggedIn; }
 	inline bool is_dev_pool() { return pool; }
 	inline size_t get_pool_id() { return pool_id; }
-	inline bool get_disconnects(size_t& att, size_t& time) { att = connect_attempts; time = disconnect_time != 0 ? get_timestamp() - disconnect_time + 1 : 0; return pool && usr_login[0]; }
+	inline bool get_disconnects(size_t& att, size_t& time)
+	{
+		att = connect_attempts;
+		time = disconnect_time != 0 ? get_timestamp() - disconnect_time + 1 : 0;
+		return pool && usr_login[0];
+	}
 	inline const char* get_pool_addr() { return net_addr.c_str(); }
 	inline const char* get_tls_fp() { return tls_fp.c_str(); }
 	inline const char* get_rigid() { return usr_rigid.c_str(); }
@@ -77,7 +81,7 @@ class jpsock
 	bool set_socket_error_strerr(const char* a);
 	bool set_socket_error_strerr(const char* a, int res);
 
-private:
+  private:
 	std::string net_addr;
 	std::string usr_login;
 	std::string usr_rigid;
@@ -142,4 +146,3 @@ class jpsock
 	uint64_t iMessageCnt = 0;
 	uint64_t iLastMessageId = 0;
 };
-
diff --git a/xmrstak/net/msgstruct.hpp b/xmrstak/net/msgstruct.hpp
index 33980bf42..3cfce3c6f 100644
--- a/xmrstak/net/msgstruct.hpp
+++ b/xmrstak/net/msgstruct.hpp
@@ -2,25 +2,29 @@
 
 #include "xmrstak/backend/cryptonight.hpp"
 
-#include <string>
-#include <string.h>
 #include <assert.h>
+#include <string.h>
+#include <string>
 
 // Structures that we use to pass info between threads constructors are here just to make
 // the stack allocation take up less space, heap is a shared resource that needs locks too of course
 
 struct pool_job
 {
-	char		sJobID[64];
-	uint8_t		bWorkBlob[128];
-	uint64_t	iTarget;
-	uint32_t	iWorkLen;
-	uint32_t	iSavedNonce;
-	uint64_t	iBlockHeight = uint64_t(-1);
-
-	pool_job() : iWorkLen(0), iSavedNonce(0) {}
+	char sJobID[64];
+	uint8_t bWorkBlob[128];
+	uint64_t iTarget;
+	uint32_t iWorkLen;
+	uint32_t iSavedNonce;
+	uint64_t iBlockHeight = uint64_t(-1);
+
+	pool_job() :
+		iWorkLen(0),
+		iSavedNonce(0) {}
 	pool_job(const char* sJobID, uint64_t iTarget, const uint8_t* bWorkBlob, uint32_t iWorkLen) :
-		iTarget(iTarget), iWorkLen(iWorkLen), iSavedNonce(0)
+		iTarget(iTarget),
+		iWorkLen(iWorkLen),
+		iSavedNonce(0)
 	{
 		assert(iWorkLen <= sizeof(pool_job::bWorkBlob));
 		memcpy(this->sJobID, sJobID, sizeof(pool_job::sJobID));
@@ -30,15 +34,17 @@ struct pool_job
 
 struct job_result
 {
-	uint8_t		bResult[32];
-	char		sJobID[64];
-	uint32_t	iNonce;
-	uint32_t	iThreadId;
+	uint8_t bResult[32];
+	char sJobID[64];
+	uint32_t iNonce;
+	uint32_t iThreadId;
 	xmrstak_algo algorithm = {invalid_algo};
 
 	job_result() {}
 	job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId, const xmrstak_algo& algo) :
-		iNonce(iNonce), iThreadId(iThreadId), algorithm(algo)
+		iNonce(iNonce),
+		iThreadId(iThreadId),
+		algorithm(algo)
 	{
 		memcpy(this->sJobID, sJobID, sizeof(job_result::sJobID));
 		memcpy(this->bResult, bResult, sizeof(job_result::bResult));
@@ -51,8 +57,12 @@ struct sock_err
 	bool silent;
 
 	sock_err() {}
-	sock_err(std::string&& err, bool silent) : sSocketError(std::move(err)), silent(silent) { }
-	sock_err(sock_err&& from) : sSocketError(std::move(from.sSocketError)), silent(from.silent) {}
+	sock_err(std::string&& err, bool silent) :
+		sSocketError(std::move(err)),
+		silent(silent) {}
+	sock_err(sock_err&& from) :
+		sSocketError(std::move(from.sSocketError)),
+		silent(from.silent) {}
 
 	sock_err& operator=(sock_err&& from)
 	{
@@ -62,7 +72,7 @@ struct sock_err
 		return *this;
 	}
 
-	~sock_err() { }
+	~sock_err() {}
 
 	sock_err(sock_err const&) = delete;
 	sock_err& operator=(sock_err const&) = delete;
@@ -73,13 +83,30 @@ struct gpu_res_err
 {
 	size_t idx; // GPU index
 	const char* error_str;
-	gpu_res_err(const char* error_str, size_t idx) : error_str(error_str), idx(idx) {}
+	gpu_res_err(const char* error_str, size_t idx) :
+		idx(idx), // listed in member declaration order (idx, then error_str)
+		error_str(error_str) {}
 };
 
-enum ex_event_name { EV_INVALID_VAL, EV_SOCK_READY, EV_SOCK_ERROR, EV_GPU_RES_ERROR,
-	EV_POOL_HAVE_JOB, EV_MINER_HAVE_RESULT, EV_PERF_TICK, EV_EVAL_POOL_CHOICE,
-	EV_USR_HASHRATE, EV_USR_RESULTS, EV_USR_CONNSTAT, EV_HASHRATE_LOOP,
-	EV_HTML_HASHRATE, EV_HTML_RESULTS, EV_HTML_CONNSTAT, EV_HTML_JSON };
+enum ex_event_name
+{
+	EV_INVALID_VAL,
+	EV_SOCK_READY,
+	EV_SOCK_ERROR,
+	EV_GPU_RES_ERROR,
+	EV_POOL_HAVE_JOB,
+	EV_MINER_HAVE_RESULT,
+	EV_PERF_TICK,
+	EV_EVAL_POOL_CHOICE,
+	EV_USR_HASHRATE,
+	EV_USR_RESULTS,
+	EV_USR_CONNSTAT,
+	EV_HASHRATE_LOOP,
+	EV_HTML_HASHRATE,
+	EV_HTML_RESULTS,
+	EV_HTML_CONNSTAT,
+	EV_HTML_JSON
+};
 
 /*
    This is how I learned to stop worrying and love c++11 =).
@@ -96,20 +123,37 @@ struct ex_event
 	ex_event_name iName;
 	size_t iPoolId;
 
-	union
-	{
+	union {
 		pool_job oPoolJob;
 		job_result oJobResult;
 		sock_err oSocketError;
 		gpu_res_err oGpuError;
 	};
 
-	ex_event() { iName = EV_INVALID_VAL; iPoolId = 0;}
-	ex_event(const char* gpu_err, size_t gpu_idx, size_t id) : iName(EV_GPU_RES_ERROR), iPoolId(id), oGpuError(gpu_err, gpu_idx) {}
-	ex_event(std::string&& err, bool silent, size_t id) : iName(EV_SOCK_ERROR), iPoolId(id), oSocketError(std::move(err), silent) { }
-	ex_event(job_result dat, size_t id) : iName(EV_MINER_HAVE_RESULT), iPoolId(id), oJobResult(dat) {}
-	ex_event(pool_job dat, size_t id) : iName(EV_POOL_HAVE_JOB), iPoolId(id), oPoolJob(dat) {}
-	ex_event(ex_event_name ev, size_t id = 0) : iName(ev), iPoolId(id) {}
+	ex_event()
+	{
+		iName = EV_INVALID_VAL;
+		iPoolId = 0;
+	}
+	ex_event(const char* gpu_err, size_t gpu_idx, size_t id) :
+		iName(EV_GPU_RES_ERROR),
+		iPoolId(id),
+		oGpuError(gpu_err, gpu_idx) {}
+	ex_event(std::string&& err, bool silent, size_t id) :
+		iName(EV_SOCK_ERROR),
+		iPoolId(id),
+		oSocketError(std::move(err), silent) {}
+	ex_event(job_result dat, size_t id) :
+		iName(EV_MINER_HAVE_RESULT),
+		iPoolId(id),
+		oJobResult(dat) {}
+	ex_event(pool_job dat, size_t id) :
+		iName(EV_POOL_HAVE_JOB),
+		iPoolId(id),
+		oPoolJob(dat) {}
+	ex_event(ex_event_name ev, size_t id = 0) :
+		iName(ev),
+		iPoolId(id) {}
 
 	// Delete the copy operators to make sure we are moving only what is needed
 	ex_event(ex_event const&) = delete;
@@ -123,7 +167,7 @@ struct ex_event
 		switch(iName)
 		{
 		case EV_SOCK_ERROR:
-			new (&oSocketError) sock_err(std::move(from.oSocketError));
+			new(&oSocketError) sock_err(std::move(from.oSocketError));
 			break;
 		case EV_MINER_HAVE_RESULT:
 			oJobResult = from.oJobResult;
@@ -151,7 +195,7 @@ struct ex_event
 		switch(iName)
 		{
 		case EV_SOCK_ERROR:
-			new (&oSocketError) sock_err();
+			new(&oSocketError) sock_err();
 			oSocketError = std::move(from.oSocketError);
 			break;
 		case EV_MINER_HAVE_RESULT:
diff --git a/xmrstak/net/socket.cpp b/xmrstak/net/socket.cpp
index 6fcb454cd..6a6abac15 100644
--- a/xmrstak/net/socket.cpp
+++ b/xmrstak/net/socket.cpp
@@ -28,16 +28,17 @@
 #include "xmrstak/misc/executor.hpp"
 
 #ifndef CONF_NO_TLS
-#include <openssl/ssl.h>
 #include <openssl/err.h>
 #include <openssl/opensslconf.h>
+#include <openssl/ssl.h>
 
 #ifndef OPENSSL_THREADS
 #error OpenSSL was compiled without thread support
 #endif
 #endif
 
-plain_socket::plain_socket(jpsock* err_callback) : pCallback(err_callback)
+plain_socket::plain_socket(jpsock* err_callback) :
+	pCallback(err_callback)
 {
 	hSocket = INVALID_SOCKET;
 	pSockAddr = nullptr;
@@ -50,58 +51,58 @@ bool plain_socket::set_hostname(const char* sAddr)
 
 	sock_closed = false;
 	size_t ln = strlen(sAddr);
-	if (ln >= sizeof(sAddrMb))
+	if(ln >= sizeof(sAddrMb))
 		return pCallback->set_socket_error("CONNECT error: Pool address overflow.");
 
 	memcpy(sAddrMb, sAddr, ln);
 	sAddrMb[ln] = '\0';
 
-	if ((sTmp = strstr(sAddrMb, "//")) != nullptr)
+	if((sTmp = strstr(sAddrMb, "//")) != nullptr)
 	{
 		sTmp += 2;
 		memmove(sAddrMb, sTmp, strlen(sTmp) + 1);
 	}
 
-	if ((sPort = strchr(sAddrMb, ':')) == nullptr)
+	if((sPort = strchr(sAddrMb, ':')) == nullptr)
 		return pCallback->set_socket_error("CONNECT error: Pool port number not specified, please use format <hostname>:<port>.");
 
 	sPort[0] = '\0';
 	sPort++;
 
-	addrinfo hints = { 0 };
+	addrinfo hints = {0};
 	hints.ai_family = AF_UNSPEC;
 	hints.ai_socktype = SOCK_STREAM;
 	hints.ai_protocol = IPPROTO_TCP;
 
 	pAddrRoot = nullptr;
 	int err;
-	if ((err = getaddrinfo(sAddrMb, sPort, &hints, &pAddrRoot)) != 0)
+	if((err = getaddrinfo(sAddrMb, sPort, &hints, &pAddrRoot)) != 0)
 		return pCallback->set_socket_error_strerr("CONNECT error: GetAddrInfo: ", err);
 
-	addrinfo *ptr = pAddrRoot;
+	addrinfo* ptr = pAddrRoot;
 	std::vector<addrinfo*> ipv4;
 	std::vector<addrinfo*> ipv6;
 
-	while (ptr != nullptr)
+	while(ptr != nullptr)
 	{
-		if (ptr->ai_family == AF_INET)
+		if(ptr->ai_family == AF_INET)
 			ipv4.push_back(ptr);
-		if (ptr->ai_family == AF_INET6)
+		if(ptr->ai_family == AF_INET6)
 			ipv6.push_back(ptr);
 		ptr = ptr->ai_next;
 	}
 
-	if (ipv4.empty() && ipv6.empty())
+	if(ipv4.empty() && ipv6.empty())
 	{
 		freeaddrinfo(pAddrRoot);
 		pAddrRoot = nullptr;
 		return pCallback->set_socket_error("CONNECT error: I found some DNS records but no IPv4 or IPv6 addresses.");
 	}
-	else if (!ipv4.empty() && ipv6.empty())
+	else if(!ipv4.empty() && ipv6.empty())
 		pSockAddr = ipv4[rand() % ipv4.size()];
-	else if (ipv4.empty() && !ipv6.empty())
+	else if(ipv4.empty() && !ipv6.empty())
 		pSockAddr = ipv6[rand() % ipv6.size()];
-	else if (!ipv4.empty() && !ipv6.empty())
+	else if(!ipv4.empty() && !ipv6.empty())
 	{
 		if(jconf::inst()->PreferIpv4())
 			pSockAddr = ipv4[rand() % ipv4.size()];
@@ -111,7 +112,7 @@ bool plain_socket::set_hostname(const char* sAddr)
 
 	hSocket = socket(pSockAddr->ai_family, pSockAddr->ai_socktype, pSockAddr->ai_protocol);
 
-	if (hSocket == INVALID_SOCKET)
+	if(hSocket == INVALID_SOCKET)
 	{
 		freeaddrinfo(pAddrRoot);
 		pAddrRoot = nullptr;
@@ -120,7 +121,7 @@ bool plain_socket::set_hostname(const char* sAddr)
 
 	int flag = 1;
 	/* If it fails, it fails, we won't loose too much sleep over it */
-	setsockopt(hSocket, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int));
+	setsockopt(hSocket, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(int));
 
 	return true;
 }
@@ -133,7 +134,7 @@ bool plain_socket::connect()
 	freeaddrinfo(pAddrRoot);
 	pAddrRoot = nullptr;
 
-	if (ret != 0)
+	if(ret != 0)
 		return pCallback->set_socket_error_strerr("CONNECT error: ");
 	else
 		return true;
@@ -158,10 +159,10 @@ bool plain_socket::send(const char* buf)
 {
 	size_t pos = 0;
 	size_t slen = strlen(buf);
-	while (pos != slen)
+	while(pos != slen)
 	{
 		int ret = ::send(hSocket, buf + pos, slen - pos, 0);
-		if (ret == SOCKET_ERROR)
+		if(ret == SOCKET_ERROR)
 		{
 			pCallback->set_socket_error_strerr("SEND error: ");
 			return false;
@@ -184,7 +185,8 @@ void plain_socket::close(bool free)
 }
 
 #ifndef CONF_NO_TLS
-tls_socket::tls_socket(jpsock* err_callback) : pCallback(err_callback)
+tls_socket::tls_socket(jpsock* err_callback) :
+	pCallback(err_callback)
 {
 }
 
@@ -193,7 +195,7 @@ void tls_socket::print_error()
 	BIO* err_bio = BIO_new(BIO_s_mem());
 	ERR_print_errors(err_bio);
 
-	char *buf = nullptr;
+	char* buf = nullptr;
 	size_t len = BIO_get_mem_data(err_bio, &buf);
 
 	if(buf == nullptr)
@@ -247,7 +249,7 @@ bool tls_socket::set_hostname(const char* sAddr)
 
 	int flag = 1;
 	/* If it fails, it fails, we won't loose too much sleep over it */
-	setsockopt(BIO_get_fd(bio, nullptr), IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int));
+	setsockopt(BIO_get_fd(bio, nullptr), IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(int));
 
 	if(BIO_set_conn_hostname(bio, sAddr) != 1)
 	{
@@ -327,7 +329,7 @@ bool tls_socket::connect()
 	BIO_flush(b64);
 
 	const char* conf_md = pCallback->get_tls_fp();
-	char *b64_md = nullptr;
+	char* b64_md = nullptr;
 	size_t b64_len = BIO_get_mem_data(bmem, &b64_md);
 
 	if(strlen(conf_md) == 0)
@@ -393,4 +395,3 @@ void tls_socket::close(bool free)
 	}
 }
 #endif
-
diff --git a/xmrstak/net/socket.hpp b/xmrstak/net/socket.hpp
index b09142d56..88b665adf 100644
--- a/xmrstak/net/socket.hpp
+++ b/xmrstak/net/socket.hpp
@@ -1,26 +1,26 @@
 #pragma once
 
-#include <atomic>
 #include "socks.hpp"
+#include <atomic>
 
 class jpsock;
 
 class base_socket
 {
-public:
+  public:
 	virtual bool set_hostname(const char* sAddr) = 0;
 	virtual bool connect() = 0;
 	virtual int recv(char* buf, unsigned int len) = 0;
 	virtual bool send(const char* buf) = 0;
 	virtual void close(bool free) = 0;
 
-protected:
+  protected:
 	std::atomic<bool> sock_closed;
 };
 
 class plain_socket : public base_socket
 {
-public:
+  public:
 	plain_socket(jpsock* err_callback);
 
 	bool set_hostname(const char* sAddr);
@@ -29,10 +29,10 @@ class plain_socket : public base_socket
 	bool send(const char* buf);
 	void close(bool free);
 
-private:
+  private:
 	jpsock* pCallback;
-	addrinfo *pSockAddr;
-	addrinfo *pAddrRoot;
+	addrinfo* pSockAddr;
+	addrinfo* pAddrRoot;
 	SOCKET hSocket;
 };
 
@@ -42,7 +42,7 @@ typedef struct ssl_st SSL;
 
 class tls_socket : public base_socket
 {
-public:
+  public:
 	tls_socket(jpsock* err_callback);
 
 	bool set_hostname(const char* sAddr);
@@ -51,7 +51,7 @@ class tls_socket : public base_socket
 	bool send(const char* buf);
 	void close(bool free);
 
-private:
+  private:
 	void init_ctx();
 	void print_error();
 
diff --git a/xmrstak/net/socks.hpp b/xmrstak/net/socks.hpp
index 86749e527..600e4d276 100644
--- a/xmrstak/net/socks.hpp
+++ b/xmrstak/net/socks.hpp
@@ -2,18 +2,19 @@
 
 #ifdef _WIN32
 #ifndef _WIN32_WINNT
-#define _WIN32_WINNT 0x0601  /* Windows 7 */
+#define _WIN32_WINNT 0x0601 /* Windows 7 */
 #endif
+
 #include <winsock2.h>
 #include <ws2tcpip.h>
+// this comment disables clang-format include reordering for windows.h
 #include <windows.h>
 
-
 inline void sock_init()
 {
 	static bool bWSAInit = false;
 
-	if (!bWSAInit)
+	if(!bWSAInit)
 	{
 		WSADATA wsaData;
 		WSAStartup(MAKEWORD(2, 2), &wsaData);
@@ -56,20 +57,20 @@ inline const char* sock_gai_strerror(int err, char* buf, size_t len)
 #else
 
 /* Assume that any non-Windows platform uses POSIX-style sockets instead. */
-#include <sys/socket.h>
 #include <arpa/inet.h>
-#include <netdb.h>  /* Needed for getaddrinfo() and freeaddrinfo() */
-#include <unistd.h> /* Needed for close() */
 #include <errno.h>
-#include <string.h>
+#include <netdb.h>		/* Needed for getaddrinfo() and freeaddrinfo() */
 #include <netinet/in.h> /* Needed for IPPROTO_TCP */
 #include <netinet/tcp.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h> /* Needed for close() */
 
 inline void sock_init() {}
 typedef int SOCKET;
 
-#define INVALID_SOCKET  (-1)
-#define SOCKET_ERROR    (-1)
+#define INVALID_SOCKET (-1)
+#define SOCKET_ERROR (-1)
 
 inline void sock_close(SOCKET s)
 {
diff --git a/xmrstak/params.hpp b/xmrstak/params.hpp
index 146aaa42f..5bfbac381 100644
--- a/xmrstak/params.hpp
+++ b/xmrstak/params.hpp
@@ -15,7 +15,11 @@ struct params
 	{
 		auto& env = environment::inst();
 		if(env.pParams == nullptr)
-			env.pParams = new params;
+		{
+			std::unique_lock<std::mutex> lck(env.update);
+			if(env.pParams == nullptr)
+				env.pParams = new params;
+		}
 		return *env.pParams;
 	}
 
@@ -25,6 +29,8 @@ struct params
 	bool AMDCache;
 	bool useNVIDIA;
 	bool useCPU;
+	std::string amdGpus;
+	std::string nvidiaGpus;
 	// user selected OpenCL vendor
 	std::string openCLVendor;
 
@@ -50,6 +56,9 @@ struct params
 	std::string configFileNVIDIA;
 	std::string configFileCPU;
 
+	std::string outputFile;
+	int h_print_time = -1;
+
 	bool allowUAC = true;
 	std::string minerArg0;
 	std::string minerArgs;
@@ -73,8 +82,8 @@ struct params
 		rootAMDCacheDir(get_home() + "/.openclcache/"),
 		configFileCPU("cpu.txt"),
 		configFileNVIDIA("nvidia.txt")
-	{}
-
+	{
+	}
 };
 
 } // namespace xmrstak
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index ea3a276aa..eb57a3f04 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -19,19 +19,17 @@ POOLCONF],
 /*
  * Currency to mine. Supported values:
  *
- *    aeon7 (use this for Aeon's new PoW)
  *    bbscoin (automatic switch with block version 3 to cryptonight_v7)
  *    bittube (uses cryptonight_bittube2 algorithm)
- *    freehaven
  *    graft
  *    haven (automatic switch with block version 3 to cryptonight_haven)
- *    intense
+ *    lethean
  *    masari
- *    monero (use this to support Monero's Oct 2018 fork)
  *    qrl - Quantum Resistant Ledger
  *    ryo
  *    turtlecoin
  *    plenteum
+ *    torque
  *    xcash
  *
  * Native algorithms which do not depend on any block versions:
@@ -49,7 +47,7 @@ POOLCONF],
  *    cryptonight_v7
  *    cryptonight_v8
  *    cryptonight_v8_double (used by xcash)
- *    cryptonight_v8_half (used by masari and stellite)
+ *    cryptonight_v8_half (used by masari and torque)
  *    cryptonight_v8_reversewaltz (used by graft)
  *    cryptonight_v8_zelerius
  *    # 4MiB scratchpad memory
diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index c9fa175ac..51c4e4e63 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -2,7 +2,9 @@
 
 //! git will put "#define GIT_ARCHIVE 1" on the next line inside archives. $Format:%n#define GIT_ARCHIVE 1$
 #if defined(GIT_ARCHIVE) && !defined(GIT_COMMIT_HASH)
-#define GIT_COMMIT_HASH $Format:%h$
+// clang-format off
+#define GIT_COMMIT_HASH $Format:%h$
+// clang-format on
 #endif
 
 #ifndef GIT_COMMIT_HASH
@@ -18,7 +20,7 @@
 #endif
 
 #define XMR_STAK_NAME "xmr-stak"
-#define XMR_STAK_VERSION "2.10.2"
+#define XMR_STAK_VERSION "2.10.8"
 
 #if defined(_WIN32)
 #define OS_TYPE "win"
@@ -35,10 +37,10 @@
 #define XMRSTAK_PP_TOSTRING1(str) #str
 #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str)
 
-#define VERSION_LONG  XMR_STAK_NAME "/" XMR_STAK_VERSION "/" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) "/" XMRSTAK_PP_TOSTRING(GIT_BRANCH) "/" OS_TYPE "/" XMRSTAK_PP_TOSTRING(BACKEND_TYPE) "/"
+#define VERSION_LONG XMR_STAK_NAME "/" XMR_STAK_VERSION "/" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) "/" XMRSTAK_PP_TOSTRING(GIT_BRANCH) "/" OS_TYPE "/" XMRSTAK_PP_TOSTRING(BACKEND_TYPE) "/"
 #define VERSION_SHORT XMR_STAK_NAME " " XMR_STAK_VERSION " " XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH)
 #define VERSION_HTML "v" XMR_STAK_VERSION "-" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH)
 
-const char ver_long[]  = VERSION_LONG;
+const char ver_long[] = VERSION_LONG;
 const char ver_short[] = VERSION_SHORT;
 const char ver_html[] = VERSION_HTML;
diff --git a/xmrstak/version.hpp b/xmrstak/version.hpp
index cdf82f30d..85905f01c 100644
--- a/xmrstak/version.hpp
+++ b/xmrstak/version.hpp
@@ -1,8 +1,8 @@
 #pragma once
 
+#include "donate-level.hpp"
 #include <inttypes.h>
 #include <string>
-#include "donate-level.hpp"
 
 extern const char ver_long[];
 extern const char ver_short[];
@@ -10,7 +10,7 @@ extern const char ver_html[];
 
 inline std::string get_version_str()
 {
-	return std::string(ver_long) + std::to_string(uint32_t(fDevDonationLevel * 1000)) ;
+	return std::string(ver_long) + std::to_string(uint32_t(fDevDonationLevel * 1000));
 }
 
 inline std::string get_version_str_short()