diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 000000000..2349ab411 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,16 @@ +version: 2 +jobs: + build: + machine: true + working_directory: /home/circleci/project + + steps: + - checkout + + - run: + name: Build the docker image + command: docker build -t xmr-stak:$CIRCLE_BRANCH /home/circleci/project + + - run: + name: Run a benchmark with Monero V8 + command: docker run --rm -t xmr-stak:$CIRCLE_BRANCH /usr/local/bin/xmr-benchmark.sh \ No newline at end of file diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..25ba43d61 --- /dev/null +++ b/.clang-format @@ -0,0 +1,14 @@ +IndentWidth: 4 +TabWidth: 4 +ColumnLimit: 0 +BreakBeforeBraces: Allman +AllowShortIfStatementsOnASingleLine: false +IndentCaseLabels: false +SpaceBeforeParens: Never +UseTab: Always +AlignAfterOpenBracket: DontAlign +PointerBindsToType: true +BreakConstructorInitializers: AfterColon +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md deleted file mode 100644 index 8451f3289..000000000 --- a/.github/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,30 +0,0 @@ -Please provide as much as possible information to reproduce the issue. - -# Basic information - - Type of the CPU. - - Type of the GPU (if you try to miner with the GPU). - -# Compile issues - - Which OS do you use? - ``` - add **all** commands you used and the **full** compile output here - ``` - ``` - run `cmake -LA .` in the build folder and add the output here - ``` - -# Issue with the execution - - Do you compiled the miner by our own? - ``` - run `./xmr-stak --version-long` and add the output here - ``` - -# AMD OpenCl issue - - ``` - run `clinfo` and add the output here - ``` - -# Stability issue - - Is the CPU or GPU overclocked? - - Is the Main memory of the CPU or GPU undervolted? 
diff --git a/.github/ISSUE_TEMPLATE/compile_bug_report.md b/.github/ISSUE_TEMPLATE/compile_bug_report.md new file mode 100644 index 000000000..899ad941f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/compile_bug_report.md @@ -0,0 +1,35 @@ +--- +name: Compile bug report +about: You have an issue compiling xmr-stak. + +--- + +`...` is the placeholder for your answers. Please answer each question! + + +**Describe the bug** +A clear and concise description of what the bug is. + +**Which operating system do you use?** + +``` +... +``` + +**To Reproduce** +``` +# Please post all commands and the output. +... +``` + +**Additional information.** + +``` +# run `cmake -LA .` in the build folder and add the output here +... +``` + +**Feel free to add more information.** +``` +... +``` diff --git a/.github/ISSUE_TEMPLATE/execution_bug_report.md b/.github/ISSUE_TEMPLATE/execution_bug_report.md new file mode 100644 index 000000000..44ac89bf1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/execution_bug_report.md @@ -0,0 +1,7 @@ +--- +name: Execution bug report +about: You have an issue executing xmr-stak. + +--- + +**Most execution issues are caused by driver problems. Please use the [xmr-stak sub-reddit](https://www.reddit.com/r/XmrStak/) to ask for help instead of opening an issue here.** diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000..90f5e4f3d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,7 @@ +--- +name: Feature request +about: Suggest an idea for xmr-stak. + +--- + +**Please explain the feature as well as possible.** diff --git a/.github/ISSUE_TEMPLATE/tuning_help.md b/.github/ISSUE_TEMPLATE/tuning_help.md new file mode 100644 index 000000000..40dedef05 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/tuning_help.md @@ -0,0 +1,7 @@ +--- +name: Need help for optimization. +about: You need help to optimize your setup. 
+ +--- + +**Please use the [xmr-stak sub-reddit](https://www.reddit.com/r/XmrStak/) to discuss optimizations.** diff --git a/CMakeLists.txt b/CMakeLists.txt index 41e993eee..795829e66 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,13 +44,13 @@ endif() set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "${BUILD_TYPE}") set(XMR-STAK_COMPILE "native" CACHE STRING "select CPU compute architecture") -set_property(CACHE XMR-STAK_COMPILE PROPERTY STRINGS "native;generic") +set_property(CACHE XMR-STAK_COMPILE PROPERTY STRINGS "native;generic;dev_release") if(XMR-STAK_COMPILE STREQUAL "native") if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC") set(CMAKE_CXX_FLAGS "-march=native -mtune=native ${CMAKE_CXX_FLAGS}") set(CMAKE_C_FLAGS "-march=native -mtune=native ${CMAKE_C_FLAGS}") endif() -elseif(XMR-STAK_COMPILE STREQUAL "generic") +elseif(XMR-STAK_COMPILE STREQUAL "generic" OR XMR-STAK_COMPILE STREQUAL "dev_release") add_definitions("-DCONF_ENFORCE_OpenCL_1_2=1") else() message(FATAL_ERROR "XMR-STAK_COMPILE is set to an unknown value '${XMR-STAK_COMPILE}'") @@ -496,6 +496,10 @@ if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") set(CMAKE_C_FLAGS "-Wl,-z,noexecstack ${CMAKE_C_FLAGS}") endif() +if(XMR-STAK_COMPILE STREQUAL "dev_release") + add_definitions(-DXMRSTAK_DEV_RELEASE) +endif() + # activate static libgcc and libstdc++ linking if(CMAKE_LINK_STATIC) set(BUILD_SHARED_LIBRARIES OFF) @@ -586,7 +590,16 @@ if(CUDA_FOUND) ) endif() - set(CUDA_LIBRARIES ${CUDA_LIB} ${CUDA_NVRTC_LIB} ${CUDA_LIBRARIES}) + set(CUDA_LIBRARIES ${CUDA_LIB} ${CUDA_LIBRARIES}) + if(XMR-STAK_COMPILE STREQUAL "dev_release") + # do not link nvrtc for linux binaries, cn-r will be disabled + if(WIN32) + set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB}) + endif() + else() + set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB}) + endif() + target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES}) target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm) endif() diff --git 
a/README.md b/README.md index c890da1a5..2e2eb61fa 100644 --- a/README.md +++ b/README.md @@ -1,102 +1,10 @@ -###### fireice-uk's and psychocrypt's -# XMR-Stak - Cryptonight All-in-One Mining Software - -XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins. - -## HTML reports - - -## Video setup guide on Windows - -[](https://youtu.be/YNMa8NplWus) -###### Video by Crypto Sewer - -## Overview -* [Features](#features) -* [Supported altcoins](#supported-altcoins) -* [Download](#download) -* [Usage](doc/usage.md) -* [HowTo Compile](doc/compile.md) -* [FAQ](doc/FAQ.md) -* [Developer Donation](#default-developer-donation) -* [Developer PGP Key's](doc/pgp_keys.md) - -## Features - -- support all common backends (CPU/x86, AMD-GPU and NVIDIA-GPU) -- support all common OS (Linux, Windows and macOS) -- supports algorithm cryptonight for Monero (XMR) and cryptonight-light (AEON) -- easy to use - - guided start (no need to edit a config file for the first start) - - auto-configuration for each backend -- open source software (GPLv3) -- TLS support -- [HTML statistics](doc/usage.md#html-and-json-api-report-configuraton) -- [JSON API for monitoring](doc/usage.md#html-and-json-api-report-configuraton) - -## Supported altcoins - -Besides [Monero](https://getmonero.org), following coins can be mined using this miner: - -- [Aeon](http://www.aeon.cash) -- [BBSCoin](https://www.bbscoin.xyz) -- [BitTube](https://coin.bit.tube/) -- [Conceal](https://conceal.network) -- [Graft](https://www.graft.network) -- [Haven](https://havenprotocol.com) -- [Lethean](https://lethean.io) -- [Masari](https://getmasari.org) -- [Plenteum](https://www.plenteum.com/) -- [QRL](https://theqrl.org) -- **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo** -- [Stellite](https://stellite.cash/) -- [TurtleCoin](https://turtlecoin.lol) -- 
[Zelerius](https://zelerius.org/) -- [X-CASH](https://x-network.io/) - -Ryo currency is a way for us to implement the ideas that we were unable to in -Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details. - -If your prefered coin is not listed, you can choose one of the following algorithms: -- 256Kib scratchpad memory - - cryptonight_turtle -- 1MiB scratchpad memory - - cryptonight_lite - - cryptonight_lite_v7 - - cryptonight_lite_v7_xor (algorithm used by ipbc) -- 2MiB scratchpad memory - - cryptonight - - cryptonight_gpu (for Ryo's 14th of Feb fork) - - cryptonight_masari (used in 2018) - - cryptonight_v7 - - cryptonight_v7_stellite - - cryptonight_v8 - - cryptonight_v8_double (used by X-CASH) - - cryptonight_v8_half (used by masari and stellite) - - cryptonight_v8_reversewaltz (used by graft) - - cryptonight_v8_zelerius -- 4MiB scratchpad memory - - cryptonight_haven - - cryptonight_heavy - -Please note, this list is not complete and is not an endorsement. - -## Download - -You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/fireice-uk/xmr-stak/releases). - -## Default Developer Donation - -By default, the miner will donate 2% of the hashpower (2 minutes in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries. - -If you want to donate directly to support further development, here is my wallet - -fireice-uk: -``` -4581HhZkQHgZrZjKeCfCJxZff9E3xCgHGF25zABZz7oR71TnbbgiS7sK9jveE6Dx6uMs2LwszDuvQJgRZQotdpHt1fTdDhk -``` - -psychocrypt: -``` -45tcqnJMgd3VqeTznNotiNj4G9PQoK67TGRiHyj6EYSZ31NUbAfs9XdiU5squmZb717iHJLxZv3KfEw8jCYGL5wa19yrVCn -``` + + + +

+

+ + + + +
\ No newline at end of file diff --git a/doc/FAQ.md b/doc/FAQ.md index f744e3d24..b78ac15cb 100644 --- a/doc/FAQ.md +++ b/doc/FAQ.md @@ -1,104 +1,27 @@ # FAQ +To improve our support we created [Xmr-Stak forum](https://www.reddit.com/r/XmrStak). Check it out if you have a problem, or you are looking for most up to date config for your card and [guides](https://www.reddit.com/r/XmrStak/wiki/index). + ## Content Overview -* ["Obtaining SeLockMemoryPrivilege failed."](#obtaining-selockmemoryprivilege-failed) -* [VirtualAlloc failed](#virtualalloc-failed) -* [Error msvcp140.dll and vcruntime140.dll not available](#error-msvcp140dll-and-vcruntime140dll-not-available) -* [Error: MEMORY ALLOC FAILED: mmap failed](#error-memory-alloc-failed-mmap-failed) -* [Illegal instruction (core dumped)](#illegal-instruction) * [Virus Protection Alert](#virus-protection-alert) * [Change Currency to Mine](#change-currency-to-mine) * [How can I mine Monero](#how-can-i-mine-monero) * [Which currency must be chosen if my fork coin is not listed](#which-currency-must-be-chosen-if-my-fork-coin-is-not-listed) -* [Internal compiler error: Killed (program cc1plus)](#internal-compiler-error) - -## "Obtaining SeLockMemoryPrivilege failed." - -For professional versions of Windows see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx). -Make sure to reboot afterwards! - -For Windows 7/10 Home: - -1) Download and install [Windows Server 2003 Resource Kit Tools](https://www.microsoft.com/en-us/download/details.aspx?id=17657). Ignore any incompatibility warning during installation. - -2) Open cmd or PowerShell as an administrator. - -3) Use `ntrights -u %USERNAME% +r SeLockMemoryPrivilege` where %USERNAME% is the user that will be running the program. - -4) Reboot. 
- -Reference: http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid259791 - -*Warning: Do not download ntrights.exe from any other site other than the offical Microsoft download page.* - -## VirtualAlloc failed - -If you set up the user rights properly ([see above](https://github.com/fireice-uk/xmr-stak/blob/master/doc/FAQ.md#selockmemoryprivilege-failed)), and your system has 4-8GB of RAM (50%+ use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation. - -If that happens, disable all auto-starting applications and run the miner after a reboot. - -## Error msvcp140.dll and vcruntime140.dll not available - -Download and install this [runtime package](https://go.microsoft.com/fwlink/?LinkId=746572) from Microsoft. *Warning: Do NOT use "missing dll" sites - dll's are exe files with another name, and it is a fairly safe bet that any dll on a shady site like that will be trojaned. Please download offical runtimes from Microsoft above.* - - -## Error: MEMORY ALLOC FAILED: mmap failed - -On Linux you will need to configure large page support and increase your memlock limit (`ulimit -l`). - -Never put settings directly into `/etc/sysctl.conf` or `/etc/security/limits.conf` as those are system defaults and can be replaced in upgrades, and custom settings in that file are deprecated in all distros since at least wheezy/trusty (has been illegal in RedHat based distros for longer than that), and will be even more deprecated with systemd (it no longer even reads sysctl.conf, ONLY sysctl.d files, for example - there is a link to the old `/etc/sysctl.conf` for backward compatibility but that can go away at any time). Also adding to `/etc/rc.local` is extra incorrect, systemd does not even use that file anymore (once the sysvinit compatibility layer is gone, rc.local will no longer work). 
- -To check current settings, run `/sbin/sysctl vm.nr_hugepages ; ulimit -l` as whatever user you will run `xmr-stak` as (example shows bad/low sample defaults): - - $ /sbin/sysctl vm.nr_hugepages ; ulimit -l - vm.nr_hugepages = 0 - 16 - -To set large page support, add the following lines to `/etc/sysctl.d/60-hugepages.conf`: - - vm.nr_hugepages=128 - -You WILL need to run `sudo sysctl --system` for these settings to take effect on your system (or reboot). In some cases (many threads, very large CPU, etc) you may need more than 128 (try 256 if there are still complaints from thread inits) - -To increase the memlock (ulimit -l), add following lines to `/etc/security/limits.d/60-memlock.conf`: - - * - memlock 262144 - root - memlock 262144 - -You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session). -Recheck after completing these steps to validate: - - $ /sbin/sysctl vm.nr_hugepages ; ulimit -l - vm.nr_hugepages = 128 - 262144 - -You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons. Also running as root does not properly get around the `ulimit -l` being large enough (and limits `*` does not apply to `root` either, it must be specified explicitly). - -## Illegal Instruction - -This typically means you are trying to run it on a CPU that does not have [AES](https://en.wikipedia.org/wiki/AES_instruction_set). This only happens on older version of miner, new version gives better error message (but still wont' work since your CPU doesn't support the required instructions). - -## Virus Protection Alert +### Virus Protection Alert Some virus protection software flags the miner binary as *malware*. This is a false positive — the software does not contain any malware (and since it is open source, you can verify that yourself!) If your antivirus software flags **xmr-stak**, it will likely move it to its quarantine area. 
You may have to whitelist **xmr-stak** in your antivirus. -## Change Currency to Mine - +### Change Currency to Mine If the miner is compiled for Monero and Aeon than you can change - the value `currency` in the config *or* - start the miner with the [command line option](usage.md) `--currency monero` or `--currency aeon7` - run `xmr-stak --help` to see all supported currencies and algorithms -## How can I mine Monero - +### How can I mine Monero Set the value `currency` in `pools.txt` to `monero`. -## Which currency must be chosen if my fork coin is not listed - +### Which currency must be chosen if my fork coin is not listed If your coin you want to mine is not listed please check the documentation of the coin and try to find out if `cryptonight` or `cryptonight-lite` is the used algorithm. Select one of these generic coin algorithms. -## Internal compiler error - -Seeing `g++: internal compiler error: Killed (program cc1plus)` is probably related to not enough RAM to compile. 1 Gb RAM should be enough (it is on clean Ubuntu 16.04). diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 000000000..7a1f13288 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,132 @@ + + + + + + + +
+ + + + + + + + + +
+ + + + + + + + + +
+ +## Introduction +XMR-Stak is a universal open source stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used for mining various crypto currencies: Ryo, Graft, Bittube, Conceal, Haven and many more Cryptonight coins. + +## Features overview +[](#) + +## Supported coins and algorithms +Xmr-Stak supports various variants of the Cryptonight algorithm. Use one of the following options (type the coin alias either in the `pools.txt` config file or in the startup configuration under the `"currency"` parameter, and the miner will pick its variant of the Cryptonight algorithm for mining): + +| | | | +| --- | --- | --- | +| [BitTube](https://coin.bit.tube/) | [Plenteum](https://www.plenteum.com/) | | +| [Conceal](https://conceal.network) | [QRL](https://theqrl.org) | | +| [Graft](https://www.graft.network) | [Ryo](https://ryo-currency.com) | **Atom Wallet Solo mining mode is sponsored by [RYO](https://ryo-currency.com/)** | +| [Haven](https://havenprotocol.com) | [X-CASH](https://x-network.io/) | | +| [Lethean](https://lethean.io) | [Zelerius](https://zelerius.org/) | | +| [Masari](https://getmasari.org) | | | + + +**[Ryo Currency](https://ryo-currency.com)** - is a way for us to implement the ideas that we were unable to in +Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details. 
+ +If your preferred coin is not listed, you can choose one of the following mining algorithms: + +| 256 KiB scratchpad memory | 1 MiB scratchpad memory | 2 MiB scratchpad memory | 4 MiB scratchpad memory | +| --- | --- | --- | --- | +| cryptonight_turtle | cryptonight_lite | cryptonight | cryptonight_bittube2 | +| --- | cryptonight_lite_v7 | cryptonight_gpu | cryptonight_haven | +| --- | --- | cryptonight_conceal | cryptonight_heavy | +| --- | --- | cryptonight_r | --- | +| --- | --- | cryptonight_masari (used in 2018) | --- | +| --- | --- | cryptonight_v8_reversewaltz | --- | +| --- | --- | cryptonight_v7 | --- | +| --- | --- | cryptonight_v8 | --- | +| --- | --- | cryptonight_v8_half (used by masari) | --- | +| --- | --- | cryptonight_v8_double (used by X-CASH) | --- | +| --- | --- | cryptonight_v8_zelerius | --- | + +Please note, this list is not complete and is not an endorsement. + + +## Get Miner +Please note that code is developed on the [dev branch](https://github.com/fireice-uk/xmr-stak/commits/dev), if you want to check out the latest updates, before they are merged on main branch, please refer there. Master branch will always point to a version that we consider stable, so you can download the code by simply typing `git clone https://github.com/fireice-uk/xmr-stak.git` + +Also you can find the latest releases and precompiled binaries on GitHub under [releases](https://github.com/fireice-uk/xmr-stak/releases/latest) section. + +If you want to compile the miner from source files, navigate to ["how to compile"](compile/compile.md) section of docs or [xmr-stak forum](https://www.reddit.com/r/XmrStak/wiki/guides/startup) where you will find the latest step-by-step instructions. + + +## Start Mining +Miner has 2 ways of initial configuring: simple and advanced. The simple method will prompt user with minimum information. 
Required answers are y , (or yes), n , (or no): + +#### Simple setup: +* `Use simple setup method?` y +* `Please enter the currency that you want to mine:` Enter currency or mining algorithm +* `Enter pool address (pool address:port):` Enter pool connection address:port +* `Username (wallet address or pool login):` Enter wallet address +* `Password (mostly empty or x):` press Enter +* `Does this pool port support TLS/SSL? Use no if unknown. (y/N):` press y or n + +#### Advanced setup: +* `Use simple setup method?` n +* `Do you want to use the HTTP interface? Unlike the screen display, browser interface is not affected by the GPU lag. If you don't want to use it, please enter 0, otherwise enter port number that the miner should listen on` 5656 +* `Please enter the currency that you want to mine:` Enter currency or mining algorithm +* `Enter pool address (pool address:port):` Enter pool connection address:port +* `Username (wallet address or pool login):` Enter wallet address +* `Password (mostly empty or x):` press Enter +* `Rig identifier for pool-side statistics (needs pool support). Can be empty:` Enter rig name or press Enter +* `Does this pool port support TLS/SSL? Use no if unknown. (y/N)` Enter y or n +* `Do you want to use nicehash on this pool? (y/N)` n +* `Do you want to use multiple pools? (y/N)` Enter y if you want to set up a backup pool, or n + + +## Additional Guides and Feedback +[](https://www.youtube.com/c/xmrstak) +###### Video by Crypto Sewer + +To improve our support we created [Xmr-Stak forum](https://www.reddit.com/r/XmrStak). Check it out if you have a problem, or you are looking for most up to date config for your card and [guides](https://www.reddit.com/r/XmrStak/wiki/index). + + + + + + + + + +
+ +## Default Developer Donation +By default, the miner will donate 2% of the hashpower (2 minutes in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](../xmrstak/donate-level.hpp) before you build the binaries. + +If you want to donate directly to support further development, here is my wallet + +fireice-uk: +``` +4581HhZkQHgZrZjKeCfCJxZff9E3xCgHGF25zABZz7oR71TnbbgiS7sK9jveE6Dx6uMs2LwszDuvQJgRZQotdpHt1fTdDhk +``` + +psychocrypt: +``` +45tcqnJMgd3VqeTznNotiNj4G9PQoK67TGRiHyj6EYSZ31NUbAfs9XdiU5squmZb717iHJLxZv3KfEw8jCYGL5wa19yrVCn +``` \ No newline at end of file diff --git a/doc/_img/2ragerx-btn.png b/doc/_img/2ragerx-btn.png new file mode 100644 index 000000000..1c0edd98c Binary files /dev/null and b/doc/_img/2ragerx-btn.png differ diff --git a/doc/_img/2xmr-stak-btn.png b/doc/_img/2xmr-stak-btn.png new file mode 100644 index 000000000..7626e27c1 Binary files /dev/null and b/doc/_img/2xmr-stak-btn.png differ diff --git a/doc/_img/YT.png b/doc/_img/YT.png new file mode 100644 index 000000000..cf7a869a2 Binary files /dev/null and b/doc/_img/YT.png differ diff --git a/doc/_img/cpu.png b/doc/_img/cpu.png new file mode 100644 index 000000000..6a370fbc9 Binary files /dev/null and b/doc/_img/cpu.png differ diff --git a/doc/_img/faq-green.png b/doc/_img/faq-green.png new file mode 100644 index 000000000..440a855b2 Binary files /dev/null and b/doc/_img/faq-green.png differ diff --git a/doc/_img/faq.png b/doc/_img/faq.png new file mode 100644 index 000000000..83167e3c7 Binary files /dev/null and b/doc/_img/faq.png differ diff --git a/doc/_img/features-xmr-stak.png b/doc/_img/features-xmr-stak.png new file mode 100644 index 000000000..ef75a3b14 Binary files /dev/null and b/doc/_img/features-xmr-stak.png differ diff --git a/doc/_img/features.png b/doc/_img/features.png new file mode 100644 index 000000000..37c877291 Binary files /dev/null and b/doc/_img/features.png differ diff --git a/doc/_img/fee.png b/doc/_img/fee.png new file mode 100644 index 
000000000..cd3cdaf00 Binary files /dev/null and b/doc/_img/fee.png differ diff --git a/doc/_img/fine-tuning-green.png b/doc/_img/fine-tuning-green.png new file mode 100644 index 000000000..b58184bfa Binary files /dev/null and b/doc/_img/fine-tuning-green.png differ diff --git a/doc/_img/fine-tuning.png b/doc/_img/fine-tuning.png new file mode 100644 index 000000000..6b817cffe Binary files /dev/null and b/doc/_img/fine-tuning.png differ diff --git a/doc/_img/gpu.png b/doc/_img/gpu.png new file mode 100644 index 000000000..4d5578007 Binary files /dev/null and b/doc/_img/gpu.png differ diff --git a/doc/_img/header.png b/doc/_img/header.png new file mode 100644 index 000000000..8c9eeefad Binary files /dev/null and b/doc/_img/header.png differ diff --git a/doc/_img/how-to-compile-green.png b/doc/_img/how-to-compile-green.png new file mode 100644 index 000000000..e82c8b693 Binary files /dev/null and b/doc/_img/how-to-compile-green.png differ diff --git a/doc/_img/how-to-compile.png b/doc/_img/how-to-compile.png new file mode 100644 index 000000000..a54603484 Binary files /dev/null and b/doc/_img/how-to-compile.png differ diff --git a/doc/_img/html_reports.png b/doc/_img/html_reports.png new file mode 100644 index 000000000..2d17bc1bf Binary files /dev/null and b/doc/_img/html_reports.png differ diff --git a/doc/img/interleave.png b/doc/_img/interleave.png similarity index 100% rename from doc/img/interleave.png rename to doc/_img/interleave.png diff --git a/doc/_img/menu-donations-green.png b/doc/_img/menu-donations-green.png new file mode 100644 index 000000000..a299980d3 Binary files /dev/null and b/doc/_img/menu-donations-green.png differ diff --git a/doc/_img/menu-donations.png b/doc/_img/menu-donations.png new file mode 100644 index 000000000..f73facf6f Binary files /dev/null and b/doc/_img/menu-donations.png differ diff --git a/doc/_img/menu-features-green.png b/doc/_img/menu-features-green.png new file mode 100644 index 000000000..527d68d4c Binary files /dev/null 
and b/doc/_img/menu-features-green.png differ diff --git a/doc/_img/menu-features.png b/doc/_img/menu-features.png new file mode 100644 index 000000000..bcf71064d Binary files /dev/null and b/doc/_img/menu-features.png differ diff --git a/doc/_img/menu-get-miner-green.png b/doc/_img/menu-get-miner-green.png new file mode 100644 index 000000000..9e3bd5753 Binary files /dev/null and b/doc/_img/menu-get-miner-green.png differ diff --git a/doc/_img/menu-get-miner.png b/doc/_img/menu-get-miner.png new file mode 100644 index 000000000..891a35f16 Binary files /dev/null and b/doc/_img/menu-get-miner.png differ diff --git a/doc/_img/menu-support-green.png b/doc/_img/menu-support-green.png new file mode 100644 index 000000000..3db8e76ef Binary files /dev/null and b/doc/_img/menu-support-green.png differ diff --git a/doc/_img/menu-support.png b/doc/_img/menu-support.png new file mode 100644 index 000000000..5cd80e42f Binary files /dev/null and b/doc/_img/menu-support.png differ diff --git a/doc/_img/menu-supported-coins-green.png b/doc/_img/menu-supported-coins-green.png new file mode 100644 index 000000000..8678ea444 Binary files /dev/null and b/doc/_img/menu-supported-coins-green.png differ diff --git a/doc/_img/menu-supported-coins.png b/doc/_img/menu-supported-coins.png new file mode 100644 index 000000000..aabc37283 Binary files /dev/null and b/doc/_img/menu-supported-coins.png differ diff --git a/doc/_img/ragerx-btn.png b/doc/_img/ragerx-btn.png new file mode 100644 index 000000000..d08e245fc Binary files /dev/null and b/doc/_img/ragerx-btn.png differ diff --git a/doc/_img/ragerx.png b/doc/_img/ragerx.png new file mode 100644 index 000000000..bc2453d2a Binary files /dev/null and b/doc/_img/ragerx.png differ diff --git a/doc/_img/rx.png b/doc/_img/rx.png new file mode 100644 index 000000000..d9c4c3dfa Binary files /dev/null and b/doc/_img/rx.png differ diff --git a/doc/_img/split.png b/doc/_img/split.png new file mode 100644 index 000000000..11a8635b9 Binary files 
/dev/null and b/doc/_img/split.png differ diff --git a/doc/_img/stak-yt-cover.jpg b/doc/_img/stak-yt-cover.jpg new file mode 100644 index 000000000..ff21acebf Binary files /dev/null and b/doc/_img/stak-yt-cover.jpg differ diff --git a/doc/_img/troubleshooting-green.png b/doc/_img/troubleshooting-green.png new file mode 100644 index 000000000..d36cec8b8 Binary files /dev/null and b/doc/_img/troubleshooting-green.png differ diff --git a/doc/_img/troubleshooting.png b/doc/_img/troubleshooting.png new file mode 100644 index 000000000..e57eda740 Binary files /dev/null and b/doc/_img/troubleshooting.png differ diff --git a/doc/_img/usage-green.png b/doc/_img/usage-green.png new file mode 100644 index 000000000..c60b9a432 Binary files /dev/null and b/doc/_img/usage-green.png differ diff --git a/doc/_img/usage.png b/doc/_img/usage.png new file mode 100644 index 000000000..d9421ba66 Binary files /dev/null and b/doc/_img/usage.png differ diff --git a/doc/_img/xmr-stak-btn-active.png b/doc/_img/xmr-stak-btn-active.png new file mode 100644 index 000000000..68520be91 Binary files /dev/null and b/doc/_img/xmr-stak-btn-active.png differ diff --git a/doc/_img/xmr-stak-btn.png b/doc/_img/xmr-stak-btn.png new file mode 100644 index 000000000..0356f41aa Binary files /dev/null and b/doc/_img/xmr-stak-btn.png differ diff --git a/doc/_img/xmr-stak-cpu-connection.png b/doc/_img/xmr-stak-cpu-connection.png new file mode 100644 index 000000000..d07a8d0a9 Binary files /dev/null and b/doc/_img/xmr-stak-cpu-connection.png differ diff --git a/doc/_img/xmr-stak-cpu-hashrate.png b/doc/_img/xmr-stak-cpu-hashrate.png new file mode 100644 index 000000000..488a34825 Binary files /dev/null and b/doc/_img/xmr-stak-cpu-hashrate.png differ diff --git a/doc/_img/xmr-stak-cpu-results.png b/doc/_img/xmr-stak-cpu-results.png new file mode 100644 index 000000000..7244f9579 Binary files /dev/null and b/doc/_img/xmr-stak-cpu-results.png differ diff --git a/doc/_img/xmr-stak-rx-btn-inactive.png 
b/doc/_img/xmr-stak-rx-btn-inactive.png new file mode 100644 index 000000000..1644a9505 Binary files /dev/null and b/doc/_img/xmr-stak-rx-btn-inactive.png differ diff --git a/doc/_img/xmr-stak-rx-btn.png b/doc/_img/xmr-stak-rx-btn.png new file mode 100644 index 000000000..39f0c87f7 Binary files /dev/null and b/doc/_img/xmr-stak-rx-btn.png differ diff --git a/doc/_img/xmrig.png b/doc/_img/xmrig.png new file mode 100644 index 000000000..cdeaa4501 Binary files /dev/null and b/doc/_img/xmrig.png differ diff --git a/doc/compile.md b/doc/compile/compile.md similarity index 100% rename from doc/compile.md rename to doc/compile/compile.md diff --git a/doc/compile_FreeBSD.md b/doc/compile/compile_FreeBSD.md similarity index 100% rename from doc/compile_FreeBSD.md rename to doc/compile/compile_FreeBSD.md diff --git a/doc/compile_Linux.md b/doc/compile/compile_Linux.md similarity index 100% rename from doc/compile_Linux.md rename to doc/compile/compile_Linux.md diff --git a/doc/compile_Windows.md b/doc/compile/compile_Windows.md similarity index 92% rename from doc/compile_Windows.md rename to doc/compile/compile_Windows.md index 64d68bab1..37925576a 100644 --- a/doc/compile_Windows.md +++ b/doc/compile/compile_Windows.md @@ -111,6 +111,15 @@ Do not follow old information that you need the AMD APP SDK. AMD has removed the cd bin\Release + copy C:\xmr-stak-dep\openssl\bin\* . + ``` +- To exclude some of the dependencies, use the commands below, which set the corresponding `*_ENABLE` options to OFF + ``` + cmake -G "Visual Studio 15 2017 Win64" -T v141,host=x64 -DCMAKE_BUILD_TYPE=Release -DMICROHTTPD_ENABLE=OFF -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=OFF .. + cmake --build . --config Release --target clean + cmake --build . --config Release --target install + cd bin\Release + copy C:\xmr-stak-dep\openssl\bin\* . ``` - Miner is by default compiled for NVIDIA GPUs (if CUDA is installed), AMD GPUs (if the AMD OCL-SDK_light is installed) and CPUs. 
diff --git a/doc/compile_macOS.md b/doc/compile/compile_macOS.md similarity index 100% rename from doc/compile_macOS.md rename to doc/compile/compile_macOS.md diff --git a/doc/troubleshooting.md b/doc/troubleshooting.md new file mode 100644 index 000000000..fb0dc88ce --- /dev/null +++ b/doc/troubleshooting.md @@ -0,0 +1,119 @@ +# Troubleshooting +To improve our support we created [Xmr-Stak forum](https://www.reddit.com/r/XmrStak). Check it out if you have a problem, or you are looking for most up to date config for your card and [guides](https://www.reddit.com/r/XmrStak/wiki/index). + + +### 1. CL_MEM_OBJECT_ALLOCATION_FAILURE when calling clEnqueue +This error means that the GPU can't allocate the amount of memory requested by your config. There are several known solutions to this problem: + +* Check if you accidentally use too many threads per GPU (check the *index* value in amd.txt) +* You set too high an `intensity` value in amd.txt - try to reduce it to a lower value (a multiple of `worksize`) +* If you are using Windows - you may not have enough virtual memory in the system. Add virtual memory (don't be afraid if it goes up to 60gb per 6 GPU rig) + + + +### 2. GPU is not detected +Check if you have antivirus software turned on. If yes - it could delete some .dll files (for example xmrstak\_cuda\_backend\_cuda10\_0.dll) + + + +### 3. Illegal Instruction +This typically means you are trying to run it on a CPU that does not have [AES](https://en.wikipedia.org/wiki/AES_instruction_set). This only happens with older versions of the miner; newer versions give a better error message (but still won't work since your CPU doesn't support the required instructions). + + + +### 4. Internal compiler error +Seeing `g++: internal compiler error: Killed (program cc1plus)` is probably related to not enough RAM to compile. 1 Gb RAM should be enough (on clean Ubuntu 16.04). + + + +### 5. 
Invalid Result GPU ID +This error can be caused by several reasons, here is most common, known successful practices how to fix it: + +* **Hardware problem: overclock/overvoltage/undervoltage** \- try to use stock clocks and voltages. +* **Software problem: drivers** \- try to change driver versions (for AMD gpu most commonly stable versions are: blockchain drivers or 18.6.1) +* **Miner misconfiguration** \- try to reduce `intensity` (if AMD) or `threads` or `bfactor` (if NVIDIA) in config file. + +If you still receive these errors, [report please the issue](https://github.com/fireice-uk/xmr-stak/issues). + + +### 6. IP is banned +Pool has banned your IP, This can be caused by several reasons: + +* You selected wrong pool port or the static diff is too low. (Learn more about [pool ports and diff](https://www.reddit.com/r/XmrStak/wiki/guides/other-questions#wiki_1._pool_ports_and_difficulty)) +* You had too many [invalid shares \[8\]](https://www.reddit.com/r/XmrStak/wiki/troubleshooting#wiki_8._invalid_result_gpu_id) + + + +### 7. MEMORY ALLOC FAILED: mmap failed +On Linux you will need to configure large page support and increase your memlock limit (`ulimit -l`). + +Never put settings directly into `/etc/sysctl.conf` or `/etc/security/limits.conf` as those are system defaults and can be replaced in upgrades, and custom settings in that file are deprecated in all distros since at least wheezy/trusty (has been illegal in RedHat based distros for longer than that), and will be even more deprecated with systemd (it no longer even reads sysctl.conf, ONLY sysctl.d files, for example - there is a link to the old `/etc/sysctl.conf` for backward compatibility but that can go away at any time). Also adding to `/etc/rc.local` is extra incorrect, systemd does not even use that file anymore (once the sysvinit compatibility layer is gone, rc.local will no longer work). 
To check current settings, run `/sbin/sysctl vm.nr_hugepages ; ulimit -l` as whatever user you will run xmr-stak as (example shows bad/low sample defaults): + + $ /sbin/sysctl vm.nr_hugepages ; ulimit -l vm.nr_hugepages = 0 16 + +To set large page support, add the following lines to `/etc/sysctl.d/60-hugepages.conf`: + + vm.nr_hugepages=128 + +You WILL need to run `sudo sysctl --system` for these settings to take effect on your system (or reboot). In some cases (many threads, very large CPU, etc) you may need more than 128 (try 256 if there are still complaints from thread inits) + +To increase the memlock (`ulimit -l`), add the following lines to `/etc/security/limits.d/60-memlock.conf`: + + * - memlock 262144 root - memlock 262144 + +You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session). Recheck after completing these steps to validate: + + $ /sbin/sysctl vm.nr_hugepages ; ulimit -l vm.nr_hugepages = 128 262144 + +You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons. Also running as root does not properly get around the `ulimit -l` being large enough (and limits `*` does not apply to `root` either, it must be specified explicitly). + + +### 8. msvcp140.dll and vcruntime140.dll are not available +Download and install this [runtime package](https://go.microsoft.com/fwlink/?LinkId=746572) from Microsoft. + +>***Warning***\*: Do NOT use "missing dll" sites - dll's are exe files with another name, and it is a fairly safe bet that any dll on a shady site like that will be trojaned. Please download official runtimes from Microsoft above.\* + + + +### 9. Obtaining SeLockMemoryPrivilege failed. +For professional versions of Windows see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx). Make sure to reboot afterwards! + +**For Windows 7/10 Home:** + +1.
Download and install [Windows Server 2003 Resource Kit Tools](https://www.microsoft.com/en-us/download/details.aspx?id=17657). Ignore any incompatibility warning during installation. +2. Open cmd or PowerShell as an administrator. +3. Use `ntrights -u %USERNAME% +r SeLockMemoryPrivilege` where `%USERNAME%` is the user that will be running the program. +4. Reboot. + +Reference: [http://rybkaforum.net/cgi-bin/rybkaforum/topic\_show.pl?pid=259791#pid259791](http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid259791) + +*Warning: Do not download ntrights.exe from any site other than the official Microsoft download page.* + + +### 10. Share rejected - Low diff share +Check if the coin that you are mining has changed its algorithm in one of its forks and that you use the right hashing algorithm in pools.txt (parameter: `currency`). + + + +### 11. VirtualAlloc failed +If you set up the user rights properly ([see issue #7](https://www.reddit.com/r/XmrStak/wiki/troubleshooting#wiki_7._memory_alloc_failed.3A_mmap_failed)), and your system has 4-8GB of RAM (and 50%+ is in use), there is a significant chance that there simply won't be a large enough chunk of contiguous memory because Windows is fairly bad at mitigating memory fragmentation. + +If that happens, disable all auto-starting applications and run the miner after a reboot. + + +### 12. (Ubuntu compiling) - Nvidia insufficient driver +If you have this error after compiling xmr-stak in Ubuntu - make sure you have the latest drivers and not X.org.X Nouveau or v390. Install them manually or with [cuda package](https://www.reddit.com/r/XmrStak/wiki/guides/startup#wiki_2._ubuntu_18.10_setup_.2B_nvidia_.28compiling_from_source.29) + + + +### 13.
(Ubuntu compiling) - Could NOT find OpenCL (missing: OpenCL_LIBRARY OpenCL_INCLUDE_DIR) Cmake error at CmakeLists.txt +When [compiling in Ubuntu with Nvidia](https://www.reddit.com/r/XmrStak/wiki/guides/startup#wiki_2._ubuntu_18.10_setup_.2B_nvidia_.28compiling_from_source.29) devices, and running `cmake ..` command add additional param that disables OpenCL: `cmake .. -DOpenCL_ENABLE=OFF` + + + +### 14. (Ubuntu compiling) - gcc v8 is not supported +Cuda 10 ships with gcc and g++ ver.8 which is not supported. Make sure you [set gcc and g++ to v6](https://www.reddit.com/r/XmrStak/wiki/guides/startup#wiki_2.2_compiling) before compiling. (step 2.2.6) + + + + diff --git a/doc/tuning.md b/doc/tuning.md index 6d07d4ddc..a504b85ef 100644 --- a/doc/tuning.md +++ b/doc/tuning.md @@ -3,41 +3,59 @@ ## Content Overview * [Benchmark](#benchmark) * [Windows](#windows) +* [Managing GPUs](#managing-GPUs) * [NVIDIA Backend](#nvidia-backend) * [Choose Value for `threads` and `blocks`](#choose-value-for-threads-and-blocks) * [Add more GPUs](#add-more-gpus) * [AMD Backend](#amd-backend) * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize) - * [Add more GPUs](#add-more-gpus) - * [Two Threads per GPU](two-threads-per-gpu) - * [Interleave Tuning](interleave-tuning ) + * [Two Threads per GPU](two-threads-per-GPU) + * [Interleave Tuning](interleave-tuning) * [disable comp_mode](#disable-comp_mode) - * [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern) + * [Auto-tune](#auto-tune) + * [Change the scratchpad memory pattern](change-the-scratchpad-memory-pattern) * [Increase Memory Pool](#increase-memory-pool) * [Scratchpad Indexing](#scratchpad-indexing) * [CPU Backend](#cpu-backend) * [Choose Value for `low_power_mode`](#choose-value-for-low_power_mode) ## Benchmark -To benchmark the miner speed there are two ways. - - Mine against a pool end press the key `h` after 30 sec to see the hash report. 
- - Start the miner with the cli option `--benchmark BLOCKVERSION`. The miner will not connect to any pool and performs a 60sec performance benchmark with all enabled back-ends. +You can benchmark the miner in two ways: + - Edit `config.txt` and set `verbose_level` to 4 and `h_print_time` to 30 and start the miner. You will see hash report each 30 seconds. + - Start the miner with the cli option `--benchmark BLOCKVERSION`. The miner will not connect to any pool and performs a 60sec performance benchmark with all enabled backends. ## Windows "Run As Administrator" prompt (UAC) confirmation is needed to use large pages on Windows 7. On Windows 10 it is only needed once to set up the account to use them. Disable the dialog with the command line option `--noUAC` +### Managing GPUs + +To turn on and off a GPU you need to add/remove config set to `GPU_threads_conf`. +`index` is the number of the GPU, the index order not follow the order from `nvidia-smi` or the order shown in windows. + +``` +"GPU_threads_conf" : +[ + { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, + "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1, + }, + { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, + "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1, + }, +], +``` + ## NVIDIA Backend By default the NVIDIA backend can be tuned in the config file `nvidia.txt` ### Choose Value for `threads` and `blocks` -The optimal parameter for the `threads` and `blocks` option in `config.txt` depend on your GPU. -For all GPU's with a compute capability `>=2.0` and `<6.0` there is a restriction of the amount of RAM that can be used for the mining algorithm. -The maximum RAM that can be used must be less than 2GB (e.g. GTX TITAN) or 1GB (e.g. GTX 750-TI). -The amount of RAM used for mining can be changed with `"threads" : T, "blocks : B"`. 
+The optimal values for the `threads` and `blocks` parameters in `nvidia.txt` depend on your GPU model and selected mining algorithm. +For all GPU's with a compute capability `>=2.0` and `<6.0` there is a restriction of the amount of vRAM that can be used for the mining algorithm. +The maximum vRAM that can be used must be less than 2GB (e.g. GTX TITAN) or 1GB (e.g. GTX 750-TI). +The amount of vRAM used for mining can be changed with `"threads" : T, "blocks : B"`. - `T` = threads used per block - `B` = CUDA blocks started (should be a multiple of the multiprocessors `M` on the GPU) @@ -48,23 +66,6 @@ and full fill all restrictions `16 * 48 * 2 = 1536` and `48 mod 24 = 0`. The memory limit for NVIDIA Pascal GPUs is `16` GiB if the newest CUDA driver is used. -### Add More GPUs - -To add a new GPU you need to add a new config set to `gpu_threads_conf`. -`index` is the number of the gpu, the index order not follow the order from `nvidia-smi` or the order shown in windows. - -``` -"gpu_threads_conf" : -[ - { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, - "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1, - }, - { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, - "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1, - }, -], -``` - ## AMD Backend By default the AMD backend can be tuned in the config file `amd.txt` @@ -75,38 +76,16 @@ Intensity means the number of threads used to mine. The maximum intensity is GPU `worksize` is the number of threads working together to increase the miner performance. In the most cases a `worksize` of `16` or `8` is optimal. -### Add More GPUs - -To add a new GPU you need to add a new config set to `gpu_threads_conf`. `index` is the OpenCL index of the gpu. -`platform_index`is the index of the OpenCL platform (Intel / AMD / Nvidia). -If you are unsure of either GPU or platform index value, you can use `clinfo` tool that comes with AMD APP SDK to dump the values. 
- -``` -"gpu_threads_conf" : -[ - { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, - "interleave" : 40 - }, - { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, - "interleave" : 40 - }, -], - -"platform_index" : 0, -``` ### Two Threads per GPU -Some GPUs like AMD Vega can mine faster if two threads are using the same GPU. -Use the auto generated config as base and repeat the config entry for a GPU. -If the attribute `index` is used twice than two threads will use one GPU. -Take care that the required memory usage on the GPU will also double. -Therefore adjust your intensity by hand. +Some AMD GPUs can mine faster on some mining algorithms if two threads are using the same GPU. +If you have `amd.txt` config with one `index` entry per GPU - duplicate these entries to run 2 threads per GPU. +*Notice*: Keep in mind that the memory usage on the GPU will also double - therefore adjust your `intensity` by hand. +Example of 2-threaded config: ``` -"gpu_threads_conf" : +"GPU_threads_conf" : [ { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, @@ -123,43 +102,88 @@ Therefore adjust your intensity by hand. ### Interleave Tuning -Interleave controls when a worker thread is starting to calculate a bunch of hashes -if two worker threads are used to utilize one GPU. -This option has no effect if only one worker thread is used per GPU. +**Note 1:** Interleaving is available for AMD GPUs only. -![Interleave](img/interleave.png) +**Note 2** Interleaving has effect only when 2+ threads are used per each GPU. -Interleave defines how long a thread needs to wait to start the next hash calculation relative to the last started worker thread. 
-To choose a interleave value larger than 50% makes no sense because than the gpu will not be utilized well enough. -In the most cases the default 40 is a good value but on some systems e.g. Linux Rocm 1.9.1 driver with RX5XX you need to adjust the value. -If you get many interleave message in a row (over 1 minute) you should adjust the value. +Interleave controls when a worker thread is starting to calculate hashes if two worker threads are used to utilize one GPU. This parameter is designed to reduce total idle periods of GPU while mining -``` -OpenCL Interleave 0|1: 642/2400.50 ms - 30.1 -OpenCL Interleave 0|0: 355/2265.05 ms - 30.2 -OpenCL Interleave 0|1: 221/2215.65 ms - 30.2 -``` +![Interleave](_img/interleave.png) -description: -``` -|: / ms - +**1.Reading and understanding the log:** -``` -`last delay` should gou slowly to 0. -If it goes down and than jumps to a very large value multiple times within a minute you should reduce the intensity by 5. -The `intensity value` will automatically go up and down within the range of +-5% to adjust kernel run-time fluctuations. -Automatic adjustment is disabled as long as `auto-tuning` is active and will be started after it is finished. -If `last delay` goes down to 10ms and the messages stops and repeated from time to time with delays up to 15ms you will have already a good value. +`OpenCL Interleave 0|0: 265/1372.30 ms - 40.1` +`OpenCL Interleave 0|1: 125/1330.10 ms - 40.2` +`OpenCL Interleave 0|0: 74/1323.67 ms - 40.2` +`OpenCL Interleave 0|1: 43/1312.01 ms - 40.2` +`OpenCL Interleave 0|1: 16/1283.20 ms - 40.2` + +Reads as: +`OpenCL Interleave GPU ID|Thread ID: last delay/average calculation time per hash bunch - interleave value` + + +**2.Do I need to adjust it?** +In general, interleaving can be used as representation how 2-threading works with your GPU at current set of settings (including GPU power profile, miner settings, drivers). And default value `"interleave" : 40` in `amd.txt` works good in most cases. 
+ +2.1 Optimal setup: After you started mining you have `last delay` value reduced over time to minimum possible value and stays at it. The best scenario is when `last delay` value settled around 10-15 and interleave messages appear rarely. The reported hashrate will be close to max. of GPU capabilities. + +2.2 Not optimal setup: After you started mining you have `last delay` value reducing over time and jumping back to high values, or rising after the start of mining. The reported hashrate will be lower compared to max. possible. + +**3.Adjusting Interleaving and optimizing hashrate** +**Note:** setting `interleave` value in amd.txt higher than 50 has no practical sense + +If you faced situation described in 2.2 then you need to keep in mind that this can be caused by several possible reasons, so treat them accordingly and start miner after each attempt and check logs and hashrate: + +- Miner misconfiguration 1: Adjust "interleave" in amd.txt by couple points +/- +- Miner misconfiguration 2: Adjust "intensity" in amd.txt by setting lower value (multiple to "worksize" value) +- GPU overclock: Reduce overclock/overvoltage values of GPU memory and GPU core +- Drivers issue: Try [reinstalling your drivers](https://www.amd.com/en/support) (there are 3 possible options to try: blockchain drivers, v18.6.1, or newest version) +​ ### disable comp_mode `comp_mode` means compatibility mode and removes some checks in compute kernel those takes care that the miner can be used on a wide range of AMD/OpenCL GPU devices. To avoid miner crashes the `intensity` should be a multiple of `worksize` if `comp_mode` is `false`. -### change the scratchpad memory pattern +### Auto tune + +**Note:** This feature is available for AMD gpus only. + +Auto-tuning feature may help you to speed up seek process of finding optimal intensity for your GPU (vs manual check, in case if you want to compare autogenerated intensity with the most performing value). 
+ +When set, miner will perform several (defined by user) rounds per each intensity check of given range. When setting number of rounds - keep in mind that you want to have a balance of speed and reliability of the checking. + +After setting number of checks per intensity value, you will need to set ceiling value after which the miner will stop checking intensity values. + +**1. Enabling and configuring auto-tune** +Navigate to amd.txt config file in miner's folder, find (in the bottom part) parameter "auto_tune" : 0, and set it to "auto_tune" : 6, (6-10 rounds per intensity value suits most cases.) +Set autogenerated value of "intensity" : X, for each thread in amd.txt to slightly higher level (e.g. from 890 to 1000) +Start xmr-stak.exe + +**2. Reading and understanding the log** +Here is an example of log for 1 GPU with 2 threads (your values will vary): +`OpenCL 0|0: auto-tune validate intensity 848|840` +`OpenCL 0|1: auto-tune validate intensity 848|840` +`OpenCL 0|0: auto-tune validate intensity 856|848` +`OpenCL 0|1: auto-tune validate intensity 856|848` +Reads as: `OpenCL GPU ID|Thread ID auto-tune validate intensity Currently checked value|last successfully checked value` + +After the checking, you will see + +`OpenCL 0|0: lock intensity at 896` +`OpenCL 0|1: lock intensity at 896` +Write down these locked intensity values and stop miner. + +**3. Finalizing setup** +Set "auto_tune" value (step 1.1) in `amd.txt` back to "auto_tune" : 0, +Enter locked intensity values from step 2. +Start miner. + +### Change the scratchpad memory pattern By changing `strided_index` to `2` the number of contiguous elements (a 16 byte) for one miner thread can be fine tuned with the option `mem_chunk`. + ### Increase Memory Pool By setting the following environment variables before the miner is started OpenCl allows the miner to more threads.
diff --git a/doc/usage.md b/doc/usage.md index 82d26dcc5..800ff6949 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -1,9 +1,9 @@ -# HowTo Use xmr-stak +# HowTo Use Xmr-Stak ## Content Overview -* [Configuration](#configuration) +* [Configurations](#configurations) * [Usage on Windows](#usage-on-windows) -* [Usage on Linux](#usage-on-linux) +* [Usage on Linux & macOS](#usage-on-linux--macos) * [Command Line Options](#command-line-options) * [Use different backends](#use-different-backends) * [HTML and JSON API report configuraton](#html-and-json-api-report-configuraton) @@ -77,6 +77,6 @@ Debug the docker image by getting inside: docker run --entrypoint=/bin/bash --rm -it -u $(id -u):$(id -g) --name fireice-uk/xmr-stak -v "$PWD":/mnt xmr-stak ``` -## HTML and JSON API report configuraton +## HTML and JSON API report configuration To configure the reports shown on the [README](../README.md) side you need to edit the httpd_port variable. Then enable wifi on your phone and navigate to [miner ip address]:[httpd_port] in your phone browser. 
If you want to use the data in scripts, you can get the JSON version of the data at url [miner ip address]:[httpd_port]/api.json diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.cpp b/xmrstak/backend/amd/OclCryptonightR_gen.cpp index ccb836e41..2a60c46d9 100644 --- a/xmrstak/backend/amd/OclCryptonightR_gen.cpp +++ b/xmrstak/backend/amd/OclCryptonightR_gen.cpp @@ -1,19 +1,18 @@ -#include -#include -#include #include +#include +#include +#include #include - #include "xmrstak/backend/amd/OclCryptonightR_gen.hpp" #include "xmrstak/backend/cpu/crypto/variant4_random_math.h" -#include "xmrstak/misc/console.hpp" #include "xmrstak/cpputil/read_write_lock.h" +#include "xmrstak/misc/console.hpp" #include -#include #include - +#include +#include namespace xmrstak { @@ -22,16 +21,16 @@ namespace amd static std::string get_code(const V4_Instruction* code, int code_size) { - std::stringstream s; + std::stringstream s; - for (int i = 0; i < code_size; ++i) + for(int i = 0; i < code_size; ++i) { const V4_Instruction inst = code[i]; const uint32_t a = inst.dst_index; const uint32_t b = inst.src_index; - switch (inst.opcode) + switch(inst.opcode) { case MUL: s << 'r' << a << "*=r" << b << ';'; @@ -58,37 +57,39 @@ static std::string get_code(const V4_Instruction* code, int code_size) s << '\n'; } - return s.str(); + return s.str(); } struct CacheEntry { - CacheEntry(xmrstak_algo algo, uint64_t height, size_t deviceIdx, cl_program program) : - algo(algo), - height(height), - deviceIdx(deviceIdx), - program(program) - {} - - xmrstak_algo algo; - uint64_t height; - size_t deviceIdx; - cl_program program; + CacheEntry(xmrstak_algo algo, uint64_t height_offset, size_t deviceIdx, cl_program program) : + algo(algo), + height_offset(height_offset), + deviceIdx(deviceIdx), + program(program) + { + } + + xmrstak_algo algo; + uint64_t height_offset; + size_t deviceIdx; + cl_program program; }; struct BackgroundTaskBase { - virtual ~BackgroundTaskBase() {} - virtual void exec() = 0; + 
virtual ~BackgroundTaskBase() {} + virtual void exec() = 0; }; -template +template struct BackgroundTask : public BackgroundTaskBase { - BackgroundTask(T&& func) : m_func(std::move(func)) {} - void exec() override { m_func(); } + BackgroundTask(T&& func) : + m_func(std::move(func)) {} + void exec() override { m_func(); } - T m_func; + T m_func; }; static ::cpputil::RWLock CryptonightR_cache_mutex; @@ -99,94 +100,113 @@ static std::mutex background_tasks_mutex; static std::vector background_tasks; static std::thread* background_thread = nullptr; +static cl_program search_program( + const GpuContext* ctx, + xmrstak_algo algo, + uint64_t height_offset, + bool lock_cache = true) +{ + if(lock_cache) + CryptonightR_cache_mutex.ReadLock(); + + // Check if the cache has this program + for(const CacheEntry& entry : CryptonightR_cache) + { + if((entry.algo == algo) && (entry.height_offset == height_offset) && (entry.deviceIdx == ctx->deviceIdx)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu found in cache", height_offset); + auto result = entry.program; + if(lock_cache) + CryptonightR_cache_mutex.UnLock(); + return result; + } + } + if(lock_cache) + CryptonightR_cache_mutex.UnLock(); + + return nullptr; +} + static void background_thread_proc() { - std::vector tasks; - for (;;) { - tasks.clear(); - { - std::lock_guard g(background_tasks_mutex); - background_tasks.swap(tasks); - } - - for (BackgroundTaskBase* task : tasks) { - task->exec(); - delete task; - } + std::vector tasks; + for(;;) + { + tasks.clear(); + { + std::lock_guard g(background_tasks_mutex); + background_tasks.swap(tasks); + } + + for(BackgroundTaskBase* task : tasks) + { + task->exec(); + delete task; + } std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } + } } -template +template static void background_exec(T&& func) { - BackgroundTaskBase* task = new BackgroundTask(std::move(func)); + BackgroundTaskBase* task = new BackgroundTask(std::move(func)); - 
std::lock_guard g(background_tasks_mutex); - background_tasks.push_back(task); - if (!background_thread) { - background_thread = new std::thread(background_thread_proc); - } + std::lock_guard g(background_tasks_mutex); + background_tasks.push_back(task); + if(!background_thread) + { + background_thread = new std::thread(background_thread_proc); + } } static cl_program CryptonightR_build_program( - const GpuContext* ctx, - xmrstak_algo algo, - uint64_t height, - uint32_t precompile_count, - std::string source_code, - std::string options) + const GpuContext* ctx, + xmrstak_algo algo, + uint64_t height_offset, + uint64_t height_chunk_size, + uint32_t precompile_count, + std::string source_code, + std::string options) { - std::vector old_programs; - old_programs.reserve(32); - { + std::vector old_programs; + old_programs.reserve(32); + { CryptonightR_cache_mutex.WriteLock(); - // Remove old programs from cache - for(size_t i = 0; i < CryptonightR_cache.size();) - { - const CacheEntry& entry = CryptonightR_cache[i]; - if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height)) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height); - old_programs.push_back(entry.program); - CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); - CryptonightR_cache.pop_back(); - } - else - { - ++i; - } - } + // Remove old programs from cache + for(size_t i = 0; i < CryptonightR_cache.size();) + { + const CacheEntry& entry = CryptonightR_cache[i]; + if((entry.algo == algo) && (entry.height_offset + (2 + precompile_count) * height_chunk_size < height_offset)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu released (old program)", entry.height_offset); + old_programs.push_back(entry.program); + CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); + CryptonightR_cache.pop_back(); + } + else + { + ++i; + } + } CryptonightR_cache_mutex.UnLock(); - } - - 
for(cl_program p : old_programs) { - clReleaseProgram(p); - } + } - std::lock_guard g1(CryptonightR_build_mutex); + for(cl_program p : old_programs) + { + clReleaseProgram(p); + } - cl_program program = nullptr; - { - CryptonightR_cache_mutex.ReadLock(); + std::lock_guard g1(CryptonightR_build_mutex); - // Check if the cache already has this program (some other thread might have added it first) - for (const CacheEntry& entry : CryptonightR_cache) - { - if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx)) - { - program = entry.program; - break; - } - } - CryptonightR_cache_mutex.UnLock(); - } + cl_program program = search_program(ctx, algo, height_offset); - if (program) { - return program; - } + if(program) + { + return program; + } cl_int ret; const char* source = source_code.c_str(); @@ -194,7 +214,7 @@ static cl_program CryptonightR_build_program( program = clCreateProgramWithSource(ctx->opencl_ctx, 1, (const char**)&source, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L0,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); + printer::inst()->print_msg(L0, "Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); return program; } @@ -202,11 +222,11 @@ static cl_program CryptonightR_build_program( if(ret != CL_SUCCESS) { size_t len; - printer::inst()->print_msg(L0,"Error %s when calling clBuildProgram.", err_to_str(ret)); + printer::inst()->print_msg(L0, "Error %s when calling clBuildProgram.", err_to_str(ret)); if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) { - printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); return program; } @@ -216,12 +236,12 @@ static 
cl_program CryptonightR_build_program( if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) { free(BuildLog); - printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); + printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); return program; } printer::inst()->print_str("Build log:\n"); - std::cerr<DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); return program; } std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } - while(status == CL_BUILD_IN_PROGRESS); + } while(status == CL_BUILD_IN_PROGRESS); + CryptonightR_cache_mutex.WriteLock(); + auto cached_program = search_program(ctx, algo, height_offset, false); - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height); + if(cached_program) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: release already existing program %llu", height_offset); + clReleaseProgram(program); + program = cached_program; + } + else + { + CryptonightR_cache.emplace_back(algo, height_offset, ctx->deviceIdx, program); + printer::inst()->print_msg(LDEBUG, "CryptonightR: cache compiled program for height_offset %llu", height_offset); + } - CryptonightR_cache_mutex.WriteLock(); - CryptonightR_cache.emplace_back(algo, height, ctx->deviceIdx, program); CryptonightR_cache_mutex.UnLock(); - return program; + return program; } -cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, uint32_t precompile_count, bool background) +cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t 
height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background) { - printer::inst()->print_msg(LDEBUG, "CryptonightR: start %llu released",height); - - if (background) { - background_exec([=](){ CryptonightR_get_program(ctx, algo, height, precompile_count, false); }); - return nullptr; - } - - const char* source_code_template = - #include "amd_gpu/opencl/wolf-aes.cl" - #include "amd_gpu/opencl/cryptonight_r.cl" - ; - const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; - const char* offset = strstr(source_code_template, include_name); - if (!offset) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo); - return nullptr; - } - - V4_Instruction code[256]; - int code_size; - switch (algo.Id()) - { - case cryptonight_r_wow: - code_size = v4_random_math_init(code, height); - break; - case cryptonight_r: - code_size = v4_random_math_init(code, height); - break; - default: - printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo); - return nullptr; - } - - std::string source_code(source_code_template, offset); - source_code.append(get_code(code, code_size)); - source_code.append(offset + sizeof(include_name) - 1); + if(background) + { + background_exec([=]() { CryptonightR_get_program(ctx, algo, height_offset, height_chunk_size, precompile_count, false); }); + return nullptr; + } + + auto program = search_program(ctx, algo, height_offset); + + if(program != nullptr) + return program; + + printer::inst()->print_msg(LDEBUG, "CryptonightR: create code for block %llu to %llu", height_offset, height_offset + height_chunk_size); + + const char* source_code_definitions = +#include "amd_gpu/opencl/cryptonight_r_def.rtcl" +#include "amd_gpu/opencl/wolf-aes.cl" + ; + + const char* source_code_template = +#include "amd_gpu/opencl/cryptonight_r.rtcl" + ; + const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; + const char* offset = 
strstr(source_code_template, include_name); + if(!offset) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo); + return nullptr; + } + + std::string source_code(source_code_definitions); + + for(uint64_t c = 0; c < height_chunk_size; ++c) + { + V4_Instruction code[256]; + int code_size; + switch(algo.Id()) + { + case cryptonight_r_wow: + code_size = v4_random_math_init(code, height_offset + c); + break; + case cryptonight_r: + code_size = v4_random_math_init(code, height_offset + c); + break; + default: + printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo); + return nullptr; + } + + std::string kernel_code(source_code_template, offset); + kernel_code.append(get_code(code, code_size)); + kernel_code.append(offset + sizeof(include_name) - 1); + + std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height_offset + c); + + source_code += std::regex_replace(kernel_code, std::regex("cn1_cryptonight_r"), kernel_name); + } // scratchpad size for the selected mining algorithm size_t hashMemSize = algo.Mem(); @@ -324,28 +372,12 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t if(algo == cryptonight_gpu) options += " -cl-fp32-correctly-rounded-divide-sqrt"; + program = search_program(ctx, algo, height_offset); - const char* source = source_code.c_str(); - - { - CryptonightR_cache_mutex.ReadLock(); - - // Check if the cache has this program - for (const CacheEntry& entry : CryptonightR_cache) - { - if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx)) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height); - auto result = entry.program; - CryptonightR_cache_mutex.UnLock(); - return result; - } - } - CryptonightR_cache_mutex.UnLock(); - - } + if(program != nullptr) + return program; - return CryptonightR_build_program(ctx, algo, 
height, precompile_count, source, options); + return CryptonightR_build_program(ctx, algo, height_offset, precompile_count, height_chunk_size, source_code, options); } } // namespace amd diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.hpp b/xmrstak/backend/amd/OclCryptonightR_gen.hpp index 7dce77b85..f8772b1f5 100644 --- a/xmrstak/backend/amd/OclCryptonightR_gen.hpp +++ b/xmrstak/backend/amd/OclCryptonightR_gen.hpp @@ -3,8 +3,8 @@ #include "xmrstak/backend/cryptonight.hpp" #include -#include #include +#include #if defined(__APPLE__) #include @@ -20,7 +20,7 @@ namespace amd { cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo, - uint64_t height, uint32_t precompile_count, bool background = false); + uint64_t height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background = false); } // namespace amd } // namespace xmrstak diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 9f3f75469..3c4384722 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -13,45 +13,43 @@ * along with this program. If not, see . 
*/ +#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp" #include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" -#include "xmrstak/picosha2/picosha2.hpp" +#include "xmrstak/net/msgstruct.hpp" #include "xmrstak/params.hpp" +#include "xmrstak/picosha2/picosha2.hpp" #include "xmrstak/version.hpp" -#include "xmrstak/net/msgstruct.hpp" -#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp" +#include +#include +#include +#include +#include #include #include -#include -#include #include -#include -#include -#include -#include #include +#include #include -#include #include -#include #include +#include #if defined _MSC_VER #include #elif defined __GNUC__ -#include #include +#include #endif - #ifdef _WIN32 #include static inline void create_directory(std::string dirname) { - _mkdir(dirname.data()); + _mkdir(dirname.data()); } static inline void port_sleep(size_t sec) @@ -59,8 +57,8 @@ static inline void port_sleep(size_t sec) Sleep(sec * 1000); } #else -#include #include +#include static inline void create_directory(std::string dirname) { @@ -100,7 +98,7 @@ char* LoadTextFile(const char* filename) flen = ftell(kernel); fseek(kernel, 0, SEEK_SET); - out = (char*)malloc(flen+1); + out = (char*)malloc(flen + 1); size_t r = fread(out, flen, 1, kernel); fclose(kernel); @@ -121,7 +119,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &MaximumWorkSize, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when querying a device's max worksize using clGetDeviceInfo.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when querying a device's max worksize using clGetDeviceInfo.", err_to_str(ret)); return ERR_OCL_API; } @@ -140,16 +138,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ */ MaximumWorkSize /= 8; } - printer::inst()->print_msg(L1,"Device %lu work size %lu / %lu.", 
ctx->deviceIdx, ctx->workSize, MaximumWorkSize); + printer::inst()->print_msg(L1, "Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); if(ctx->workSize > MaximumWorkSize) { ctx->workSize = MaximumWorkSize; - printer::inst()->print_msg(L1,"Device %lu work size to large, reduce to %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); + printer::inst()->print_msg(L1, "Device %lu work size to large, reduce to %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); } const std::string backendName = xmrstak::params::inst().openCLVendor; - if( (ctx->stridedIndex == 2 || ctx->stridedIndex == 3) && (ctx->rawIntensity % ctx->workSize) != 0) + if((ctx->stridedIndex == 2 || ctx->stridedIndex == 3) && (ctx->rawIntensity % ctx->workSize) != 0) { size_t reduced_intensity = (ctx->rawIntensity / ctx->workSize) * ctx->workSize; ctx->rawIntensity = reduced_intensity; @@ -157,29 +155,29 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) - const cl_queue_properties CommandQueueProperties[] = { 0, 0, 0 }; + const cl_queue_properties CommandQueueProperties[] = {0, 0, 0}; ctx->CommandQueues = clCreateCommandQueueWithProperties(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret); #else - const cl_command_queue_properties CommandQueueProperties = { 0 }; + const cl_command_queue_properties CommandQueueProperties = {0}; ctx->CommandQueues = clCreateCommandQueue(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret); #endif if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateCommandQueueWithProperties.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateCommandQueueWithProperties.", err_to_str(ret)); return ERR_OCL_API; } if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx->computeUnits), NULL)) != CL_SUCCESS) { - 
printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx); return ERR_OCL_API; } ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 128, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -193,14 +191,14 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret)); return ERR_OCL_API; } ctx->ExtraBuffers[1] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -208,7 +206,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->ExtraBuffers[2] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, 
"Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -216,7 +214,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->ExtraBuffers[3] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -224,7 +222,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->ExtraBuffers[4] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -232,7 +230,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->ExtraBuffers[5] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -240,21 +238,21 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->OutputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * 0x100, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create output buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling 
clCreateBuffer to create output buffer.", err_to_str(ret)); return ERR_OCL_API; } std::vector devNameVec(1024); if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(ret),ctx->deviceIdx ); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(ret), ctx->deviceIdx); return ERR_OCL_API; } std::vector openCLDriverVer(1024); if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx ); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret), ctx->deviceIdx); return ERR_OCL_API; } @@ -342,11 +340,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ if(xmrstak::params::inst().AMDCache == false || !clBinFile.good()) { if(xmrstak::params::inst().AMDCache) - printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str()); + printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code %s not found. 
Compiling ...", ctx->deviceIdx, cache_file.c_str()); ctx->Program[miner_algo] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); return ERR_OCL_API; } @@ -354,11 +352,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ if(ret != CL_SUCCESS) { size_t len; - printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clBuildProgram.", err_to_str(ret)); if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); return ERR_OCL_API; } @@ -368,28 +366,27 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) { free(BuildLog); - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); return ERR_OCL_API; } printer::inst()->print_str("Build log:\n"); - std::cerr<Program[miner_algo], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL); - + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL); std::vector devices_ids(num_devices); - 
clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id) * devices_ids.size(), devices_ids.data(), NULL); int dev_id = 0; /* Search for the gpu within the program context. * The id can be different to ctx->DeviceID. */ - for(auto & ocl_device : devices_ids) + for(auto& ocl_device : devices_ids) { if(ocl_device == ctx->DeviceID) break; @@ -401,17 +398,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ { if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); return ERR_OCL_API; } port_sleep(1); - } - while(status == CL_BUILD_IN_PROGRESS); + } while(status == CL_BUILD_IN_PROGRESS); if(xmrstak::params::inst().AMDCache) { std::vector binary_sizes(num_devices); - clGetProgramInfo (ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); std::vector all_programs(num_devices); std::vector> program_storage; @@ -419,7 +415,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ int p_id = 0; size_t mem_size = 0; // create memory structure to query all OpenCL program binaries - for(auto & p : all_programs) + for(auto& p : all_programs) { program_storage.emplace_back(std::vector(binary_sizes[p_id])); all_programs[p_id] = program_storage[p_id].data(); @@ -427,9 +423,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, 
const char* source_ p_id++; } - if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) + if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetProgramInfo.", err_to_str(ret)); return ERR_OCL_API; } @@ -437,12 +433,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary); file_stream.write(all_programs[dev_id], binary_sizes[dev_id]); file_stream.close(); - printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str()); + printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s", ctx->deviceIdx, cache_file.c_str()); } } else { - printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str()); + printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s", ctx->deviceIdx, cache_file.c_str()); std::ostringstream ss; ss << clBinFile.rdbuf(); std::string s = ss.str(); @@ -453,22 +449,21 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ cl_int clStatus; ctx->Program[miner_algo] = clCreateProgramWithBinary( opencl_ctx, 1, &ctx->DeviceID, &bin_size, - (const unsigned char **)&data_ptr, &clStatus, &ret - ); + (const unsigned char**)&data_ptr, &clStatus, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str()); + printer::inst()->print_msg(L1, "Error %s when calling clCreateProgramWithBinary. 
Try to delete file %s", err_to_str(ret), cache_file.c_str()); return ERR_OCL_API; } ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, NULL, NULL, NULL); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); + printer::inst()->print_msg(L1, "Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); return ERR_OCL_API; } } - std::vector KernelNames = { "cn2", "Blake", "Groestl", "JH", "Skein" }; + std::vector KernelNames = {"cn2", "Blake", "Groestl", "JH", "Skein"}; if(miner_algo == cryptonight_gpu) { KernelNames.insert(KernelNames.begin(), "cn1_cn_gpu"); @@ -494,7 +489,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str()); + printer::inst()->print_msg(L1, "Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str()); return ERR_OCL_API; } } @@ -508,30 +503,28 @@ const cl_platform_info attributeTypes[5] = { CL_PLATFORM_VENDOR, CL_PLATFORM_VERSION, CL_PLATFORM_PROFILE, - CL_PLATFORM_EXTENSIONS -}; + CL_PLATFORM_EXTENSIONS}; const char* const attributeNames[] = { "CL_PLATFORM_NAME", "CL_PLATFORM_VENDOR", "CL_PLATFORM_VERSION", "CL_PLATFORM_PROFILE", - "CL_PLATFORM_EXTENSIONS" -}; + "CL_PLATFORM_EXTENSIONS"}; -#define NELEMS(x) (sizeof(x) / sizeof((x)[0])) +#define NELEMS(x) (sizeof(x) / sizeof((x)[0])) uint32_t getNumPlatforms() { cl_uint num_platforms = 0; - cl_platform_id * platforms = NULL; + cl_platform_id* platforms = NULL; cl_int clStatus; // Get platform and device information clStatus = clGetPlatformIDs(0, NULL, &num_platforms); if(clStatus != CL_SUCCESS) { - 
printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for number of platforms.", err_to_str(clStatus)); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for number of platforms.", err_to_str(clStatus)); return 0u; } @@ -554,29 +547,29 @@ std::vector getAMDDevices(int index) platforms.resize(numPlatforms); if((clStatus = clGetPlatformIDs(numPlatforms, platforms.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); return ctxVec; } - if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS) + if((clStatus = clGetDeviceIDs(platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus)); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus)); return ctxVec; } device_list.resize(num_devices); - if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS) + if((clStatus = clGetDeviceIDs(platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus)); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus)); return ctxVec; } - for (size_t k = 0; k < num_devices; k++) + for(size_t k = 0; k < num_devices; k++) { std::vector devVendorVec(1024); if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL)) != CL_SUCCESS) { - 
printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k); continue; } @@ -596,19 +589,19 @@ std::vector getAMDDevices(int index) if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k); continue; } if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(ctx.maxMemPerAlloc), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k); continue; } if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k); continue; } @@ -618,14 +611,14 @@ std::vector getAMDDevices(int index) if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME 
for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k); continue; } std::vector openCLDriverVer(1024); if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k); continue; } @@ -636,7 +629,7 @@ std::vector getAMDDevices(int index) ctx.name = std::string(devNameVec.data()); ctx.DeviceID = device_list[k]; ctx.interleave = 40; - printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); + printer::inst()->print_msg(L0, "Found OpenCL GPU %s.", ctx.name.c_str()); ctxVec.push_back(ctx); } } @@ -651,13 +644,13 @@ int getAMDPlatformIdx() if(numPlatforms == 0) { - printer::inst()->print_msg(L0,"WARNING: No OpenCL platform found."); + printer::inst()->print_msg(L0, "WARNING: No OpenCL platform found."); return -1; } - cl_platform_id * platforms = NULL; + cl_platform_id* platforms = NULL; cl_int clStatus; - platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms); + platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * numPlatforms); clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL); int platformIndex = -1; @@ -666,7 +659,8 @@ int getAMDPlatformIdx() if(clStatus == CL_SUCCESS) { - for (int i = 0; i < numPlatforms; i++) { + for(int i = 0; i < numPlatforms; i++) + { size_t infoSize; clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 0, NULL, &infoSize); std::vector platformNameVec(infoSize); @@ -675,13 +669,13 @@ int getAMDPlatformIdx() std::string platformName(platformNameVec.data()); bool isAMDOpenCL = platformName.find("Advanced Micro Devices") 
!= std::string::npos || - platformName.find("Apple") != std::string::npos || - platformName.find("Mesa") != std::string::npos; + platformName.find("Apple") != std::string::npos || + platformName.find("Mesa") != std::string::npos; bool isNVIDIADevice = platformName.find("NVIDIA Corporation") != std::string::npos || platformName.find("NVIDIA") != std::string::npos; std::string selectedOpenCLVendor = xmrstak::params::inst().openCLVendor; if((isAMDOpenCL && selectedOpenCLVendor == "AMD") || (isNVIDIADevice && selectedOpenCLVendor == "NVIDIA")) { - printer::inst()->print_msg(L0,"Found %s platform index id = %i, name = %s", selectedOpenCLVendor.c_str(), i , platformName.c_str()); + printer::inst()->print_msg(L0, "Found %s platform index id = %i, name = %s", selectedOpenCLVendor.c_str(), i, platformName.c_str()); if(platformName.find("Mesa") != std::string::npos) mesaPlatform = i; else @@ -695,12 +689,12 @@ int getAMDPlatformIdx() // fall back to Mesa OpenCL if(platformIndex == -1 && mesaPlatform != -1) { - printer::inst()->print_msg(L0,"No AMD platform found select Mesa as OpenCL platform"); + printer::inst()->print_msg(L0, "No AMD platform found select Mesa as OpenCL platform"); platformIndex = mesaPlatform; } } else - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); free(platforms); return platformIndex; @@ -716,15 +710,14 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) if((ret = clGetPlatformIDs(0, NULL, &entries)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for number of platforms.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetPlatformIDs for number of platforms.", err_to_str(ret)); return ERR_OCL_API; } - // The number of platforms naturally is the index of the 
last platform plus one. if(entries <= platform_idx) { - printer::inst()->print_msg(L1,"Selected OpenCL platform index %d doesn't exist.", platform_idx); + printer::inst()->print_msg(L1, "Selected OpenCL platform index %d doesn't exist.", platform_idx); return ERR_STUPID_PARAMS; } @@ -736,7 +729,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) #endif if((ret = clGetPlatformIDs(entries, PlatformIDList, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for platform ID information.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetPlatformIDs for platform ID information.", err_to_str(ret)); return ERR_OCL_API; } @@ -747,12 +740,12 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) std::string platformName(platformNameVec.data()); if(xmrstak::params::inst().openCLVendor == "AMD" && platformName.find("Advanced Micro Devices") == std::string::npos) { - printer::inst()->print_msg(L1,"WARNING: using non AMD device: %s", platformName.c_str()); + printer::inst()->print_msg(L1, "WARNING: using non AMD device: %s", platformName.c_str()); } if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &entries)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for number of devices.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetDeviceIDs for number of devices.", err_to_str(ret)); return ERR_OCL_API; } @@ -761,7 +754,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) { if(ctx[i].deviceIdx >= entries) { - printer::inst()->print_msg(L1,"Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx); + printer::inst()->print_msg(L1, "Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx); return ERR_STUPID_PARAMS; } } @@ -773,7 +766,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) 
#endif if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, entries, DeviceIDList, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for device ID information.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetDeviceIDs for device ID information.", err_to_str(ret)); return ERR_OCL_API; } @@ -790,41 +783,41 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) cl_context opencl_ctx = clCreateContext(NULL, num_gpus, TempDeviceList.data(), NULL, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateContext.", err_to_str(ret)); return ERR_OCL_API; } - const char *fastIntMathV2CL = - #include "./opencl/fast_int_math_v2.cl" - ; - const char *fastDivHeavyCL = - #include "./opencl/fast_div_heavy.cl" - ; - const char *cryptonightCL = - #include "./opencl/cryptonight.cl" - ; - const char *blake256CL = - #include "./opencl/blake256.cl" - ; - const char *groestl256CL = - #include "./opencl/groestl256.cl" - ; - const char *jhCL = - #include "./opencl/jh.cl" - ; - const char *wolfAesCL = - #include "./opencl/wolf-aes.cl" - ; - const char *wolfSkeinCL = - #include "./opencl/wolf-skein.cl" - ; - const char *cryptonight_gpu = - #include "./opencl/cryptonight_gpu.cl" - ; + const char* fastIntMathV2CL = +#include "./opencl/fast_int_math_v2.cl" + ; + const char* fastDivHeavyCL = +#include "./opencl/fast_div_heavy.cl" + ; + const char* cryptonightCL = +#include "./opencl/cryptonight.cl" + ; + const char* blake256CL = +#include "./opencl/blake256.cl" + ; + const char* groestl256CL = +#include "./opencl/groestl256.cl" + ; + const char* jhCL = +#include "./opencl/jh.cl" + ; + const char* wolfAesCL = +#include "./opencl/wolf-aes.cl" + ; + const char* wolfSkeinCL = +#include "./opencl/wolf-skein.cl" + ; + const char* cryptonight_gpu = 
+#include "./opencl/cryptonight_gpu.cl" + ; std::string source_code(cryptonightCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); - source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); @@ -840,7 +833,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) for(int i = 0; i < num_gpus; ++i) { - printer::inst()->print_msg(LDEBUG,"OpenCL Init device %d", ctx[i].deviceIdx); + printer::inst()->print_msg(LDEBUG, "OpenCL Init device %d", ctx[i].deviceIdx); const size_t devIdx = ctx[i].deviceIdx; if(interleaveData.size() <= devIdx) { @@ -850,12 +843,11 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) { interleaveData[devIdx].reset(new InterleaveData{}); interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms(); - } - ctx[i].idWorkerOnDevice=interleaveData[devIdx]->numThreadsOnGPU; + ctx[i].idWorkerOnDevice = interleaveData[devIdx]->numThreadsOnGPU; ++interleaveData[devIdx]->numThreadsOnGPU; ctx[i].interleaveData = interleaveData[devIdx]; - ctx[i].interleaveData->adjustThreshold = static_cast(ctx[i].interleave)/100.0; + ctx[i].interleaveData->adjustThreshold = static_cast(ctx[i].interleave) / 100.0; ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold; ctx[i].opencl_ctx = opencl_ctx; @@ -871,7 +863,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, const xmrstak_algo& 
miner_algo, uint64_t height) { - auto & Kernels = ctx->Kernels[miner_algo.Id()]; + auto& Kernels = ctx->Kernels[miner_algo.Id()]; cl_int ret; @@ -885,35 +877,35 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 128, input, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret)); return ERR_OCL_API; } if((ret = clSetKernelArg(Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // Scratchpads if((ret = clSetKernelArg(Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // States if((ret = clSetKernelArg(Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Threads if((ret = clSetKernelArg(Kernels[0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret)); - return(ERR_OCL_API); + printer::inst()->print_msg(L1, "Error %s when 
calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret)); + return (ERR_OCL_API); } if(miner_algo == cryptonight_gpu) @@ -922,80 +914,88 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // Scratchpads if((ret = clSetKernelArg(Kernels[7], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // States if((ret = clSetKernelArg(Kernels[7], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); return ERR_OCL_API; } } - // CN1 Kernel + // CN1 Kernel - if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) { + if((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) + { - uint32_t PRECOMPILATION_DEPTH = 4; + uint32_t PRECOMPILATION_DEPTH = 1; + constexpr uint64_t height_chunk_size = 25; + uint64_t height_offset = (height / height_chunk_size) * height_chunk_size; - // Get new kernel - cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height, PRECOMPILATION_DEPTH); + // Get new kernel + cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset, height_chunk_size, PRECOMPILATION_DEPTH); - if (program != ctx->ProgramCryptonightR) { - cl_int ret; - cl_kernel kernel = clCreateKernel(program, "cn1_cryptonight_r", &ret); + if(program != ctx->ProgramCryptonightR || ctx->last_block_height != height) + { + cl_int ret; + std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height); + cl_kernel kernel = clCreateKernel(program, kernel_name.c_str(), 
&ret); - if (ret != CL_SUCCESS) { - printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret)); - } - else + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret)); + } + else { - cl_kernel old_kernel = Kernels[1]; + cl_kernel old_kernel = Kernels[1]; if(old_kernel) clReleaseKernel(old_kernel); - Kernels[1] = kernel; - } - ctx->ProgramCryptonightR = program; + Kernels[1] = kernel; + } + ctx->ProgramCryptonightR = program; + ctx->last_block_height = height; + printer::inst()->print_msg(LDEBUG, "Set height %llu", height); - // Precompile next program in background - for (int i = 1; i <= PRECOMPILATION_DEPTH; ++i) - xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, PRECOMPILATION_DEPTH, true); + // Precompile next program in background + for(int i = 1; i <= PRECOMPILATION_DEPTH; ++i) + xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset + i * height_chunk_size, height_chunk_size, PRECOMPILATION_DEPTH, true); - printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx); - } + printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx); + } else { printer::inst()->print_msg(LDEBUG, "Thread #%zu found CryptonightR", ctx->deviceIdx); } - } + } // Scratchpads if((ret = clSetKernelArg(Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States if((ret = clSetKernelArg(Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s 
when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Threads if((ret = clSetKernelArg(Kernels[1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret)); - return(ERR_OCL_API); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret)); + return (ERR_OCL_API); } if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon || miner_algo == cryptonight_ipbc || miner_algo == cryptonight_stellite || miner_algo == cryptonight_masari || miner_algo == cryptonight_bittube2) { // Input - if ((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 4(input buffer).", err_to_str(ret)); return ERR_OCL_API; @@ -1006,14 +1006,14 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // Scratchpads if((ret = clSetKernelArg(Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States if((ret = clSetKernelArg(Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); return ERR_OCL_API; } @@ -1022,59 +1022,59 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // Output if((ret 
= clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2); return ERR_OCL_API; } // Target if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3); return ERR_OCL_API; } // Threads if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); - return(ERR_OCL_API); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); + return (ERR_OCL_API); } } else - { + { // Branch 0 if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Branch 1 if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); return ERR_OCL_API; } // Branch 2 if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s 
when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); return ERR_OCL_API; } // Branch 3 if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); return ERR_OCL_API; } // Threads if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); - return(ERR_OCL_API); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); + return (ERR_OCL_API); } for(int i = 0; i < 4; ++i) @@ -1082,35 +1082,35 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // States if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); return ERR_OCL_API; } // Nonce buffer if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); return ERR_OCL_API; } // Output if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) { - 
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); return ERR_OCL_API; } // Target if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); return ERR_OCL_API; } if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); - return(ERR_OCL_API); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + return (ERR_OCL_API); } } } @@ -1134,7 +1134,7 @@ uint64_t updateTimings(GpuContext* ctx, const uint64_t t) if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0) ctx->interleaveData->avgKernelRuntime = runtime; else - ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime) * averagingBias; + ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime)*averagingBias; } return runtime; } @@ -1163,7 +1163,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) if((dt > 0) && (dt < optimalTimeOffset)) { - delay = static_cast((optimalTimeOffset - dt)); + delay = static_cast((optimalTimeOffset - dt)); if(enableAutoAdjustment) { @@ -1182,8 +1182,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) // avoid that the auto adjustment is disable 
interleaving ctx->interleaveData->adjustThreshold = std::max( ctx->interleaveData->adjustThreshold, - 0.001 - ); + 0.001); } delay = std::max(int64_t(0), delay); @@ -1194,13 +1193,12 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) { // do not notify the user anymore if we reach a good delay if(delay > maxDelay) - printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf", + printer::inst()->print_msg(L1, "OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf", ctx->deviceIdx, ctx->idWorkerOnDevice, static_cast(delay), avgRuntime, - ctx->interleaveData->adjustThreshold * 100. - ); + ctx->interleaveData->adjustThreshold * 100.); std::this_thread::sleep_for(std::chrono::milliseconds(delay)); } @@ -1211,12 +1209,12 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner_algo) { - const auto & Kernels = ctx->Kernels[miner_algo.Id()]; + const auto& Kernels = ctx->Kernels[miner_algo.Id()]; cl_int ret; cl_uint zero = 0; size_t BranchNonces[4]; - memset(BranchNonces,0,sizeof(size_t)*4); + memset(BranchNonces, 0, sizeof(size_t) * 4); size_t g_intensity = ctx->rawIntensity; size_t w_size = ctx->workSize; @@ -1227,28 +1225,28 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner // round up to next multiple of w_size g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size; // number of global threads must be a multiple of the work group size (w_size) - assert(g_thd%w_size == 0); + assert(g_thd % w_size == 0); } for(int i = 2; i < 6; ++i) { if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->ExtraBuffers[i], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to zero branch buffer counter %d.", err_to_str(ret), i - 2); + printer::inst()->print_msg(L1, "Error %s when calling 
clEnqueueWriteBuffer to zero branch buffer counter %d.", err_to_str(ret), i - 2); return ERR_OCL_API; } } if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, sizeof(cl_uint) * 0xFF, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fetch results.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueWriteBuffer to fetch results.", err_to_str(ret)); return ERR_OCL_API; } - size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 }; + size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = {g_thd, 8}, lthreads[2] = {8, 8}; if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); return ERR_OCL_API; } @@ -1260,7 +1258,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner size_t intens = g_intensity * thd; if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[7], 1, 0, &intens, &thd, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7); return ERR_OCL_API; } @@ -1269,7 +1267,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, 0, &g_thd_cn_gpu, &w_size_cn_gpu, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + printer::inst()->print_msg(L1, "Error %s when calling 
clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); return ERR_OCL_API; } } @@ -1277,25 +1275,25 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner { if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); return ERR_OCL_API; } } - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + size_t NonceT[2] = {0, ctx->Nonce}, gthreadsT[2] = {8, g_thd}, lthreadsT[2] = {8 , w_size}; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, NonceT, gthreadsT, lthreadsT, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); - return ERR_OCL_API; + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); + return ERR_OCL_API; } if(miner_algo != cryptonight_gpu) { for(int i = 0; i < 4; ++i) { - size_t tmpNonce = ctx->Nonce; if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); return ERR_OCL_API; } } @@ -1304,11 +1302,11 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner // this call is blocking therefore the access to the results without cl_finish is fine if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 
0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); return ERR_OCL_API; } - auto & numHashValues = HashOutput[0xFF]; + auto& numHashValues = HashOutput[0xFF]; // avoid out of memory read, we have only storage for 0xFF results if(numHashValues > 0xFF) numHashValues = 0xFF; diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index ae2b506db..1ba300c7a 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -1,7 +1,7 @@ #pragma once -#include "xmrstak/misc/console.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/console.hpp" #if defined(__APPLE__) #include @@ -9,13 +9,13 @@ #include #endif +#include +#include +#include +#include #include #include #include -#include -#include -#include -#include #define ERR_SUCCESS (0) #define ERR_OCL_API (2) @@ -23,13 +23,13 @@ struct InterleaveData { - std::mutex mutex; + std::mutex mutex; - double adjustThreshold = 0.4; - double startAdjustThreshold = 0.4; - double avgKernelRuntime = 0.0; - uint64_t lastRunTimeStamp = 0; - uint32_t numThreadsOnGPU = 0; + double adjustThreshold = 0.4; + double startAdjustThreshold = 0.4; + double avgKernelRuntime = 0.0; + uint64_t lastRunTimeStamp = 0; + uint32_t numThreadsOnGPU = 0; }; struct GpuContext @@ -54,8 +54,9 @@ struct GpuContext cl_mem ExtraBuffers[6]; cl_context opencl_ctx = nullptr; std::map Program; - std::map> Kernels; + std::map> Kernels; cl_program ProgramCryptonightR = nullptr; + uint64_t last_block_height = 0u; size_t freeMem; size_t maxMemPerAlloc; int computeUnits; @@ -66,148 +67,147 @@ struct GpuContext uint64_t lastDelay = 0; uint32_t Nonce; - }; namespace { - const char* err_to_str(cl_int ret) +const char* err_to_str(cl_int ret) +{ + switch(ret) { - switch(ret) - { - case 
CL_SUCCESS: - return "CL_SUCCESS"; - case CL_DEVICE_NOT_FOUND: - return "CL_DEVICE_NOT_FOUND"; - case CL_DEVICE_NOT_AVAILABLE: - return "CL_DEVICE_NOT_AVAILABLE"; - case CL_COMPILER_NOT_AVAILABLE: - return "CL_COMPILER_NOT_AVAILABLE"; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; - case CL_OUT_OF_RESOURCES: - return "CL_OUT_OF_RESOURCES"; - case CL_OUT_OF_HOST_MEMORY: - return "CL_OUT_OF_HOST_MEMORY"; - case CL_PROFILING_INFO_NOT_AVAILABLE: - return "CL_PROFILING_INFO_NOT_AVAILABLE"; - case CL_MEM_COPY_OVERLAP: - return "CL_MEM_COPY_OVERLAP"; - case CL_IMAGE_FORMAT_MISMATCH: - return "CL_IMAGE_FORMAT_MISMATCH"; - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; - case CL_BUILD_PROGRAM_FAILURE: - return "CL_BUILD_PROGRAM_FAILURE"; - case CL_MAP_FAILURE: - return "CL_MAP_FAILURE"; - case CL_MISALIGNED_SUB_BUFFER_OFFSET: - return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; - case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: - return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; - #ifdef CL_VERSION_1_2 - case CL_COMPILE_PROGRAM_FAILURE: - return "CL_COMPILE_PROGRAM_FAILURE"; - case CL_LINKER_NOT_AVAILABLE: - return "CL_LINKER_NOT_AVAILABLE"; - case CL_LINK_PROGRAM_FAILURE: - return "CL_LINK_PROGRAM_FAILURE"; - case CL_DEVICE_PARTITION_FAILED: - return "CL_DEVICE_PARTITION_FAILED"; - case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: - return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; - #endif - case CL_INVALID_VALUE: - return "CL_INVALID_VALUE"; - case CL_INVALID_DEVICE_TYPE: - return "CL_INVALID_DEVICE_TYPE"; - case CL_INVALID_PLATFORM: - return "CL_INVALID_PLATFORM"; - case CL_INVALID_DEVICE: - return "CL_INVALID_DEVICE"; - case CL_INVALID_CONTEXT: - return "CL_INVALID_CONTEXT"; - case CL_INVALID_QUEUE_PROPERTIES: - return "CL_INVALID_QUEUE_PROPERTIES"; - case CL_INVALID_COMMAND_QUEUE: - return "CL_INVALID_COMMAND_QUEUE"; - case CL_INVALID_HOST_PTR: - return "CL_INVALID_HOST_PTR"; - case CL_INVALID_MEM_OBJECT: - return 
"CL_INVALID_MEM_OBJECT"; - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case CL_INVALID_IMAGE_SIZE: - return "CL_INVALID_IMAGE_SIZE"; - case CL_INVALID_SAMPLER: - return "CL_INVALID_SAMPLER"; - case CL_INVALID_BINARY: - return "CL_INVALID_BINARY"; - case CL_INVALID_BUILD_OPTIONS: - return "CL_INVALID_BUILD_OPTIONS"; - case CL_INVALID_PROGRAM: - return "CL_INVALID_PROGRAM"; - case CL_INVALID_PROGRAM_EXECUTABLE: - return "CL_INVALID_PROGRAM_EXECUTABLE"; - case CL_INVALID_KERNEL_NAME: - return "CL_INVALID_KERNEL_NAME"; - case CL_INVALID_KERNEL_DEFINITION: - return "CL_INVALID_KERNEL_DEFINITION"; - case CL_INVALID_KERNEL: - return "CL_INVALID_KERNEL"; - case CL_INVALID_ARG_INDEX: - return "CL_INVALID_ARG_INDEX"; - case CL_INVALID_ARG_VALUE: - return "CL_INVALID_ARG_VALUE"; - case CL_INVALID_ARG_SIZE: - return "CL_INVALID_ARG_SIZE"; - case CL_INVALID_KERNEL_ARGS: - return "CL_INVALID_KERNEL_ARGS"; - case CL_INVALID_WORK_DIMENSION: - return "CL_INVALID_WORK_DIMENSION"; - case CL_INVALID_WORK_GROUP_SIZE: - return "CL_INVALID_WORK_GROUP_SIZE"; - case CL_INVALID_WORK_ITEM_SIZE: - return "CL_INVALID_WORK_ITEM_SIZE"; - case CL_INVALID_GLOBAL_OFFSET: - return "CL_INVALID_GLOBAL_OFFSET"; - case CL_INVALID_EVENT_WAIT_LIST: - return "CL_INVALID_EVENT_WAIT_LIST"; - case CL_INVALID_EVENT: - return "CL_INVALID_EVENT"; - case CL_INVALID_OPERATION: - return "CL_INVALID_OPERATION"; - case CL_INVALID_GL_OBJECT: - return "CL_INVALID_GL_OBJECT"; - case CL_INVALID_BUFFER_SIZE: - return "CL_INVALID_BUFFER_SIZE"; - case CL_INVALID_MIP_LEVEL: - return "CL_INVALID_MIP_LEVEL"; - case CL_INVALID_GLOBAL_WORK_SIZE: - return "CL_INVALID_GLOBAL_WORK_SIZE"; - case CL_INVALID_PROPERTY: - return "CL_INVALID_PROPERTY"; - #ifdef CL_VERSION_1_2 - case CL_INVALID_IMAGE_DESCRIPTOR: - return "CL_INVALID_IMAGE_DESCRIPTOR"; - case CL_INVALID_COMPILER_OPTIONS: - return "CL_INVALID_COMPILER_OPTIONS"; - case CL_INVALID_LINKER_OPTIONS: - return 
"CL_INVALID_LINKER_OPTIONS"; - case CL_INVALID_DEVICE_PARTITION_COUNT: - return "CL_INVALID_DEVICE_PARTITION_COUNT"; - #endif - #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) - case CL_INVALID_PIPE_SIZE: - return "CL_INVALID_PIPE_SIZE"; - case CL_INVALID_DEVICE_QUEUE: - return "CL_INVALID_DEVICE_QUEUE"; - #endif - default: - return "UNKNOWN_ERROR"; - } + case CL_SUCCESS: + return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: + return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: + return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: + return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: + return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: + return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: + return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: + return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: + return "CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: + return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: + return "CL_MAP_FAILURE"; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; +#ifdef CL_VERSION_1_2 + case CL_COMPILE_PROGRAM_FAILURE: + return "CL_COMPILE_PROGRAM_FAILURE"; + case CL_LINKER_NOT_AVAILABLE: + return "CL_LINKER_NOT_AVAILABLE"; + case CL_LINK_PROGRAM_FAILURE: + return "CL_LINK_PROGRAM_FAILURE"; + case CL_DEVICE_PARTITION_FAILED: + return "CL_DEVICE_PARTITION_FAILED"; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; +#endif + case CL_INVALID_VALUE: + return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: + return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: + return 
"CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: + return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: + return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: + return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: + return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_HOST_PTR: + return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: + return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: + return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: + return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: + return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: + return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: + return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: + return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: + return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: + return "CL_INVALID_KERNEL_DEFINITION"; + case CL_INVALID_KERNEL: + return "CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: + return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: + return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: + return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: + return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: + return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: + return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: + return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: + return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: + return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: + return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: + return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: + return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: + return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: + return 
"CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: + return "CL_INVALID_GLOBAL_WORK_SIZE"; + case CL_INVALID_PROPERTY: + return "CL_INVALID_PROPERTY"; +#ifdef CL_VERSION_1_2 + case CL_INVALID_IMAGE_DESCRIPTOR: + return "CL_INVALID_IMAGE_DESCRIPTOR"; + case CL_INVALID_COMPILER_OPTIONS: + return "CL_INVALID_COMPILER_OPTIONS"; + case CL_INVALID_LINKER_OPTIONS: + return "CL_INVALID_LINKER_OPTIONS"; + case CL_INVALID_DEVICE_PARTITION_COUNT: + return "CL_INVALID_DEVICE_PARTITION_COUNT"; +#endif +#if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) + case CL_INVALID_PIPE_SIZE: + return "CL_INVALID_PIPE_SIZE"; + case CL_INVALID_DEVICE_QUEUE: + return "CL_INVALID_DEVICE_QUEUE"; +#endif + default: + return "UNKNOWN_ERROR"; } } +} // namespace uint32_t getNumPlatforms(); int getAMDPlatformIdx(); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 12478aefb..d17b79215 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -198,7 +198,7 @@ inline void keccakf1600_1(ulong st[25]) } } )===" -R"===( + R"===( void keccakf1600_2(__local ulong *st) { @@ -372,7 +372,7 @@ inline int4 _mm_alignr_epi8(int4 a, const uint rot) #endif )===" -R"===( + R"===( void CNKeccak(ulong *output, ulong *input) { @@ -416,7 +416,7 @@ void AESExpandKey256(uint *keybuf) } )===" -R"===( + R"===( #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)] @@ -577,7 +577,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, } )===" -R"===( + R"===( // __NV_CL_C_VERSION checks if NVIDIA opencl is used #if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION)) @@ -868,8 +868,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states )===" R"===( +#if defined(__clang__) +# if 
__has_builtin(__builtin_amdgcn_ds_bpermute) +# define HAS_AMD_BPERMUTE 1 +# endif +#endif -__attribute__((reqd_work_group_size(8, 8, 1))) +__attribute__((reqd_work_group_size(8, WORKSIZE, 1))) __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, #if (ALGO == cryptonight_gpu) @@ -878,88 +883,123 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads) #endif { - __local uint AES0[256], AES1[256], AES2[256], AES3[256]; - uint ExpandedKey2[40]; - uint4 text; + __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + uint ExpandedKey2[40]; + uint4 text; - const uint gIdx = getIdx(); + uint gIdx = get_global_id(1) - get_global_offset(1); + uint groupIdx = get_local_id(1); + uint lIdx = get_local_id(0); - for (int i = get_local_id(1) * 8 + get_local_id(0); i < 256; i += 8 * 8) { - const uint tmp = AES0_C[i]; - AES0[i] = tmp; - AES1[i] = rotate(tmp, 8U); - AES2[i] = rotate(tmp, 16U); - AES3[i] = rotate(tmp, 24U); - } + for (int i = groupIdx * 8 + lIdx; i < 256; i += get_local_size(0) * get_local_size(1)) { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } - barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) - __local uint4 xin1[8][8]; - __local uint4 xin2[8][8]; + __local uint4 xin1[WORKSIZE][8]; + __local uint4 xin2[WORKSIZE][8]; #endif #if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) + // do not use early return here + if(gIdx < Threads) #endif - { - states += 25 * gIdx; + { + states += 25 * gIdx; #if(STRIDED_INDEX==0) - Scratchpad += gIdx * (MEMORY >> 4); + Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) - Scratchpad += 
gIdx; + Scratchpad += gIdx; #elif(STRIDED_INDEX==2) - Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); #elif(STRIDED_INDEX==3) - Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif - #if defined(__Tahiti__) || defined(__Pitcairn__) + #if defined(__Tahiti__) || defined(__Pitcairn__) - for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4]; - text = vload4(get_local_id(1) + 4, (__global uint *)states); + for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4]; + text = vload4(lIdx + 4, (__global uint *)states); - #else + #else + text = vload4(lIdx + 4, (__global uint *)states); + ((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states); - text = vload4(get_local_id(1) + 4, (__global uint *)states); - ((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states); + #endif - #endif + AESExpandKey256(ExpandedKey2); + } - AESExpandKey256(ExpandedKey2); - } - - barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) - __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)]; - __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)]; - __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)]; - __local uint4* xin2_load = &xin2[(get_local_id(1) + 1) % 8][get_local_id(0)]; - *xin2_store = (uint4)(0, 0, 0, 0); +# if (HAS_AMD_BPERMUTE == 1) + int lane = (groupIdx * 8 + ((lIdx + 1) % 8)) << 2; + uint4 tmp = (uint4)(0, 0, 0, 0); +# else + __local uint4* xin1_store = &xin1[groupIdx][lIdx]; + __local uint4* xin1_load = &xin1[groupIdx][(lIdx + 1) % 8]; + __local uint4* xin2_store = 
&xin2[groupIdx][lIdx]; + __local uint4* xin2_load = &xin2[groupIdx][(lIdx + 1) % 8]; + *xin2_store = (uint4)(0, 0, 0, 0); +# endif #endif #if(COMP_MODE == 1) - // do not use early return here - if (gIdx < Threads) + // do not use early return here + if (gIdx < Threads) #endif - { + { #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + +# if (HAS_AMD_BPERMUTE == 1) + #pragma unroll 2 + for(int i = 0, i1 = lIdx; i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4)) + { + text ^= Scratchpad[IDX((uint)i1)]; + text ^= tmp; + + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + + text.s0 ^= __builtin_amdgcn_ds_bpermute(lane, text.s0); + text.s1 ^= __builtin_amdgcn_ds_bpermute(lane, text.s1); + text.s2 ^= __builtin_amdgcn_ds_bpermute(lane, text.s2); + text.s3 ^= __builtin_amdgcn_ds_bpermute(lane, text.s3); + //__builtin_amdgcn_s_waitcnt(0); + text ^= Scratchpad[IDX((uint)i1 + 8u)]; + + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + tmp.s0 = __builtin_amdgcn_ds_bpermute(lane, text.s0); + tmp.s1 = __builtin_amdgcn_ds_bpermute(lane, text.s1); + tmp.s2 = __builtin_amdgcn_ds_bpermute(lane, text.s2); + tmp.s3 = __builtin_amdgcn_ds_bpermute(lane, text.s3); + //__builtin_amdgcn_s_waitcnt(0); + } + + text ^= tmp; +# else + #pragma unroll 2 - for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4)) + for(int i = 0, i1 = lIdx; i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4)) { text ^= Scratchpad[IDX((uint)i1)]; barrier(CLK_LOCAL_MEM_FENCE); text ^= *xin2_load; - #pragma unroll 10 for(int j = 0; j < 10; ++j) text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - *xin1_store = text; - text ^= Scratchpad[IDX((uint)i1 + 8u)]; barrier(CLK_LOCAL_MEM_FENCE); text ^= 
*xin1_load; @@ -971,87 +1011,96 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states *xin2_store = text; } - barrier(CLK_LOCAL_MEM_FENCE); - text ^= *xin2_load; + barrier(CLK_LOCAL_MEM_FENCE); + text ^= *xin2_load; +# endif #else - #pragma unroll 2 - for (int i = 0; i < (MEMORY >> 7); ++i) { - text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))]; - - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - } + #pragma unroll 2 + for (int i = 0; i < (MEMORY >> 7); ++i) + { + text ^= Scratchpad[IDX((uint)((i << 3) + lIdx))]; + + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + } #endif - } + } #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) - /* Also left over threads performe this loop. - * The left over thread results will be ignored - */ - #pragma unroll 16 - for(size_t i = 0; i < 16; i++) - { - #pragma unroll 10 - for (int j = 0; j < 10; ++j) { - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - } - - barrier(CLK_LOCAL_MEM_FENCE); - *xin1_store = text; - barrier(CLK_LOCAL_MEM_FENCE); - text ^= *xin1_load; - } + /* Also left over threads performe this loop. 
+ * The left over thread results will be ignored + */ + #pragma unroll 16 + for(size_t i = 0; i < 16; i++) + { + #pragma unroll 10 + for (int j = 0; j < 10; ++j) { + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + } +#if (HAS_AMD_BPERMUTE == 1) + text.s0 ^= __builtin_amdgcn_ds_bpermute(lane, text.s0); + text.s1 ^= __builtin_amdgcn_ds_bpermute(lane, text.s1); + text.s2 ^= __builtin_amdgcn_ds_bpermute(lane, text.s2); + text.s3 ^= __builtin_amdgcn_ds_bpermute(lane, text.s3); + //__builtin_amdgcn_s_waitcnt(0); +#else + barrier(CLK_LOCAL_MEM_FENCE); + *xin1_store = text; + barrier(CLK_LOCAL_MEM_FENCE); + text ^= *xin1_load; +#endif + } #endif - __local ulong State_buf[8 * 25]; + __local ulong State_buf[8 * 25]; #if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) + // do not use early return here + if(gIdx < Threads) #endif - { - vstore2(as_ulong2(text), get_local_id(1) + 4, states); - } + { + vstore2(as_ulong2(text), lIdx + 4, states); + } - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); #if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) + // do not use early return here + if(gIdx < Threads) #endif - { - if(!get_local_id(1)) - { - __local ulong* State = State_buf + get_local_id(0) * 25; + { + if(!lIdx) + { + __local ulong* State = State_buf + groupIdx * 25; - for(int i = 0; i < 25; ++i) State[i] = states[i]; + for(int i = 0; i < 25; ++i) State[i] = states[i]; - keccakf1600_2(State); + keccakf1600_2(State); #if (ALGO == cryptonight_gpu) if(State[3] <= Target) { ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) - output[outIdx] = get_global_id(0); + output[outIdx] = get_global_id(1); } #else - for(int i = 0; i < 25; ++i) states[i] = State[i]; + for(int i = 0; i < 25; ++i) states[i] = State[i]; - uint StateSwitch = State[0] & 3; - __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1; - __global uint *destinationBranch2 = StateSwitch == 2 ? 
Branch2 : Branch3; - __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2; - destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx; + uint StateSwitch = State[0] & 3; + __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1; + __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3; + __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2; + destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx; #endif - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); + } + } + mem_fence(CLK_GLOBAL_MEM_FENCE); } )===" -R"===( + R"===( #define VSWAP8(x) (((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \ | (((x) >> 8) & 0x00000000FF000000UL) | (((x) << 8) & 0x000000FF00000000UL) \ @@ -1307,7 +1356,42 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global #endif ulong H[8], M[8]; - for (uint i = 0; i < 3; ++i) { + // BUG: AMD driver 19.7.X crashs if this is written as loop + // Thx AMD for so bad software + uint i = 0; + { + ((ulong8 *)M)[0] = vload8(i, states); + + for (uint x = 0; x < 8; ++x) { + H[x] = M[x] ^ State[x]; + } + + PERM_SMALL_P(H); + PERM_SMALL_Q(M); + + for (uint x = 0; x < 8; ++x) + { + State[x] ^= H[x] ^ M[x]; + } + } + i = 1; + { + ((ulong8 *)M)[0] = vload8(i, states); + + for (uint x = 0; x < 8; ++x) { + H[x] = M[x] ^ State[x]; + } + + PERM_SMALL_P(H); + PERM_SMALL_Q(M); + + for (uint x = 0; x < 8; ++x) + { + State[x] ^= H[x] ^ M[x]; + } + } + i = 2; + { ((ulong8 *)M)[0] = vload8(i, states); for (uint x = 0; x < 8; ++x) { diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl index e87819760..bb37581f2 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl @@ -84,7 +84,7 @@ inline void single_comupte_wrap(const uint rot, int4 v0, 
int4 v1, int4 v2, int4 } )===" -R"===( + R"===( static const __constant uint look[16][4] = { {0, 1, 2, 3}, @@ -220,7 +220,7 @@ __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, u } )===" -R"===( + R"===( static const __constant uint skip[3] = { 20,22,22 diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl similarity index 88% rename from xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl rename to xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl index 9edb774ad..cdb5aef3e 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl @@ -1,4 +1,5 @@ R"===( + /* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,29 +16,15 @@ R"===( * */ -#define cryptonight_r_wow 15 -#define cryptonight_r 16 - -#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT) - -#if(STRIDED_INDEX==0) -# define IDX(x) (x) -#elif(STRIDED_INDEX==1) -# define IDX(x) (mul24(((uint)(x)), Threads)) -#elif(STRIDED_INDEX==2) -# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) -#elif(STRIDED_INDEX==3) -# define IDX(x) ((x) * WORKSIZE) -#endif - +#ifndef SCRATCHPAD_CHUNK // __NV_CL_C_VERSION checks if NVIDIA opencl is used -#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) -# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4)))) -# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) -#else -# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)]) +# if((ALGO == cryptonight_r_wow || ALGO == cryptonight_r) && defined(__NV_CL_C_VERSION)) +# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4)))) +# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + 
(IDX((idx0 & 0x1FFFC0U) >> 4))))) +# else +# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)]) +# endif #endif - __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *states, uint Threads) { @@ -162,7 +149,9 @@ __kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *stat #endif #define ROT_BITS 32 - XMRSTAK_INCLUDE_RANDOM_MATH +XMRSTAK_INCLUDE_RANDOM_MATH + +#undef ROT_BITS #if (ALGO == cryptonight_r) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl new file mode 100644 index 000000000..2c318fcbf --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl @@ -0,0 +1,33 @@ +R"===( +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + */ + +#define cryptonight_r_wow 15 +#define cryptonight_r 16 + +#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT) + +#if(STRIDED_INDEX==0) +# define IDX(x) (x) +#elif(STRIDED_INDEX==1) +# define IDX(x) (mul24(((uint)(x)), Threads)) +#elif(STRIDED_INDEX==2) +# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) +#elif(STRIDED_INDEX==3) +# define IDX(x) ((x) * WORKSIZE) +#endif + +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl index 22603853f..02ce53e03 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl @@ -125,7 +125,7 @@ static const __constant ulong T0_G[] = }; )===" -R"===( + R"===( static const __constant ulong T4_G[] = { @@ -292,4 +292,3 @@ static const __constant ulong T4_G[] = } while (0) )===" - diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl index 73ef90882..17abc3bc8 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl @@ -90,8 +90,85 @@ ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s, con ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t) { - #pragma unroll - for(int i = 0; i < 18; ++i) + // BUG: AMD driver 19.7.X crashs if this is written as loop + // Thx AMD for so bad software + int i = 0; + { + p = SkeinEvenRound(p, h, t, 0U, i); + ++i; + ulong tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinOddRound(p, h, t, 1U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinEvenRound(p, h, t, 2U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinOddRound(p, h, t, 0U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + 
h8 = tmp; + p = SkeinEvenRound(p, h, t, 1U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinOddRound(p, h, t, 2U, i); + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + ++i; + } + { + p = SkeinEvenRound(p, h, t, 0U, i); + ++i; + ulong tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinOddRound(p, h, t, 1U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinEvenRound(p, h, t, 2U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinOddRound(p, h, t, 0U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinEvenRound(p, h, t, 1U, i); + ++i; + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + p = SkeinOddRound(p, h, t, 2U, i); + tmp = h.s0; + h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0)); + h.s7 = h8; + h8 = tmp; + ++i; + } { p = SkeinEvenRound(p, h, t, 0U, i); ++i; @@ -129,7 +206,6 @@ ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t) h.s7 = h8; h8 = tmp; } - p += h; p.s5 += t[0]; p.s6 += t[1]; diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 120fb6898..075acbd49 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -5,18 +5,18 @@ #include "autoAdjust.hpp" #include "jconf.hpp" -#include "xmrstak/misc/console.hpp" -#include "xmrstak/misc/configEditor.hpp" -#include "xmrstak/params.hpp" #include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/params.hpp" -#include +#include #include +#include #include #include -#include -#include +#include #if defined(__APPLE__) #include @@ -24,7 +24,6 @@ 
#include #endif - namespace xmrstak { namespace amd @@ -32,11 +31,9 @@ namespace amd class autoAdjust { -public: - + public: autoAdjust() { - } /** print the adjusted values if needed @@ -50,18 +47,17 @@ class autoAdjust if(platformIndex == -1) { - printer::inst()->print_msg(L0,"WARNING: No AMD OpenCL platform found. Possible driver issues or wrong vendor driver."); + printer::inst()->print_msg(L0, "WARNING: No AMD OpenCL platform found. Possible driver issues or wrong vendor driver."); return false; } devVec = getAMDDevices(platformIndex); - int deviceCount = devVec.size(); if(deviceCount == 0) { - printer::inst()->print_msg(L0,"WARNING: No AMD device found."); + printer::inst()->print_msg(L0, "WARNING: No AMD device found."); return false; } @@ -69,17 +65,16 @@ class autoAdjust return true; } -private: - + private: void generateThreadConfig(const int platformIndex) { // load the template of the backend config into a char variable - const char *tpl = - #include "./config.tpl" - ; + const char* tpl = +#include "./config.tpl" + ; configEditor configTpl{}; - configTpl.set( std::string(tpl) ); + configTpl.set(std::string(tpl)); constexpr size_t byteToMiB = 1024u * 1024u; @@ -94,6 +89,42 @@ class autoAdjust std::string conf; for(auto& ctx : devVec) { + std::string enabledGpus = params::inst().amdGpus; + bool enabled = true; + if (!enabledGpus.empty()) + { + enabled = false; + std::stringstream ss(enabledGpus); + + int i = -1; + while (ss >> i) + { + if (i == ctx.deviceIdx) + { + enabled = true; + break; + } + + while (ss.peek() == ',' || ss.peek() == ' ') + ss.ignore(); + } + } + + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()); + + // true for all cryptonight_heavy derivates since we check the user and dev pool + bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), 
cryptonight_heavy) != neededAlgorithms.end(); + + // true for cryptonight_gpu as main user pool algorithm + bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu; + + bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r; + + bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow; + + // 8 threads per block (this is a good value for the most gpus) + uint32_t default_workSize = 8; size_t minFreeMem = 128u * byteToMiB; /* 1000 is a magic selected limit, the reason is that more than 2GiB memory * sowing down the memory performance because of TLB cache misses @@ -107,42 +138,31 @@ class autoAdjust // UNKNOWN ctx.name.compare("gfx900") == 0 || ctx.name.compare("gfx903") == 0 || - ctx.name.compare("gfx905") == 0 - ) + ctx.name.compare("gfx905") == 0 || + // Radeon VII + ctx.name.compare("gfx906") == 0 || + ctx.name.compare("Fiji") == 0) { /* Increase the number of threads for AMD VEGA gpus. 
* Limit the number of threads based on the issue: https://github.com/fireice-uk/xmr-stak/issues/5#issuecomment-339425089 * to avoid out of memory errors */ maxThreads = 2024u; + + if(useCryptonight_gpu) + default_workSize = 16u; } // NVIDIA optimizations if( - ctx.isNVIDIA && ( - ctx.name.find("P100") != std::string::npos || - ctx.name.find("V100") != std::string::npos - ) - ) + ctx.isNVIDIA && (ctx.name.find("P100") != std::string::npos || + ctx.name.find("V100") != std::string::npos)) { // do not limit the number of threads maxThreads = 40000u; minFreeMem = 512u * byteToMiB; } - // check if cryptonight_monero_v8 is selected for the user or dev pool - bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()); - - // true for all cryptonight_heavy derivates since we check the user and dev pool - bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end(); - - // true for cryptonight_gpu as main user pool algorithm - bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu; - - bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r; - - bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow; - // set strided index to default ctx.stridedIndex = 1; @@ -164,6 +184,7 @@ class autoAdjust } uint32_t numUnroll = 8; + uint32_t numThreads = 1u; if(useCryptonight_gpu) { @@ -171,7 +192,11 @@ class autoAdjust // @todo check again after all optimizations maxThreads = ctx.computeUnits * 6 * 8; ctx.stridedIndex = 0; - numUnroll = 1; + // do not change unroll for AMD RX5700 but set 2 threads per gpu + if(ctx.name.compare("gfx1010") == 0) + numThreads = 2; + else + numUnroll = 1; } // keep 128MiB memory free (value is randomly chosen) from 
the max available memory @@ -179,7 +204,6 @@ class autoAdjust size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem); - uint32_t numThreads = 1u; if(ctx.isAMD && !useCryptonight_gpu) { numThreads = 2; @@ -190,34 +214,42 @@ class autoAdjust // 240byte extra memory is used per thread for meta data size_t perThread = hashMemSize + 240u; size_t maxIntensity = memPerThread / perThread; - size_t possibleIntensity = std::min( maxThreads , maxIntensity ); - // map intensity to a multiple of the compute unit count, 8 is the number of threads per work group - size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8; - // in the case we use two threads per gpu we can be relax and need no multiple of the number of compute units - if(numThreads == 2) - intensity = (possibleIntensity / 8) * 8; + size_t possibleIntensity = std::min(maxThreads, maxIntensity); + // map intensity to a multiple of the compute unit count, default_workSize is the number of threads per work group + size_t intensity = (possibleIntensity / (default_workSize * ctx.computeUnits)) * ctx.computeUnits * default_workSize; + + size_t computeUnitUtilization = ((possibleIntensity * 100) / (default_workSize * ctx.computeUnits)) % 100; + // in the case we use two threads per gpu or if we can utilize over 75% of the compute units + // we can be relax and need no multiple of the number of compute units + if(numThreads == 2 || computeUnitUtilization >= 75) + intensity = (possibleIntensity / default_workSize) * default_workSize; //If the intensity is 0, then it's because the multiple of the unit count is greater than intensity - if (intensity == 0) + if(intensity == 0) { printer::inst()->print_msg(L0, "WARNING: Auto detected intensity unexpectedly low. 
Try to set the environment variable GPU_SINGLE_ALLOC_PERCENT."); intensity = possibleIntensity; - } - if (intensity != 0) + if(intensity != 0) { + if (!enabled) + conf += "/* Disabled\n"; + for(uint32_t thd = 0; thd < numThreads; ++thd) { conf += " // gpu: " + ctx.name + std::string(" compute units: ") + std::to_string(ctx.computeUnits) + "\n"; conf += " // memory:" + std::to_string(memPerThread / byteToMiB) + "|" + - std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n"; - // set 8 threads per block (this is a good value for the most gpus) + std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n"; conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + - " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" - " \"unroll\" : " + std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + - " },\n"; + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(default_workSize) + ",\n" + + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" + " \"unroll\" : " + + std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + + " },\n"; } + + if (!enabled) + conf += "*/\n"; } else { @@ -225,8 +257,8 @@ class autoAdjust } } - configTpl.replace("PLATFORMINDEX",std::to_string(platformIndex)); - configTpl.replace("GPUCONFIG",conf); + configTpl.replace("PLATFORMINDEX", std::to_string(platformIndex)); + configTpl.replace("GPUCONFIG", conf); configTpl.write(params::inst().configFileAMD); const 
std::string backendName = xmrstak::params::inst().openCLVendor; diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index d3dc00d01..c5a63c56f 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -21,10 +21,9 @@ * */ - #include "jconf.hpp" -#include "xmrstak/misc/jext.hpp" #include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/jext.hpp" #ifdef _WIN32 #define strcasecmp _stricmp @@ -37,7 +36,6 @@ #include #include - namespace xmrstak { namespace amd @@ -48,9 +46,14 @@ using namespace rapidjson; /* * This enum needs to match index in oConfigValues, otherwise we will get a runtime error */ -enum configEnum { aGpuThreadsConf, iPlatformIdx }; +enum configEnum +{ + aGpuThreadsConf, + iPlatformIdx +}; -struct configVal { +struct configVal +{ configEnum iName; const char* sName; Type iType; @@ -59,24 +62,25 @@ struct configVal { // Same order as in configEnum, as per comment above // kNullType means any type configVal oConfigValues[] = { - { aGpuThreadsConf, "gpu_threads_conf", kNullType }, - { iPlatformIdx, "platform_index", kNumberType } -}; - -constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); + {aGpuThreadsConf, "gpu_threads_conf", kNullType}, + {iPlatformIdx, "platform_index", kNumberType}}; +constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0])); -enum optionalConfigEnum { iAutoTune }; +enum optionalConfigEnum +{ + iAutoTune +}; -struct optionalConfigVal { +struct optionalConfigVal +{ optionalConfigEnum iName; const char* sName; Type iType; }; optionalConfigVal oOptionalConfigValues[] = { - { iAutoTune, "auto_tune", kNumberType } -}; + {iAutoTune, "auto_tune", kNumberType}}; inline bool checkType(Type have, Type want) { @@ -109,7 +113,7 @@ jconf::jconf() prv = new opaque_private(); } -bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) +bool jconf::GetThreadConfig(size_t id, thd_cfg& cfg) { if(id >= prv->configValues[aGpuThreadsConf]->Size()) return false; 
@@ -176,7 +180,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) return false; } - if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18 ) + if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18) { printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18"); return false; @@ -215,7 +219,7 @@ size_t jconf::GetPlatformIdx() size_t jconf::GetAutoTune() { const Value* value = GetObjectMember(prv->jsonDoc, oOptionalConfigValues[iAutoTune].sName); - if( value != nullptr && value->IsUint64()) + if(value != nullptr && value->IsUint64()) { return value->GetUint64(); } @@ -233,22 +237,22 @@ size_t jconf::GetThreadCount() bool jconf::parse_config(const char* sFilename) { - FILE * pFile; - char * buffer; + FILE* pFile; + char* buffer; size_t flen; pFile = fopen(sFilename, "rb"); - if (pFile == NULL) + if(pFile == NULL) { printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); return false; } - fseek(pFile,0,SEEK_END); + fseek(pFile, 0, SEEK_END); flen = ftell(pFile); rewind(pFile); - if(flen >= 64*1024) + if(flen >= 64 * 1024) { fclose(pFile); printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); @@ -262,7 +266,7 @@ bool jconf::parse_config(const char* sFilename) } buffer = (char*)malloc(flen + 3); - if(fread(buffer+1, flen, 1, pFile) != 1) + if(fread(buffer + 1, flen, 1, pFile) != 1) { free(buffer); fclose(pFile); @@ -284,7 +288,7 @@ bool jconf::parse_config(const char* sFilename) buffer[flen] = '}'; buffer[flen + 1] = '\0'; - prv->jsonDoc.Parse(buffer, flen+2); + prv->jsonDoc.Parse(buffer, flen + 2); free(buffer); if(prv->jsonDoc.HasParseError()) @@ -294,7 +298,6 @@ bool jconf::parse_config(const char* sFilename) return false; } - if(!prv->jsonDoc.IsObject()) { //This should never happen as we created the root ourselves printer::inst()->print_msg(L0, "Invalid config file '%s'. 
No root?", sFilename); @@ -326,7 +329,7 @@ bool jconf::parse_config(const char* sFilename) size_t n_thd = prv->configValues[aGpuThreadsConf]->Size(); thd_cfg c; - for(size_t i=0; i < n_thd; i++) + for(size_t i = 0; i < n_thd; i++) { if(!GetThreadConfig(i, c)) { diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index 51a0c79ac..6f50c3059 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -12,16 +12,18 @@ namespace amd class jconf { -public: + public: static jconf* inst() { - if (oInst == nullptr) oInst = new jconf; + if(oInst == nullptr) + oInst = new jconf; return oInst; }; bool parse_config(const char* sFilename = params::inst().configFileAMD.c_str()); - struct thd_cfg { + struct thd_cfg + { size_t index; size_t intensity; size_t w_size; @@ -34,18 +36,17 @@ class jconf }; size_t GetThreadCount(); - bool GetThreadConfig(size_t id, thd_cfg &cfg); + bool GetThreadConfig(size_t id, thd_cfg& cfg); size_t GetAutoTune(); size_t GetPlatformIdx(); -private: + private: jconf(); static jconf* oInst; struct opaque_private; opaque_private* prv; - }; } // namespace amd diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index 3be593175..3a65de8e2 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -22,23 +22,23 @@ */ #include "minethd.hpp" -#include "autoAdjust.hpp" #include "amd_gpu/gpu.hpp" +#include "autoAdjust.hpp" -#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" #include "xmrstak/backend/cpu/crypto/cryptonight.h" -#include "xmrstak/misc/configEditor.hpp" -#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" +#include "xmrstak/backend/cpu/hwlocMemory.hpp" #include "xmrstak/backend/cpu/minethd.hpp" #include "xmrstak/jconf.hpp" -#include "xmrstak/misc/executor.hpp" +#include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/misc/console.hpp" #include "xmrstak/misc/environment.hpp" +#include 
"xmrstak/misc/executor.hpp" #include "xmrstak/params.hpp" -#include "xmrstak/backend/cpu/hwlocMemory.hpp" #include -#include #include +#include #include #include @@ -53,6 +53,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th oWork = pWork; bQuit = 0; iThreadNo = (uint8_t)iNo; + this->iGpuIndex = cfg.index; iJobNo = 0; iHashCount = 0; iTimestamp = 0; @@ -72,15 +73,16 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th printer::inst()->print_msg(L1, "WARNING setting affinity failed."); } -extern "C" { +extern "C" +{ #ifdef WIN32 -__declspec(dllexport) + __declspec(dllexport) #endif -std::vector* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env) -{ - environment::inst(&env); - return amd::minethd::thread_starter(threadOffset, pWork); -} + std::vector* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env) + { + environment::inst(&env); + return amd::minethd::thread_starter(threadOffset, pWork); + } } // extern "C" bool minethd::init_gpus() @@ -137,7 +139,7 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor pvThreads->reserve(n); jconf::thd_cfg cfg; - for (i = 0; i < n; i++) + for(i = 0; i < n; i++) { jconf::inst()->GetThreadConfig(i, cfg); @@ -161,7 +163,6 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor return pvThreads; } - void minethd::work_main() { if(affinity >= 0) //-1 means no affinity @@ -169,10 +170,9 @@ void minethd::work_main() order_fix.set_value(); std::unique_lock lck(thd_aff_set); - lck.release(); + lck.unlock(); std::this_thread::yield(); - uint64_t iCount = 0; cryptonight_ctx* cpu_ctx; cpu_ctx = cpu::minethd::minethd_alloc_ctx(); @@ -204,16 +204,16 @@ void minethd::work_main() double bestHashrate = 0.0; uint32_t bestIntensity = pGpuCtx->maxRawIntensity; - while (bQuit == 0) + while(bQuit == 0) { - if (oWork.bStall) + if(oWork.bStall) { /* We are stalled here because the executor 
didn't find a job for us yet, * either because of network latency, or a socket problem. Since we are * raison d'etre of this software it us sensible to just wait until we have something */ - while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) std::this_thread::sleep_for(std::chrono::milliseconds(100)); globalStates::inst().consume_work(oWork, iJobNo); @@ -267,14 +267,14 @@ void minethd::work_main() uint64_t t0 = interleaveAdjustDelay(pGpuCtx, adjustInterleave); cl_uint results[0x100]; - memset(results,0,sizeof(cl_uint)*(0x100)); + memset(results, 0, sizeof(cl_uint) * (0x100)); XMRRunJob(pGpuCtx, results, miner_algo); for(size_t i = 0; i < results[0xFF]; i++) { - uint8_t bWorkBlob[128]; - uint8_t bResult[32]; + uint8_t bWorkBlob[128]; + uint8_t bResult[32]; memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); memset(bResult, 0, sizeof(job_result::bResult)); @@ -282,16 +282,13 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = results[i]; cpu_ctx->hash_fn(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo); - if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) + if((*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else executor::inst()->push_event(ex_event("AMD Invalid Result", pGpuCtx->deviceIdx, oWork.iPoolId)); } - iCount += pGpuCtx->rawIntensity; - uint64_t iStamp = get_timestamp_ms(); - iHashCount.store(iCount, std::memory_order_relaxed); - iTimestamp.store(iStamp, std::memory_order_relaxed); + updateStats(pGpuCtx->rawIntensity, oWork.iPoolId); accRuntime += updateTimings(pGpuCtx, t0); @@ -317,20 +314,18 @@ void minethd::work_main() // lock intensity to the best values autoTune = 0; pGpuCtx->rawIntensity = bestIntensity; - printer::inst()->print_msg(L1,"OpenCL %u|%u: lock intensity at %u", + 
printer::inst()->print_msg(L1, "OpenCL %u|%u: lock intensity at %u", pGpuCtx->deviceIdx, pGpuCtx->idWorkerOnDevice, - bestIntensity - ); + bestIntensity); } else { - printer::inst()->print_msg(L1,"OpenCL %u|%u: auto-tune validate intensity %u|%u", + printer::inst()->print_msg(L1, "OpenCL %u|%u: auto-tune validate intensity %u|%u", pGpuCtx->deviceIdx, pGpuCtx->idWorkerOnDevice, pGpuCtx->rawIntensity, - bestIntensity - ); + bestIntensity); } // update gpu with new intensity XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, cpu_ctx->cn_r_ctx.height); diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp index 402d63cd6..579abb1b5 100644 --- a/xmrstak/backend/amd/minethd.hpp +++ b/xmrstak/backend/amd/minethd.hpp @@ -3,27 +3,26 @@ #include "amd_gpu/gpu.hpp" #include "jconf.hpp" #include "xmrstak/backend/cpu/crypto/cryptonight.h" -#include "xmrstak/backend/miner_work.hpp" #include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/backend/miner_work.hpp" #include "xmrstak/misc/environment.hpp" -#include #include #include +#include namespace xmrstak { namespace amd { -class minethd : public iBackend +class minethd : public iBackend { -public: - + public: static std::vector* thread_starter(uint32_t threadOffset, miner_work& pWork); static bool init_gpus(); -private: + private: typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg); diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp index 0eea9fdd7..93a8fd9d6 100644 --- a/xmrstak/backend/backendConnector.cpp +++ b/xmrstak/backend/backendConnector.cpp @@ -21,31 +21,30 @@ * */ -#include "iBackend.hpp" #include "backendConnector.hpp" -#include "miner_work.hpp" #include "globalStates.hpp" +#include "iBackend.hpp" +#include "miner_work.hpp" #include "plugin.hpp" -#include "xmrstak/misc/environment.hpp" #include 
"xmrstak/misc/console.hpp" +#include "xmrstak/misc/environment.hpp" #include "xmrstak/params.hpp" #include "cpu/minethd.hpp" #ifndef CONF_NO_CUDA -# include "nvidia/minethd.hpp" +#include "nvidia/minethd.hpp" #endif #ifndef CONF_NO_OPENCL -# include "amd/minethd.hpp" +#include "amd/minethd.hpp" #endif -#include #include -#include +#include #include +#include +#include #include #include -#include - namespace xmrstak { @@ -82,31 +81,52 @@ std::vector* BackendConnector::thread_starter(miner_work& pWork) #ifndef CONF_NO_CUDA if(params::inst().useNVIDIA) { + bool disableNvidia = false; + plugin nvidiaplugin; - std::vector libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend_cuda9_2", "xmrstak_cuda_backend"}; +#ifdef XMRSTAK_DEV_RELEASE + std::vector libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend"}; +# ifndef _WIN32 + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + bool cn_r_derivate = + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end(); + + if(cn_r_derivate) + { + disableNvidia = true; + printer::inst()->print_msg(L0, "WARNING: The linux release binaries not support cryptonight_r derived coins for NVIDIA."); + } +# endif +#else + std::vector libNames = {"xmrstak_cuda_backend"}; +#endif size_t numWorkers = 0u; - for( const auto & name : libNames) + if(!disableNvidia) { - printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str()); - nvidiaplugin.load("NVIDIA", name); - std::vector* nvidiaThreads = nvidiaplugin.startBackend(static_cast(pvThreads->size()), pWork, environment::inst()); - if(nvidiaThreads != nullptr) - { - pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads)); - numWorkers = nvidiaThreads->size(); - delete nvidiaThreads; - } - else - { - // remove the plugin if we 
have found no GPUs - nvidiaplugin.unload(); - } - // we found at leat one working GPU - if(numWorkers != 0) + for(const auto& name : libNames) { - printer::inst()->print_msg(L0, "NVIDIA: use library '%s'", name.c_str()); - break; + printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str()); + nvidiaplugin.load("NVIDIA", name); + std::vector* nvidiaThreads = nvidiaplugin.startBackend(static_cast(pvThreads->size()), pWork, environment::inst()); + if(nvidiaThreads != nullptr) + { + pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads)); + numWorkers = nvidiaThreads->size(); + delete nvidiaThreads; + } + else + { + // remove the plugin if we have found no GPUs + nvidiaplugin.unload(); + } + // we found at leat one working GPU + if(numWorkers != 0) + { + printer::inst()->print_msg(L0, "NVIDIA: use library '%s'", name.c_str()); + break; + } } } if(numWorkers == 0) diff --git a/xmrstak/backend/backendConnector.hpp b/xmrstak/backend/backendConnector.hpp index 66d873e48..1f2cb8ff6 100644 --- a/xmrstak/backend/backendConnector.hpp +++ b/xmrstak/backend/backendConnector.hpp @@ -3,19 +3,18 @@ #include "iBackend.hpp" #include "miner_work.hpp" -#include -#include #include #include - +#include +#include namespace xmrstak { - struct BackendConnector - { - static std::vector* thread_starter(miner_work& pWork); - static bool self_test(); - }; +struct BackendConnector +{ + static std::vector* thread_starter(miner_work& pWork); + static bool self_test(); +}; } // namespace xmrstak diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index ba0e6984f..98c145004 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -2,12 +2,12 @@ #include "jconf.hpp" -#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/cpu/cpuType.hpp" +#include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" #include "xmrstak/misc/configEditor.hpp" +#include 
"xmrstak/misc/console.hpp" #include "xmrstak/params.hpp" -#include "xmrstak/backend/cryptonight.hpp" -#include "xmrstak/backend/cpu/cpuType.hpp" #include #ifdef _WIN32 @@ -16,7 +16,6 @@ #include #endif // _WIN32 - namespace xmrstak { namespace cpu @@ -24,8 +23,7 @@ namespace cpu class autoAdjust { -public: - + public: bool printConfig() { auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); @@ -42,10 +40,10 @@ class autoAdjust configEditor configTpl{}; // load the template of the backend config into a char variable - const char *tpl = - #include "./config.tpl" - ; - configTpl.set( std::string(tpl) ); + const char* tpl = +#include "./config.tpl" + ; + configTpl.set(std::string(tpl)); std::string conf; @@ -75,14 +73,14 @@ class autoAdjust linux_layout ? "Linux" : "Windows"); uint32_t aff_id = 0; - for(uint32_t i=0; i < corecnt; i++) + for(uint32_t i = 0; i < corecnt; i++) { bool double_mode; if(L3KB_size <= 0) break; - double_mode = L3KB_size / hashMemSizeKB > (int32_t)(corecnt-i); + double_mode = L3KB_size / hashMemSizeKB > (int32_t)(corecnt - i); conf += std::string(" { \"low_power_mode\" : "); conf += std::string(double_mode ? 
"true" : "false"); @@ -110,14 +108,14 @@ class autoAdjust if(useCryptonight_gpu) conf += "*/\n"; - configTpl.replace("CPUCONFIG",conf); + configTpl.replace("CPUCONFIG", conf); configTpl.write(params::inst().configFileCPU); printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str()); return true; } -private: + private: bool detectL3Size() { int32_t cpu_info[4]; @@ -125,8 +123,8 @@ class autoAdjust ::jconf::cpuid(0, 0, cpu_info); memcpy(cpustr, &cpu_info[1], 4); - memcpy(cpustr+4, &cpu_info[3], 4); - memcpy(cpustr+8, &cpu_info[2], 4); + memcpy(cpustr + 4, &cpu_info[3], 4); + memcpy(cpustr + 8, &cpu_info[2], 4); if(strcmp(cpustr, "GenuineIntel") == 0) { @@ -139,7 +137,8 @@ class autoAdjust } L3KB_size = ((get_masked(cpu_info[1], 31, 22) + 1) * (get_masked(cpu_info[1], 21, 12) + 1) * - (get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / 1024; + (get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / + 1024; return true; } diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index f09b1ebc0..f06b0d679 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -1,9 +1,9 @@ #pragma once -#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/misc/console.hpp" #include "xmrstak/params.hpp" -#include "xmrstak/backend/cryptonight.hpp" #ifdef _WIN32 #include @@ -16,17 +16,15 @@ #include #include - namespace xmrstak { namespace cpu { -class autoAdjust +class autoAdjustHwloc { public: - - autoAdjust() + autoAdjustHwloc() { auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); @@ -48,10 +46,10 @@ class autoAdjust configEditor configTpl{}; // load the template of the backend config into a char variable - const char *tpl = - #include "./config.tpl" - ; - configTpl.set( std::string(tpl) ); + const char* tpl = +#include 
"./config.tpl" + ; + configTpl.set(std::string(tpl)); // if cryptonight_gpu is used we will disable cpu mining but provide a inactive config bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu; @@ -62,6 +60,7 @@ class autoAdjust conf += "/*\n//CPU config is disabled by default because cryptonight_gpu is not suitable for CPU mining.\n"; } + bool is_successful = true; try { std::vector tlcs; @@ -69,7 +68,7 @@ class autoAdjust results.reserve(16); findChildrenCaches(hwloc_get_root_obj(topology), - [&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); } ); + [&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); }); if(tlcs.size() == 0) throw(std::runtime_error("The CPU doesn't seem to have a cache.")); @@ -88,34 +87,32 @@ class autoAdjust } catch(const std::runtime_error& err) { - // \todo add fallback to default auto adjust - conf += std::string(" { \"low_power_mode\" : false"); - conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : false },\n"); - printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what()); + is_successful = false; + printer::inst()->print_msg(L0, "Autoconf with hwloc FAILED: %s. Trying basic autoconf.", err.what()); } if(useCryptonight_gpu) conf += "*/\n"; - configTpl.replace("CPUCONFIG",conf); + configTpl.replace("CPUCONFIG", conf); configTpl.write(params::inst().configFileCPU); printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str()); /* Destroy topology object. 
*/ hwloc_topology_destroy(topology); - return true; + return is_successful; } -private: + private: size_t hashMemSize = 0; size_t halfHashMemSize = 0; std::vector results; - template + template inline void findChildrenByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda) { - for(size_t i=0; i < obj->arity; i++) + for(size_t i = 0; i < obj->arity; i++) { if(obj->children[i]->type == type) lambda(obj->children[i]); @@ -133,10 +130,10 @@ class autoAdjust #endif // HWLOC_API_VERSION } - template + template inline void findChildrenCaches(hwloc_obj_t obj, func lambda) { - for(size_t i=0; i < obj->arity; i++) + for(size_t i = 0; i < obj->arity; i++) { if(isCacheObject(obj->children[i])) lambda(obj->children[i]); @@ -159,7 +156,7 @@ class autoAdjust throw(std::runtime_error("Cache object hasn't got attributes.")); size_t PUs = 0; - findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; } ); + findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; }); //Strange case, but we will handle it silently, surely there must be one PU somewhere? 
if(PUs == 0) @@ -172,7 +169,7 @@ class autoAdjust throw(std::runtime_error("The CPU doesn't seem to have a cache.")); //Try our luck with lower level caches - for(size_t i=0; i < obj->arity; i++) + for(size_t i = 0; i < obj->arity; i++) processTopLevelCache(obj->children[i]); return; } @@ -180,7 +177,7 @@ class autoAdjust size_t cacheSize = obj->attr->cache.size; if(isCacheExclusive(obj)) { - for(size_t i=0; i < obj->arity; i++) + for(size_t i = 0; i < obj->arity; i++) { hwloc_obj_t l2obj = obj->children[i]; //If L2 is exclusive and greater or equal to 2MB add room for one more hash @@ -191,7 +188,7 @@ class autoAdjust std::vector cores; cores.reserve(16); - findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); } ); + findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); }); size_t cacheHashes = (cacheSize + halfHashMemSize) / hashMemSize; diff --git a/xmrstak/backend/cpu/cpuType.cpp b/xmrstak/backend/cpu/cpuType.cpp index c85682d4f..5e2519c3b 100644 --- a/xmrstak/backend/cpu/cpuType.cpp +++ b/xmrstak/backend/cpu/cpuType.cpp @@ -1,9 +1,9 @@ #include "xmrstak/backend/cpu/cpuType.hpp" +#include #include #include -#include #ifdef _WIN32 #define strcasecmp _stricmp @@ -16,64 +16,63 @@ namespace xmrstak { namespace cpu { - void cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) - { - std::memset(val, 0, sizeof(int32_t)*4); - - #ifdef _WIN32 - __cpuidex(val, eax, ecx); - #else - __cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]); - #endif - } - - int32_t get_masked(int32_t val, int32_t h, int32_t l) - { - val &= (0x7FFFFFFF >> (31-(h-l))) << l; - return val >> l; - } +void cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) +{ + std::memset(val, 0, sizeof(int32_t) * 4); - bool has_feature(int32_t val, int32_t bit) - { - int32_t mask = 1 << bit; - return (val & mask) != 0u; +#ifdef _WIN32 + __cpuidex(val, eax, ecx); +#else + __cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]); 
+#endif +} - } +int32_t get_masked(int32_t val, int32_t h, int32_t l) +{ + val &= (0x7FFFFFFF >> (31 - (h - l))) << l; + return val >> l; +} - Model getModel() - { - int32_t cpu_info[4]; - char cpustr[13] = {0}; +bool has_feature(int32_t val, int32_t bit) +{ + int32_t mask = 1 << bit; + return (val & mask) != 0u; +} - cpuid(0, 0, cpu_info); - std::memcpy(cpustr, &cpu_info[1], 4); - std::memcpy(cpustr+4, &cpu_info[3], 4); - std::memcpy(cpustr+8, &cpu_info[2], 4); +Model getModel() +{ + int32_t cpu_info[4]; + char cpustr[13] = {0}; - Model result; + cpuid(0, 0, cpu_info); + std::memcpy(cpustr, &cpu_info[1], 4); + std::memcpy(cpustr + 4, &cpu_info[3], 4); + std::memcpy(cpustr + 8, &cpu_info[2], 4); - cpuid(1, 0, cpu_info); + Model result; - result.family = get_masked(cpu_info[0], 12, 8); - result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4; - result.type_name = cpustr; + cpuid(1, 0, cpu_info); - // feature bits https://en.wikipedia.org/wiki/CPUID - // sse2 - result.sse2 = has_feature(cpu_info[3], 26); - // aes-ni - result.aes = has_feature(cpu_info[2], 25); - // avx - 27 is the check if the OS overwrote cpu features - result.avx = has_feature(cpu_info[2], 28) && has_feature(cpu_info[2], 27) ; + result.family = get_masked(cpu_info[0], 12, 8); + result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4; + result.type_name = cpustr; - if(strcmp(cpustr, "AuthenticAMD") == 0) - { - if(result.family == 0xF) - result.family += get_masked(cpu_info[0], 28, 20); - } + // feature bits https://en.wikipedia.org/wiki/CPUID + // sse2 + result.sse2 = has_feature(cpu_info[3], 26); + // aes-ni + result.aes = has_feature(cpu_info[2], 25); + // avx - 27 is the check if the OS overwrote cpu features + result.avx = has_feature(cpu_info[2], 28) && has_feature(cpu_info[2], 27); - return result; + if(strcmp(cpustr, "AuthenticAMD") == 0) + { + if(result.family == 0xF) + result.family += get_masked(cpu_info[0], 28, 20); } + return 
result; +} + } // namespace cpu } // namespace xmrstak diff --git a/xmrstak/backend/cpu/cpuType.hpp b/xmrstak/backend/cpu/cpuType.hpp index 7f6bfaf51..2bafa4105 100644 --- a/xmrstak/backend/cpu/cpuType.hpp +++ b/xmrstak/backend/cpu/cpuType.hpp @@ -1,32 +1,30 @@ #pragma once -#include #include - +#include namespace xmrstak { namespace cpu { - struct Model - { - uint32_t family = 0u; - uint32_t model = 0u; - bool aes = false; - bool sse2 = false; - bool avx = false; - std::string type_name = "unknown"; - }; +struct Model +{ + uint32_t family = 0u; + uint32_t model = 0u; + bool aes = false; + bool sse2 = false; + bool avx = false; + std::string type_name = "unknown"; +}; - Model getModel(); +Model getModel(); - /** Mask bits between h and l and return the value +/** Mask bits between h and l and return the value * * This enables us to put in values exactly like in the manual * For example EBX[30:22] is get_masked(cpu_info[1], 31, 22) */ - int32_t get_masked(int32_t val, int32_t h, int32_t l); +int32_t get_masked(int32_t val, int32_t h, int32_t l); - } // namespace cpu } // namespace xmrstak diff --git a/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp b/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp index 2fc1a8baa..5d55987ac 100644 --- a/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp +++ b/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp @@ -1,77 +1,87 @@ #include -typedef void(*void_func)(); +typedef void (*void_func)(); -#include "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.h" -#include "cryptonight_aesni.h" #include "cryptonight.h" +#include "cryptonight_aesni.h" +#include "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.h" #include "xmrstak/misc/console.hpp" -static inline void add_code(uint8_t* &p, void (*p1)(), void (*p2)()) +static inline void add_code(uint8_t*& p, void (*p1)(), void (*p2)()) { - const ptrdiff_t size = reinterpret_cast(p2) - reinterpret_cast(p1); - if (size > 0) { - memcpy(p, reinterpret_cast(p1), size); - p += size; - } 
+ const ptrdiff_t size = reinterpret_cast(p2) - reinterpret_cast(p1); + if(size > 0) + { + memcpy(p, reinterpret_cast(p1), size); + p += size; + } } -static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, int selected_asm) +static inline void add_random_math(uint8_t*& p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, int selected_asm) { - uint32_t prev_rot_src = (uint32_t)(-1); - - for (int i = 0;; ++i) { - const V4_Instruction inst = code[i]; - if (inst.opcode == RET) { - break; - } - - uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2); - uint8_t dst_index = inst.dst_index; - uint8_t src_index = inst.src_index; - - const uint32_t a = inst.dst_index; - const uint32_t b = inst.src_index; - const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS)); - - switch (inst.opcode) { - case ROR: - case ROL: - if (b != prev_rot_src) { - prev_rot_src = b; - add_code(p, instructions_mov[c], instructions_mov[c + 1]); - } - break; - } - - if (a == prev_rot_src) { - prev_rot_src = (uint32_t)(-1); - } - - void_func begin = instructions[c]; + uint32_t prev_rot_src = (uint32_t)(-1); + + for(int i = 0;; ++i) + { + const V4_Instruction inst = code[i]; + if(inst.opcode == RET) + { + break; + } + + uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2); + uint8_t dst_index = inst.dst_index; + uint8_t src_index = inst.src_index; + + const uint32_t a = inst.dst_index; + const uint32_t b = inst.src_index; + const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? 
dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS)); + + switch(inst.opcode) + { + case ROR: + case ROL: + if(b != prev_rot_src) + { + prev_rot_src = b; + add_code(p, instructions_mov[c], instructions_mov[c + 1]); + } + break; + } + + if(a == prev_rot_src) + { + prev_rot_src = (uint32_t)(-1); + } + + void_func begin = instructions[c]; // AMD == 2 - if ((selected_asm == 2) && (inst.opcode == MUL && !is_64_bit)) { - // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL - // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41 - uint8_t* prefix = reinterpret_cast(begin); - - if (*prefix == 0x49) { - *(p++) = 0x41; - } - - begin = reinterpret_cast(prefix + 1); - } - - add_code(p, begin, instructions[c + 1]); - - if (inst.opcode == ADD) { - *(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C; - if (is_64_bit) { - prev_rot_src = (uint32_t)(-1); - } - } - } + if((selected_asm == 2) && (inst.opcode == MUL && !is_64_bit)) + { + // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL + // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41 + uint8_t* prefix = reinterpret_cast(begin); + + if(*prefix == 0x49) + { + *(p++) = 0x41; + } + + begin = reinterpret_cast(prefix + 1); + } + + add_code(p, begin, instructions[c + 1]); + + if(inst.opcode == ADD) + { + *(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 
3 : 0)) = inst.C; + if(is_64_bit) + { + prev_rot_src = (uint32_t)(-1); + } + } + } } void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size) @@ -84,14 +94,14 @@ void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size) else unprotectExecutableMemory(ctx->fun_data, allocation_size); - uint8_t* p0 = ctx->fun_data; - uint8_t* p = p0; + uint8_t* p0 = ctx->fun_data; + uint8_t* p = p0; if(ctx->fun_data != nullptr) { if(N == 2) { - add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2); + add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2); add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version); add_code(p, CryptonightR_template_double_part2, CryptonightR_template_double_part3); add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version); diff --git a/xmrstak/backend/cpu/crypto/c_blake256.c b/xmrstak/backend/cpu/crypto/c_blake256.c index e5fadfe74..93d9cadbb 100644 --- a/xmrstak/backend/cpu/crypto/c_blake256.c +++ b/xmrstak/backend/cpu/crypto/c_blake256.c @@ -8,66 +8,67 @@ * HMAC is specified by RFC 2104. 
*/ -#include -#include -#include #include "c_blake256.h" +#include +#include +#include -#define U8TO32(p) \ - (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ - ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) -#define U32TO8(p, v) \ - (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ - (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); +#define U8TO32(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]))) +#define U32TO8(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); \ + (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); \ + (p)[3] = (uint8_t)((v)); const uint8_t sigma[][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}, - {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3}, - {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4}, - { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8}, - { 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13}, - { 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9}, - {12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11}, - {13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10}, - { 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5}, - {10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0}, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}, - {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3}, - {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4}, - { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8} -}; + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 
14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}}; const uint32_t cst[16] = { 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, - 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 -}; + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917}; static const uint8_t padding[] = { - 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -}; - + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -void blake256_compress(state *S, const uint8_t *block) { +void blake256_compress(state* S, const uint8_t* block) +{ uint32_t v[16], m[16], i; -#define ROT(x,n) (((x)<<(32-n))|((x)>>(n))) -#define G(a,b,c,d,e) \ - v[a] += (m[sigma[i][e]] ^ cst[sigma[i][e+1]]) + v[b]; \ - v[d] = ROT(v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROT(v[b] ^ v[c],12); \ - v[a] += (m[sigma[i][e+1]] ^ cst[sigma[i][e]])+v[b]; \ - v[d] = ROT(v[d] ^ v[a], 8); \ - v[c] += v[d]; \ +#define ROT(x, n) (((x) << (32 - n)) | ((x) >> (n))) +#define G(a, b, c, d, e) \ + v[a] += (m[sigma[i][e]] ^ cst[sigma[i][e + 1]]) + v[b]; \ + v[d] = ROT(v[d] ^ v[a], 16); \ + v[c] += v[d]; \ + v[b] = ROT(v[b] ^ v[c], 12); \ + v[a] += (m[sigma[i][e + 1]] ^ cst[sigma[i][e]]) + v[b]; \ + v[d] = ROT(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ v[b] = ROT(v[b] ^ v[c], 7); - for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); - for (i = 0; i < 8; ++i) v[i] = S->h[i]; - v[ 8] = S->s[0] ^ 0x243F6A88; - v[ 9] = S->s[1] ^ 0x85A308D3; + for(i = 0; i < 16; ++i) + m[i] = U8TO32(block + i * 4); + for(i = 0; i < 8; ++i) + v[i] = S->h[i]; + v[8] = 
S->s[0] ^ 0x243F6A88; + v[9] = S->s[1] ^ 0x85A308D3; v[10] = S->s[2] ^ 0x13198A2E; v[11] = S->s[3] ^ 0x03707344; v[12] = 0xA4093822; @@ -75,29 +76,34 @@ void blake256_compress(state *S, const uint8_t *block) { v[14] = 0x082EFA98; v[15] = 0xEC4E6C89; - if (S->nullt == 0) { + if(S->nullt == 0) + { v[12] ^= S->t[0]; v[13] ^= S->t[0]; v[14] ^= S->t[1]; v[15] ^= S->t[1]; } - for (i = 0; i < 14; ++i) { - G(0, 4, 8, 12, 0); - G(1, 5, 9, 13, 2); - G(2, 6, 10, 14, 4); - G(3, 7, 11, 15, 6); - G(3, 4, 9, 14, 14); - G(2, 7, 8, 13, 12); - G(0, 5, 10, 15, 8); + for(i = 0; i < 14; ++i) + { + G(0, 4, 8, 12, 0); + G(1, 5, 9, 13, 2); + G(2, 6, 10, 14, 4); + G(3, 7, 11, 15, 6); + G(3, 4, 9, 14, 14); + G(2, 7, 8, 13, 12); + G(0, 5, 10, 15, 8); G(1, 6, 11, 12, 10); } - for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; - for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; + for(i = 0; i < 16; ++i) + S->h[i % 8] ^= v[i]; + for(i = 0; i < 8; ++i) + S->h[i] ^= S->s[i % 4]; } -void blake256_init(state *S) { +void blake256_init(state* S) +{ S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372; @@ -110,7 +116,8 @@ void blake256_init(state *S) { S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; } -void blake224_init(state *S) { +void blake224_init(state* S) +{ S->h[0] = 0xC1059ED8; S->h[1] = 0x367CD507; S->h[2] = 0x3070DD17; @@ -124,57 +131,75 @@ void blake224_init(state *S) { } // datalen = number of bits -void blake256_update(state *S, const uint8_t *data, uint32_t datalen) { +void blake256_update(state* S, const uint8_t* data, uint32_t datalen) +{ int left = S->buflen >> 3; int fill = 64 - left; - if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) { - memcpy((void *) (S->buf + left), (void *) data, fill); + if(left && (((datalen >> 3) & 0x3F) >= (unsigned)fill)) + { + memcpy((void*)(S->buf + left), (void*)data, fill); S->t[0] += 512; - if (S->t[0] == 0) S->t[1]++; + if(S->t[0] == 0) + S->t[1]++; blake256_compress(S, S->buf); data += fill; datalen -= (fill << 3); left = 0; } - while 
(datalen >= 512) { + while(datalen >= 512) + { S->t[0] += 512; - if (S->t[0] == 0) S->t[1]++; + if(S->t[0] == 0) + S->t[1]++; blake256_compress(S, data); data += 64; datalen -= 512; } - if (datalen > 0) { - memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); + if(datalen > 0) + { + memcpy((void*)(S->buf + left), (void*)data, datalen >> 3); S->buflen = (left << 3) + datalen; - } else { + } + else + { S->buflen = 0; } } // datalen = number of bits -void blake224_update(state *S, const uint8_t *data, uint32_t datalen) { +void blake224_update(state* S, const uint8_t* data, uint32_t datalen) +{ blake256_update(S, data, datalen); } -void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) { +void blake256_final_h(state* S, uint8_t* digest, uint8_t pa, uint8_t pb) +{ uint8_t msglen[8]; uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; - if (lo < (unsigned) S->buflen) hi++; + if(lo < (unsigned)S->buflen) + hi++; U32TO8(msglen + 0, hi); U32TO8(msglen + 4, lo); - if (S->buflen == 440) { /* one padding byte */ + if(S->buflen == 440) + { /* one padding byte */ S->t[0] -= 8; blake256_update(S, &pa, 8); - } else { - if (S->buflen < 440) { /* enough space to fill the block */ - if (S->buflen == 0) S->nullt = 1; + } + else + { + if(S->buflen < 440) + { /* enough space to fill the block */ + if(S->buflen == 0) + S->nullt = 1; S->t[0] -= 440 - S->buflen; blake256_update(S, padding, 440 - S->buflen); - } else { /* need 2 compressions */ + } + else + { /* need 2 compressions */ S->t[0] -= 512 - S->buflen; blake256_update(S, padding, 512 - S->buflen); S->t[0] -= 440; @@ -187,9 +212,9 @@ void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) { S->t[0] -= 64; blake256_update(S, msglen, 64); - U32TO8(digest + 0, S->h[0]); - U32TO8(digest + 4, S->h[1]); - U32TO8(digest + 8, S->h[2]); + U32TO8(digest + 0, S->h[0]); + U32TO8(digest + 4, S->h[1]); + U32TO8(digest + 8, S->h[2]); U32TO8(digest + 12, S->h[3]); U32TO8(digest + 16, S->h[4]); 
U32TO8(digest + 20, S->h[5]); @@ -197,16 +222,19 @@ void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) { U32TO8(digest + 28, S->h[7]); } -void blake256_final(state *S, uint8_t *digest) { +void blake256_final(state* S, uint8_t* digest) +{ blake256_final_h(S, digest, 0x81, 0x01); } -void blake224_final(state *S, uint8_t *digest) { +void blake224_final(state* S, uint8_t* digest) +{ blake256_final_h(S, digest, 0x80, 0x00); } // inlen = number of bytes -void blake256_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) { +void blake256_hash(uint8_t* out, const uint8_t* in, uint32_t inlen) +{ state S; blake256_init(&S); blake256_update(&S, in, inlen * 8); @@ -214,7 +242,8 @@ void blake256_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) { } // inlen = number of bytes -void blake224_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) { +void blake224_hash(uint8_t* out, const uint8_t* in, uint32_t inlen) +{ state S; blake224_init(&S); blake224_update(&S, in, inlen * 8); @@ -222,13 +251,15 @@ void blake224_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) { } // keylen = number of bytes -void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { - const uint8_t *key = _key; +void hmac_blake256_init(hmac_state* S, const uint8_t* _key, uint64_t keylen) +{ + const uint8_t* key = _key; uint8_t keyhash[32]; uint8_t pad[64]; uint64_t i; - if (keylen > 64) { + if(keylen > 64) + { blake256_hash(keyhash, key, keylen); key = keyhash; keylen = 32; @@ -236,14 +267,16 @@ void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { blake256_init(&S->inner); memset(pad, 0x36, 64); - for (i = 0; i < keylen; ++i) { + for(i = 0; i < keylen; ++i) + { pad[i] ^= key[i]; } blake256_update(&S->inner, pad, 512); blake256_init(&S->outer); memset(pad, 0x5c, 64); - for (i = 0; i < keylen; ++i) { + for(i = 0; i < keylen; ++i) + { pad[i] ^= key[i]; } blake256_update(&S->outer, pad, 512); @@ -252,13 +285,15 @@ void 
hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { } // keylen = number of bytes -void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { - const uint8_t *key = _key; +void hmac_blake224_init(hmac_state* S, const uint8_t* _key, uint64_t keylen) +{ + const uint8_t* key = _key; uint8_t keyhash[32]; uint8_t pad[64]; uint64_t i; - if (keylen > 64) { + if(keylen > 64) + { blake256_hash(keyhash, key, keylen); key = keyhash; keylen = 28; @@ -266,14 +301,16 @@ void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { blake224_init(&S->inner); memset(pad, 0x36, 64); - for (i = 0; i < keylen; ++i) { + for(i = 0; i < keylen; ++i) + { pad[i] ^= key[i]; } blake224_update(&S->inner, pad, 512); blake224_init(&S->outer); memset(pad, 0x5c, 64); - for (i = 0; i < keylen; ++i) { + for(i = 0; i < keylen; ++i) + { pad[i] ^= key[i]; } blake224_update(&S->outer, pad, 512); @@ -282,18 +319,21 @@ void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { } // datalen = number of bits -void hmac_blake256_update(hmac_state *S, const uint8_t *data, uint32_t datalen) { - // update the inner state - blake256_update(&S->inner, data, datalen); +void hmac_blake256_update(hmac_state* S, const uint8_t* data, uint32_t datalen) +{ + // update the inner state + blake256_update(&S->inner, data, datalen); } // datalen = number of bits -void hmac_blake224_update(hmac_state *S, const uint8_t *data, uint32_t datalen) { - // update the inner state - blake224_update(&S->inner, data, datalen); +void hmac_blake224_update(hmac_state* S, const uint8_t* data, uint32_t datalen) +{ + // update the inner state + blake224_update(&S->inner, data, datalen); } -void hmac_blake256_final(hmac_state *S, uint8_t *digest) { +void hmac_blake256_final(hmac_state* S, uint8_t* digest) +{ uint8_t ihash[32]; blake256_final(&S->inner, ihash); blake256_update(&S->outer, ihash, 256); @@ -301,7 +341,8 @@ void hmac_blake256_final(hmac_state *S, 
uint8_t *digest) { memset(ihash, 0, 32); } -void hmac_blake224_final(hmac_state *S, uint8_t *digest) { +void hmac_blake224_final(hmac_state* S, uint8_t* digest) +{ uint8_t ihash[32]; blake224_final(&S->inner, ihash); blake224_update(&S->outer, ihash, 224); @@ -310,7 +351,8 @@ void hmac_blake224_final(hmac_state *S, uint8_t *digest) { } // keylen = number of bytes; inlen = number of bytes -void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint32_t inlen) { +void hmac_blake256_hash(uint8_t* out, const uint8_t* key, uint64_t keylen, const uint8_t* in, uint32_t inlen) +{ hmac_state S; hmac_blake256_init(&S, key, keylen); hmac_blake256_update(&S, in, inlen * 8); @@ -318,7 +360,8 @@ void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const } // keylen = number of bytes; inlen = number of bytes -void hmac_blake224_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint32_t inlen) { +void hmac_blake224_hash(uint8_t* out, const uint8_t* key, uint64_t keylen, const uint8_t* in, uint32_t inlen) +{ hmac_state S; hmac_blake224_init(&S, key, keylen); hmac_blake224_update(&S, in, inlen * 8); diff --git a/xmrstak/backend/cpu/crypto/c_blake256.h b/xmrstak/backend/cpu/crypto/c_blake256.h index 06c7917af..9f63f88f4 100644 --- a/xmrstak/backend/cpu/crypto/c_blake256.h +++ b/xmrstak/backend/cpu/crypto/c_blake256.h @@ -3,41 +3,43 @@ #include -typedef struct { - uint32_t h[8], s[4], t[2]; - int buflen, nullt; - uint8_t buf[64]; +typedef struct +{ + uint32_t h[8], s[4], t[2]; + int buflen, nullt; + uint8_t buf[64]; } state; -typedef struct { - state inner; - state outer; +typedef struct +{ + state inner; + state outer; } hmac_state; -void blake256_init(state *); -void blake224_init(state *); +void blake256_init(state*); +void blake224_init(state*); -void blake256_update(state *, const uint8_t *, uint32_t); -void blake224_update(state *, const uint8_t *, uint32_t); +void blake256_update(state*, const 
uint8_t*, uint32_t); +void blake224_update(state*, const uint8_t*, uint32_t); -void blake256_final(state *, uint8_t *); -void blake224_final(state *, uint8_t *); +void blake256_final(state*, uint8_t*); +void blake224_final(state*, uint8_t*); -void blake256_hash(uint8_t *, const uint8_t *, uint32_t); -void blake224_hash(uint8_t *, const uint8_t *, uint32_t); +void blake256_hash(uint8_t*, const uint8_t*, uint32_t); +void blake224_hash(uint8_t*, const uint8_t*, uint32_t); /* HMAC functions: */ -void hmac_blake256_init(hmac_state *, const uint8_t *, uint64_t); -void hmac_blake224_init(hmac_state *, const uint8_t *, uint64_t); +void hmac_blake256_init(hmac_state*, const uint8_t*, uint64_t); +void hmac_blake224_init(hmac_state*, const uint8_t*, uint64_t); -void hmac_blake256_update(hmac_state *, const uint8_t *, uint32_t); -void hmac_blake224_update(hmac_state *, const uint8_t *, uint32_t); +void hmac_blake256_update(hmac_state*, const uint8_t*, uint32_t); +void hmac_blake224_update(hmac_state*, const uint8_t*, uint32_t); -void hmac_blake256_final(hmac_state *, uint8_t *); -void hmac_blake224_final(hmac_state *, uint8_t *); +void hmac_blake256_final(hmac_state*, uint8_t*); +void hmac_blake224_final(hmac_state*, uint8_t*); -void hmac_blake256_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint32_t); -void hmac_blake224_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint32_t); +void hmac_blake256_hash(uint8_t*, const uint8_t*, uint64_t, const uint8_t*, uint32_t); +void hmac_blake224_hash(uint8_t*, const uint8_t*, uint64_t, const uint8_t*, uint32_t); #endif /* _BLAKE256_H_ */ diff --git a/xmrstak/backend/cpu/crypto/c_groestl.c b/xmrstak/backend/cpu/crypto/c_groestl.c index 5b3523e79..bae9a9f11 100644 --- a/xmrstak/backend/cpu/crypto/c_groestl.c +++ b/xmrstak/backend/cpu/crypto/c_groestl.c @@ -14,178 +14,185 @@ #define P_TYPE 0 #define Q_TYPE 1 -const uint8_t shift_Values[2][8] = {{0,1,2,3,4,5,6,7},{1,3,5,7,0,2,4,6}}; - -const uint8_t 
indices_cyclic[15] = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6}; - - -#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \ - v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \ - v1 = temp_var;} - - -#define COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \ - tu = T[2*(uint32_t)x[4*c0+0]]; \ - tl = T[2*(uint32_t)x[4*c0+0]+1]; \ - tv1 = T[2*(uint32_t)x[4*c1+1]]; \ - tv2 = T[2*(uint32_t)x[4*c1+1]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tv1 = T[2*(uint32_t)x[4*c2+2]]; \ - tv2 = T[2*(uint32_t)x[4*c2+2]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tv1 = T[2*(uint32_t)x[4*c3+3]]; \ - tv2 = T[2*(uint32_t)x[4*c3+3]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tl ^= T[2*(uint32_t)x[4*c4+0]]; \ - tu ^= T[2*(uint32_t)x[4*c4+0]+1]; \ - tv1 = T[2*(uint32_t)x[4*c5+1]]; \ - tv2 = T[2*(uint32_t)x[4*c5+1]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - tv1 = T[2*(uint32_t)x[4*c6+2]]; \ - tv2 = T[2*(uint32_t)x[4*c6+2]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - tv1 = T[2*(uint32_t)x[4*c7+3]]; \ - tv2 = T[2*(uint32_t)x[4*c7+3]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - y[i] = tu; \ - y[i+1] = tl; +const uint8_t shift_Values[2][8] = {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 3, 5, 7, 0, 2, 4, 6}}; +const uint8_t indices_cyclic[15] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6}; + +#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) \ + { \ + temp_var = (v1 << (8 * amount_bytes)) | (v2 >> (8 * (4 - amount_bytes))); \ + v2 = (v2 << (8 * amount_bytes)) | (v1 >> (8 * (4 - amount_bytes))); \ + v1 = temp_var; \ + } + +#define COLUMN(x, y, i, c0, c1, c2, c3, c4, c5, c6, c7, tv1, tv2, tu, tl, t) \ + tu = T[2 * (uint32_t)x[4 * c0 + 0]]; \ + tl = T[2 * (uint32_t)x[4 * c0 + 0] + 1]; \ + tv1 = T[2 * (uint32_t)x[4 * c1 + 1]]; \ + tv2 = T[2 * (uint32_t)x[4 
* c1 + 1] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 1, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = T[2 * (uint32_t)x[4 * c2 + 2]]; \ + tv2 = T[2 * (uint32_t)x[4 * c2 + 2] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 2, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = T[2 * (uint32_t)x[4 * c3 + 3]]; \ + tv2 = T[2 * (uint32_t)x[4 * c3 + 3] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 3, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tl ^= T[2 * (uint32_t)x[4 * c4 + 0]]; \ + tu ^= T[2 * (uint32_t)x[4 * c4 + 0] + 1]; \ + tv1 = T[2 * (uint32_t)x[4 * c5 + 1]]; \ + tv2 = T[2 * (uint32_t)x[4 * c5 + 1] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 1, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = T[2 * (uint32_t)x[4 * c6 + 2]]; \ + tv2 = T[2 * (uint32_t)x[4 * c6 + 2] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 2, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = T[2 * (uint32_t)x[4 * c7 + 3]]; \ + tv2 = T[2 * (uint32_t)x[4 * c7 + 3] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 3, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + y[i] = tu; \ + y[i + 1] = tl; /* compute one round of P (short variants) */ -static void RND512P(uint8_t *x, uint32_t *y, uint32_t r) { - uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; - uint32_t* x32 = (uint32_t*)x; - x32[ 0] ^= 0x00000000^r; - x32[ 2] ^= 0x00000010^r; - x32[ 4] ^= 0x00000020^r; - x32[ 6] ^= 0x00000030^r; - x32[ 8] ^= 0x00000040^r; - x32[10] ^= 0x00000050^r; - x32[12] ^= 0x00000060^r; - x32[14] ^= 0x00000070^r; - COLUMN(x,y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, 
temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +static void RND512P(uint8_t* x, uint32_t* y, uint32_t r) +{ + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[0] ^= 0x00000000 ^ r; + x32[2] ^= 0x00000010 ^ r; + x32[4] ^= 0x00000020 ^ r; + x32[6] ^= 0x00000030 ^ r; + x32[8] ^= 0x00000040 ^ r; + x32[10] ^= 0x00000050 ^ r; + x32[12] ^= 0x00000060 ^ r; + x32[14] ^= 0x00000070 ^ r; + COLUMN(x, y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); } /* compute one round of Q (short variants) */ -static void RND512Q(uint8_t *x, uint32_t *y, uint32_t r) { - uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; - uint32_t* x32 = (uint32_t*)x; - x32[ 0] = ~x32[ 0]; - x32[ 1] ^= 0xffffffff^r; - x32[ 2] = ~x32[ 2]; - x32[ 3] ^= 0xefffffff^r; - x32[ 4] = ~x32[ 4]; - x32[ 5] ^= 0xdfffffff^r; - x32[ 6] = ~x32[ 6]; - x32[ 7] ^= 0xcfffffff^r; - x32[ 8] = ~x32[ 8]; - x32[ 9] ^= 0xbfffffff^r; - x32[10] = ~x32[10]; - x32[11] ^= 
0xafffffff^r; - x32[12] = ~x32[12]; - x32[13] ^= 0x9fffffff^r; - x32[14] = ~x32[14]; - x32[15] ^= 0x8fffffff^r; - COLUMN(x,y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +static void RND512Q(uint8_t* x, uint32_t* y, uint32_t r) +{ + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[0] = ~x32[0]; + x32[1] ^= 0xffffffff ^ r; + x32[2] = ~x32[2]; + x32[3] ^= 0xefffffff ^ r; + x32[4] = ~x32[4]; + x32[5] ^= 0xdfffffff ^ r; + x32[6] = ~x32[6]; + x32[7] ^= 0xcfffffff ^ r; + x32[8] = ~x32[8]; + x32[9] ^= 0xbfffffff ^ r; + x32[10] = ~x32[10]; + x32[11] ^= 0xafffffff ^ r; + x32[12] = ~x32[12]; + x32[13] ^= 0x9fffffff ^ r; + x32[14] = ~x32[14]; + x32[15] ^= 0x8fffffff ^ r; + COLUMN(x, y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, 
temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); } /* compute compression function (short variants) */ -static void F512(uint32_t *h, const uint32_t *m) { - int i; - uint32_t Ptmp[2*COLS512]; - uint32_t Qtmp[2*COLS512]; - uint32_t y[2*COLS512]; - uint32_t z[2*COLS512]; - - for (i = 0; i < 2*COLS512; i++) { - z[i] = m[i]; - Ptmp[i] = h[i]^m[i]; - } - - /* compute Q(m) */ - RND512Q((uint8_t*)z, y, 0x00000000); - RND512Q((uint8_t*)y, z, 0x01000000); - RND512Q((uint8_t*)z, y, 0x02000000); - RND512Q((uint8_t*)y, z, 0x03000000); - RND512Q((uint8_t*)z, y, 0x04000000); - RND512Q((uint8_t*)y, z, 0x05000000); - RND512Q((uint8_t*)z, y, 0x06000000); - RND512Q((uint8_t*)y, z, 0x07000000); - RND512Q((uint8_t*)z, y, 0x08000000); - RND512Q((uint8_t*)y, Qtmp, 0x09000000); - - /* compute P(h+m) */ - RND512P((uint8_t*)Ptmp, y, 0x00000000); - RND512P((uint8_t*)y, z, 0x00000001); - RND512P((uint8_t*)z, y, 0x00000002); - RND512P((uint8_t*)y, z, 0x00000003); - RND512P((uint8_t*)z, y, 0x00000004); - RND512P((uint8_t*)y, z, 0x00000005); - RND512P((uint8_t*)z, y, 0x00000006); - RND512P((uint8_t*)y, z, 0x00000007); - RND512P((uint8_t*)z, y, 0x00000008); - RND512P((uint8_t*)y, Ptmp, 0x00000009); - - /* compute P(h+m) + Q(m) + h */ - for (i = 0; i < 2*COLS512; i++) { - h[i] ^= Ptmp[i]^Qtmp[i]; - } -} +static void F512(uint32_t* h, const uint32_t* m) +{ + int i; + uint32_t Ptmp[2 * COLS512]; + uint32_t Qtmp[2 * COLS512]; + uint32_t y[2 * COLS512]; + uint32_t z[2 * COLS512]; + + for(i = 0; i < 2 * COLS512; i++) + { + z[i] = m[i]; + Ptmp[i] = h[i] ^ m[i]; + } + /* compute Q(m) */ + RND512Q((uint8_t*)z, y, 0x00000000); + RND512Q((uint8_t*)y, z, 0x01000000); + RND512Q((uint8_t*)z, y, 
0x02000000); + RND512Q((uint8_t*)y, z, 0x03000000); + RND512Q((uint8_t*)z, y, 0x04000000); + RND512Q((uint8_t*)y, z, 0x05000000); + RND512Q((uint8_t*)z, y, 0x06000000); + RND512Q((uint8_t*)y, z, 0x07000000); + RND512Q((uint8_t*)z, y, 0x08000000); + RND512Q((uint8_t*)y, Qtmp, 0x09000000); + + /* compute P(h+m) */ + RND512P((uint8_t*)Ptmp, y, 0x00000000); + RND512P((uint8_t*)y, z, 0x00000001); + RND512P((uint8_t*)z, y, 0x00000002); + RND512P((uint8_t*)y, z, 0x00000003); + RND512P((uint8_t*)z, y, 0x00000004); + RND512P((uint8_t*)y, z, 0x00000005); + RND512P((uint8_t*)z, y, 0x00000006); + RND512P((uint8_t*)y, z, 0x00000007); + RND512P((uint8_t*)z, y, 0x00000008); + RND512P((uint8_t*)y, Ptmp, 0x00000009); + + /* compute P(h+m) + Q(m) + h */ + for(i = 0; i < 2 * COLS512; i++) + { + h[i] ^= Ptmp[i] ^ Qtmp[i]; + } +} /* digest up to msglen bytes of input (full blocks only) */ -static void Transform(groestlHashState *ctx, - const uint8_t *input, - int msglen) { +static void Transform(groestlHashState* ctx, + const uint8_t* input, + int msglen) +{ - /* digest message, one block at a time */ - for (; msglen >= SIZE512; - msglen -= SIZE512, input += SIZE512) { - F512(ctx->chaining,(uint32_t*)input); + /* digest message, one block at a time */ + for(; msglen >= SIZE512; + msglen -= SIZE512, input += SIZE512) + { + F512(ctx->chaining, (uint32_t*)input); - /* increment block counter */ - ctx->block_counter1++; - if (ctx->block_counter1 == 0) ctx->block_counter2++; - } + /* increment block counter */ + ctx->block_counter1++; + if(ctx->block_counter1 == 0) + ctx->block_counter2++; + } } /* given state h, do h <- P(h)+h */ -static void OutputTransformation(groestlHashState *ctx) { - int j; - uint32_t temp[2*COLS512]; - uint32_t y[2*COLS512]; - uint32_t z[2*COLS512]; - - - - for (j = 0; j < 2*COLS512; j++) { - temp[j] = ctx->chaining[j]; +static void OutputTransformation(groestlHashState* ctx) +{ + int j; + uint32_t temp[2 * COLS512]; + uint32_t y[2 * COLS512]; + uint32_t z[2 * 
COLS512]; + + for(j = 0; j < 2 * COLS512; j++) + { + temp[j] = ctx->chaining[j]; } RND512P((uint8_t*)temp, y, 0x00000000); RND512P((uint8_t*)y, z, 0x00000001); @@ -197,75 +204,84 @@ static void OutputTransformation(groestlHashState *ctx) { RND512P((uint8_t*)y, z, 0x00000007); RND512P((uint8_t*)z, y, 0x00000008); RND512P((uint8_t*)y, temp, 0x00000009); - for (j = 0; j < 2*COLS512; j++) { - ctx->chaining[j] ^= temp[j]; + for(j = 0; j < 2 * COLS512; j++) + { + ctx->chaining[j] ^= temp[j]; } } /* initialise context */ -static void Init(groestlHashState* ctx) { - int i = 0; - /* allocate memory for state and data buffer */ - - for(;i<(SIZE512/sizeof(uint32_t));i++) - { - ctx->chaining[i] = 0; - } - - /* set initial value */ - ctx->chaining[2*COLS512-1] = u32BIG((uint32_t)HASH_BIT_LEN); - - /* set other variables */ - ctx->buf_ptr = 0; - ctx->block_counter1 = 0; - ctx->block_counter2 = 0; - ctx->bits_in_last_byte = 0; +static void Init(groestlHashState* ctx) +{ + int i = 0; + /* allocate memory for state and data buffer */ + + for(; i < (SIZE512 / sizeof(uint32_t)); i++) + { + ctx->chaining[i] = 0; + } + + /* set initial value */ + ctx->chaining[2 * COLS512 - 1] = u32BIG((uint32_t)HASH_BIT_LEN); + + /* set other variables */ + ctx->buf_ptr = 0; + ctx->block_counter1 = 0; + ctx->block_counter2 = 0; + ctx->bits_in_last_byte = 0; } /* update state with databitlen bits of input */ static void Update(groestlHashState* ctx, - const BitSequence* input, - DataLength databitlen) { - int index = 0; - int msglen = (int)(databitlen/8); - int rem = (int)(databitlen%8); + const BitSequence* input, + DataLength databitlen) +{ + int index = 0; + int msglen = (int)(databitlen / 8); + int rem = (int)(databitlen % 8); - /* if the buffer contains data that has not yet been digested, first + /* if the buffer contains data that has not yet been digested, first add data to buffer until full */ - if (ctx->buf_ptr) { - while (ctx->buf_ptr < SIZE512 && index < msglen) { - 
ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - } - if (ctx->buf_ptr < SIZE512) { - /* buffer still not full, return */ - if (rem) { - ctx->bits_in_last_byte = rem; - ctx->buffer[(int)ctx->buf_ptr++] = input[index]; - } - return; + if(ctx->buf_ptr) + { + while(ctx->buf_ptr < SIZE512 && index < msglen) + { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } + if(ctx->buf_ptr < SIZE512) + { + /* buffer still not full, return */ + if(rem) + { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } + return; + } + + /* digest buffer */ + ctx->buf_ptr = 0; + Transform(ctx, ctx->buffer, SIZE512); } - /* digest buffer */ - ctx->buf_ptr = 0; - Transform(ctx, ctx->buffer, SIZE512); - } + /* digest bulk of message */ + Transform(ctx, input + index, msglen - index); + index += ((msglen - index) / SIZE512) * SIZE512; - /* digest bulk of message */ - Transform(ctx, input+index, msglen-index); - index += ((msglen-index)/SIZE512)*SIZE512; - - /* store remaining data in buffer */ - while (index < msglen) { - ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - } + /* store remaining data in buffer */ + while(index < msglen) + { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } - /* if non-integral number of bytes have been supplied, store + /* if non-integral number of bytes have been supplied, store remaining bits in last byte, together with information about number of bits */ - if (rem) { - ctx->bits_in_last_byte = rem; - ctx->buffer[(int)ctx->buf_ptr++] = input[index]; - } + if(rem) + { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } } #define BILB ctx->bits_in_last_byte @@ -273,80 +289,92 @@ static void Update(groestlHashState* ctx, /* finalise: process remaining data (including padding), perform output transformation, and write hash result to 'output' */ static void Final(groestlHashState* ctx, - BitSequence* output) { - int i, j = 0, hashbytelen = HASH_BIT_LEN/8; - uint8_t *s = 
(BitSequence*)ctx->chaining; - - /* pad with '1'-bit and first few '0'-bits */ - if (BILB) { - ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB); - BILB = 0; - } - else ctx->buffer[(int)ctx->buf_ptr++] = 0x80; - - /* pad with '0'-bits */ - if (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) { - /* padding requires two blocks */ - while (ctx->buf_ptr < SIZE512) { - ctx->buffer[(int)ctx->buf_ptr++] = 0; + BitSequence* output) +{ + int i, j = 0, hashbytelen = HASH_BIT_LEN / 8; + uint8_t* s = (BitSequence*)ctx->chaining; + + /* pad with '1'-bit and first few '0'-bits */ + if(BILB) + { + ctx->buffer[(int)ctx->buf_ptr - 1] &= ((1 << BILB) - 1) << (8 - BILB); + ctx->buffer[(int)ctx->buf_ptr - 1] ^= 0x1 << (7 - BILB); + BILB = 0; + } + else + ctx->buffer[(int)ctx->buf_ptr++] = 0x80; + + /* pad with '0'-bits */ + if(ctx->buf_ptr > SIZE512 - LENGTHFIELDLEN) + { + /* padding requires two blocks */ + while(ctx->buf_ptr < SIZE512) + { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + /* digest first padding block */ + Transform(ctx, ctx->buffer, SIZE512); + ctx->buf_ptr = 0; } - /* digest first padding block */ + while(ctx->buf_ptr < SIZE512 - LENGTHFIELDLEN) + { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + + /* length padding */ + ctx->block_counter1++; + if(ctx->block_counter1 == 0) + ctx->block_counter2++; + ctx->buf_ptr = SIZE512; + + while(ctx->buf_ptr > SIZE512 - (int)sizeof(uint32_t)) + { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1; + ctx->block_counter1 >>= 8; + } + while(ctx->buf_ptr > SIZE512 - LENGTHFIELDLEN) + { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2; + ctx->block_counter2 >>= 8; + } + /* digest final padding block */ Transform(ctx, ctx->buffer, SIZE512); - ctx->buf_ptr = 0; - } - while (ctx->buf_ptr < SIZE512-LENGTHFIELDLEN) { - ctx->buffer[(int)ctx->buf_ptr++] = 0; - } - - /* length padding */ - ctx->block_counter1++; - if (ctx->block_counter1 == 0) ctx->block_counter2++; - ctx->buf_ptr = 
SIZE512; - - while (ctx->buf_ptr > SIZE512-(int)sizeof(uint32_t)) { - ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1; - ctx->block_counter1 >>= 8; - } - while (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) { - ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2; - ctx->block_counter2 >>= 8; - } - /* digest final padding block */ - Transform(ctx, ctx->buffer, SIZE512); - /* perform output transformation */ - OutputTransformation(ctx); - - /* store hash result in output */ - for (i = SIZE512-hashbytelen; i < SIZE512; i++,j++) { - output[j] = s[i]; - } - - /* zeroise relevant variables and deallocate memory */ - for (i = 0; i < COLS512; i++) { - ctx->chaining[i] = 0; - } - for (i = 0; i < SIZE512; i++) { - ctx->buffer[i] = 0; - } + /* perform output transformation */ + OutputTransformation(ctx); + + /* store hash result in output */ + for(i = SIZE512 - hashbytelen; i < SIZE512; i++, j++) + { + output[j] = s[i]; + } + + /* zeroise relevant variables and deallocate memory */ + for(i = 0; i < COLS512; i++) + { + ctx->chaining[i] = 0; + } + for(i = 0; i < SIZE512; i++) + { + ctx->buffer[i] = 0; + } } /* hash bit sequence */ void groestl(const BitSequence* data, - DataLength databitlen, - BitSequence* hashval) { + DataLength databitlen, + BitSequence* hashval) +{ - groestlHashState context; + groestlHashState context; - /* initialise */ + /* initialise */ Init(&context); + /* process message */ + Update(&context, data, databitlen); - /* process message */ - Update(&context, data, databitlen); - - /* finalise */ - Final(&context, hashval); + /* finalise */ + Final(&context, hashval); } /* static int crypto_hash(unsigned char *out, diff --git a/xmrstak/backend/cpu/crypto/c_groestl.h b/xmrstak/backend/cpu/crypto/c_groestl.h index 47044b462..5322a2e2e 100644 --- a/xmrstak/backend/cpu/crypto/c_groestl.h +++ b/xmrstak/backend/cpu/crypto/c_groestl.h @@ -1,10 +1,10 @@ #ifndef __hash_h #define __hash_h /* -#include "crypto_uint8.h" +#include "crypto_hash.h" 
#include "crypto_uint32.h" #include "crypto_uint64.h" -#include "crypto_hash.h" +#include "crypto_uint8.h" typedef crypto_uint8 uint8_t; typedef crypto_uint32 uint32_t; @@ -19,29 +19,28 @@ typedef crypto_uint64 uint64_t; #define LENGTHFIELDLEN ROWS #define COLS512 8 -#define SIZE512 (ROWS*COLS512) +#define SIZE512 (ROWS * COLS512) #define ROUNDS512 10 #define HASH_BIT_LEN 256 -#define ROTL32(v, n) ((((v)<<(n))|((v)>>(32-(n))))&li_32(ffffffff)) - +#define ROTL32(v, n) ((((v) << (n)) | ((v) >> (32 - (n)))) & li_32(ffffffff)) #define li_32(h) 0x##h##u -#define EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n))) -#define u32BIG(a) \ - ((ROTL32(a,8) & li_32(00FF00FF)) | \ - (ROTL32(a,24) & li_32(FF00FF00))) - +#define EXT_BYTE(var, n) ((uint8_t)((uint32_t)(var) >> (8 * n))) +#define u32BIG(a) \ + ((ROTL32(a, 8) & li_32(00FF00FF)) | \ + (ROTL32(a, 24) & li_32(FF00FF00))) /* NIST API begin */ -typedef struct { - uint32_t chaining[SIZE512/sizeof(uint32_t)]; /* actual state */ - uint32_t block_counter1, - block_counter2; /* message block counter(s) */ - BitSequence buffer[SIZE512]; /* data buffer */ - int buf_ptr; /* data buffer pointer */ - int bits_in_last_byte; /* no. of message bits in last byte of +typedef struct +{ + uint32_t chaining[SIZE512 / sizeof(uint32_t)]; /* actual state */ + uint32_t block_counter1, + block_counter2; /* message block counter(s) */ + BitSequence buffer[SIZE512]; /* data buffer */ + int buf_ptr; /* data buffer pointer */ + int bits_in_last_byte; /* no. 
of message bits in last byte of data buffer */ } groestlHashState; diff --git a/xmrstak/backend/cpu/crypto/c_jh.c b/xmrstak/backend/cpu/crypto/c_jh.c index 0256a0fa2..e50886dee 100644 --- a/xmrstak/backend/cpu/crypto/c_jh.c +++ b/xmrstak/backend/cpu/crypto/c_jh.c @@ -23,345 +23,400 @@ typedef uint64_t uint64; /*define data alignment for different C compilers*/ #if defined(__GNUC__) - #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) +#define DATA_ALIGN16(x) x __attribute__((aligned(16))) #else - #define DATA_ALIGN16(x) __declspec(align(16)) x +#define DATA_ALIGN16(x) __declspec(align(16)) x #endif - -typedef struct { - int hashbitlen; /*the message digest size*/ - unsigned long long databitlen; /*the message size in bits*/ - unsigned long long datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/ - DATA_ALIGN16(uint64 x[8][2]); /*the 1024-bit state, ( x[i][0] || x[i][1] ) is the ith row of the state in the pseudocode*/ - unsigned char buffer[64]; /*the 512-bit message block to be hashed;*/ +typedef struct +{ + int hashbitlen; /*the message digest size*/ + unsigned long long databitlen; /*the message size in bits*/ + unsigned long long datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/ + DATA_ALIGN16(uint64 x[8][2]); /*the 1024-bit state, ( x[i][0] || x[i][1] ) is the ith row of the state in the pseudocode*/ + unsigned char buffer[64]; /*the 512-bit message block to be hashed;*/ } hashState; - /*The initial hash value H(0)*/ -const unsigned char 
JH224_H0[128]={0x2d,0xfe,0xdd,0x62,0xf9,0x9a,0x98,0xac,0xae,0x7c,0xac,0xd6,0x19,0xd6,0x34,0xe7,0xa4,0x83,0x10,0x5,0xbc,0x30,0x12,0x16,0xb8,0x60,0x38,0xc6,0xc9,0x66,0x14,0x94,0x66,0xd9,0x89,0x9f,0x25,0x80,0x70,0x6f,0xce,0x9e,0xa3,0x1b,0x1d,0x9b,0x1a,0xdc,0x11,0xe8,0x32,0x5f,0x7b,0x36,0x6e,0x10,0xf9,0x94,0x85,0x7f,0x2,0xfa,0x6,0xc1,0x1b,0x4f,0x1b,0x5c,0xd8,0xc8,0x40,0xb3,0x97,0xf6,0xa1,0x7f,0x6e,0x73,0x80,0x99,0xdc,0xdf,0x93,0xa5,0xad,0xea,0xa3,0xd3,0xa4,0x31,0xe8,0xde,0xc9,0x53,0x9a,0x68,0x22,0xb4,0xa9,0x8a,0xec,0x86,0xa1,0xe4,0xd5,0x74,0xac,0x95,0x9c,0xe5,0x6c,0xf0,0x15,0x96,0xd,0xea,0xb5,0xab,0x2b,0xbf,0x96,0x11,0xdc,0xf0,0xdd,0x64,0xea,0x6e}; -const unsigned char JH256_H0[128]={0xeb,0x98,0xa3,0x41,0x2c,0x20,0xd3,0xeb,0x92,0xcd,0xbe,0x7b,0x9c,0xb2,0x45,0xc1,0x1c,0x93,0x51,0x91,0x60,0xd4,0xc7,0xfa,0x26,0x0,0x82,0xd6,0x7e,0x50,0x8a,0x3,0xa4,0x23,0x9e,0x26,0x77,0x26,0xb9,0x45,0xe0,0xfb,0x1a,0x48,0xd4,0x1a,0x94,0x77,0xcd,0xb5,0xab,0x26,0x2,0x6b,0x17,0x7a,0x56,0xf0,0x24,0x42,0xf,0xff,0x2f,0xa8,0x71,0xa3,0x96,0x89,0x7f,0x2e,0x4d,0x75,0x1d,0x14,0x49,0x8,0xf7,0x7d,0xe2,0x62,0x27,0x76,0x95,0xf7,0x76,0x24,0x8f,0x94,0x87,0xd5,0xb6,0x57,0x47,0x80,0x29,0x6c,0x5c,0x5e,0x27,0x2d,0xac,0x8e,0xd,0x6c,0x51,0x84,0x50,0xc6,0x57,0x5,0x7a,0xf,0x7b,0xe4,0xd3,0x67,0x70,0x24,0x12,0xea,0x89,0xe3,0xab,0x13,0xd3,0x1c,0xd7,0x69}; -const unsigned char JH384_H0[128]={0x48,0x1e,0x3b,0xc6,0xd8,0x13,0x39,0x8a,0x6d,0x3b,0x5e,0x89,0x4a,0xde,0x87,0x9b,0x63,0xfa,0xea,0x68,0xd4,0x80,0xad,0x2e,0x33,0x2c,0xcb,0x21,0x48,0xf,0x82,0x67,0x98,0xae,0xc8,0x4d,0x90,0x82,0xb9,0x28,0xd4,0x55,0xea,0x30,0x41,0x11,0x42,0x49,0x36,0xf5,0x55,0xb2,0x92,0x48,0x47,0xec,0xc7,0x25,0xa,0x93,0xba,0xf4,0x3c,0xe1,0x56,0x9b,0x7f,0x8a,0x27,0xdb,0x45,0x4c,0x9e,0xfc,0xbd,0x49,0x63,0x97,0xaf,0xe,0x58,0x9f,0xc2,0x7d,0x26,0xaa,0x80,0xcd,0x80,0xc0,0x8b,0x8c,0x9d,0xeb,0x2e,0xda,0x8a,0x79,0x81,0xe8,0xf8,0xd5,0x37,0x3a,0xf4,0x39,0x67,0xad,0xdd,0xd1,0x7a,0x71,0xa9,0xb4,0xd3,0xbd,0xa4,0x75,0xd3,0x94,0x97,0x6c,0x3f,0xba,0x98,0x42,0x73,0x7f}; 
-const unsigned char JH512_H0[128]={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b}; +const unsigned char JH224_H0[128] = {0x2d, 0xfe, 0xdd, 0x62, 0xf9, 0x9a, 0x98, 0xac, 0xae, 0x7c, 0xac, 0xd6, 0x19, 0xd6, 0x34, 0xe7, 0xa4, 0x83, 0x10, 0x5, 0xbc, 0x30, 0x12, 0x16, 0xb8, 0x60, 0x38, 0xc6, 0xc9, 0x66, 0x14, 0x94, 0x66, 0xd9, 0x89, 0x9f, 0x25, 0x80, 0x70, 0x6f, 0xce, 0x9e, 0xa3, 0x1b, 0x1d, 0x9b, 0x1a, 0xdc, 0x11, 0xe8, 0x32, 0x5f, 0x7b, 0x36, 0x6e, 0x10, 0xf9, 0x94, 0x85, 0x7f, 0x2, 0xfa, 0x6, 0xc1, 0x1b, 0x4f, 0x1b, 0x5c, 0xd8, 0xc8, 0x40, 0xb3, 0x97, 0xf6, 0xa1, 0x7f, 0x6e, 0x73, 0x80, 0x99, 0xdc, 0xdf, 0x93, 0xa5, 0xad, 0xea, 0xa3, 0xd3, 0xa4, 0x31, 0xe8, 0xde, 0xc9, 0x53, 0x9a, 0x68, 0x22, 0xb4, 0xa9, 0x8a, 0xec, 0x86, 0xa1, 0xe4, 0xd5, 0x74, 0xac, 0x95, 0x9c, 0xe5, 0x6c, 0xf0, 0x15, 0x96, 0xd, 0xea, 0xb5, 0xab, 0x2b, 0xbf, 0x96, 0x11, 0xdc, 0xf0, 0xdd, 0x64, 0xea, 0x6e}; +const unsigned char JH256_H0[128] = {0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1, 0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3, 0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77, 0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8, 0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62, 0x27, 0x76, 
0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c, 0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf, 0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69}; +const unsigned char JH384_H0[128] = {0x48, 0x1e, 0x3b, 0xc6, 0xd8, 0x13, 0x39, 0x8a, 0x6d, 0x3b, 0x5e, 0x89, 0x4a, 0xde, 0x87, 0x9b, 0x63, 0xfa, 0xea, 0x68, 0xd4, 0x80, 0xad, 0x2e, 0x33, 0x2c, 0xcb, 0x21, 0x48, 0xf, 0x82, 0x67, 0x98, 0xae, 0xc8, 0x4d, 0x90, 0x82, 0xb9, 0x28, 0xd4, 0x55, 0xea, 0x30, 0x41, 0x11, 0x42, 0x49, 0x36, 0xf5, 0x55, 0xb2, 0x92, 0x48, 0x47, 0xec, 0xc7, 0x25, 0xa, 0x93, 0xba, 0xf4, 0x3c, 0xe1, 0x56, 0x9b, 0x7f, 0x8a, 0x27, 0xdb, 0x45, 0x4c, 0x9e, 0xfc, 0xbd, 0x49, 0x63, 0x97, 0xaf, 0xe, 0x58, 0x9f, 0xc2, 0x7d, 0x26, 0xaa, 0x80, 0xcd, 0x80, 0xc0, 0x8b, 0x8c, 0x9d, 0xeb, 0x2e, 0xda, 0x8a, 0x79, 0x81, 0xe8, 0xf8, 0xd5, 0x37, 0x3a, 0xf4, 0x39, 0x67, 0xad, 0xdd, 0xd1, 0x7a, 0x71, 0xa9, 0xb4, 0xd3, 0xbd, 0xa4, 0x75, 0xd3, 0x94, 0x97, 0x6c, 0x3f, 0xba, 0x98, 0x42, 0x73, 0x7f}; +const unsigned char JH512_H0[128] = {0x6f, 0xd1, 0x4b, 0x96, 0x3e, 0x0, 0xaa, 0x17, 0x63, 0x6a, 0x2e, 0x5, 0x7a, 0x15, 0xd5, 0x43, 0x8a, 0x22, 0x5e, 0x8d, 0xc, 0x97, 0xef, 0xb, 0xe9, 0x34, 0x12, 0x59, 0xf2, 0xb3, 0xc3, 0x61, 0x89, 0x1d, 0xa0, 0xc1, 0x53, 0x6f, 0x80, 0x1e, 0x2a, 0xa9, 0x5, 0x6b, 0xea, 0x2b, 0x6d, 0x80, 0x58, 0x8e, 0xcc, 0xdb, 0x20, 0x75, 0xba, 0xa6, 0xa9, 0xf, 0x3a, 0x76, 0xba, 0xf8, 0x3b, 0xf7, 0x1, 0x69, 0xe6, 0x5, 0x41, 0xe3, 0x4a, 0x69, 0x46, 0xb5, 0x8a, 0x8e, 0x2e, 0x6f, 0xe6, 0x5a, 0x10, 0x47, 0xa7, 0xd0, 0xc1, 0x84, 0x3c, 0x24, 0x3b, 0x6e, 0x71, 0xb1, 0x2d, 0x5a, 0xc1, 0x99, 0xcf, 0x57, 0xf6, 0xec, 0x9d, 0xb1, 0xf8, 0x56, 0xa7, 0x6, 0x88, 0x7c, 0x57, 0x16, 0xb1, 0x56, 0xe3, 0xc2, 0xfc, 0xdf, 0xe6, 0x85, 0x17, 0xfb, 0x54, 0x5a, 0x46, 0x78, 0xcc, 0x8c, 0xdd, 0x4b}; /*42 round constants, each round constant is 32-byte (256-bit)*/ -const unsigned char E8_bitslice_roundconstant[42][32]={ 
-{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40}, -{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31}, -{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc}, -{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3}, -{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23}, -{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97}, -{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14}, -{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4}, -{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36}, -{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f}, -{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b}, -{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62}, 
-{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5}, -{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f}, -{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a}, -{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf}, -{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0}, -{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a}, -{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6}, -{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67}, -{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18}, -{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e}, -{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1}, -{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83}, 
-{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef}, -{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65}, -{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c}, -{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71}, -{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0}, -{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f}, -{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad}, -{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6}, -{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63}, -{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f}, -{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a}, -{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5}, 
-{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48}, -{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e}, -{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7}, -{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde}, -{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a}, -{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}}; - - -static void E8(hashState *state); /*The bijective function E8, in bitslice form*/ -static void F8(hashState *state); /*The compression function F8 */ +const unsigned char E8_bitslice_roundconstant[42][32] = { + {0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40}, + {0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31}, + {0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc}, + {0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3}, + {0x42, 0x2d, 0x5a, 0xd, 0xf6, 
0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23}, + {0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97}, + {0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14}, + {0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4}, + {0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36}, + {0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f}, + {0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b}, + {0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62}, + {0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5}, + {0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f}, + {0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 
0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a}, + {0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf}, + {0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0}, + {0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a}, + {0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6}, + {0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67}, + {0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18}, + {0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e}, + {0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1}, + {0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83}, + {0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 
0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef}, + {0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65}, + {0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c}, + {0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71}, + {0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0}, + {0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f}, + {0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad}, + {0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6}, + {0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63}, + {0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f}, + {0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a}, + 
{0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5}, + {0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48}, + {0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e}, + {0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7}, + {0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde}, + {0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a}, + {0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}}; + +static void E8(hashState* state); /*The bijective function E8, in bitslice form*/ +static void F8(hashState* state); /*The compression function F8 */ /*The API functions*/ -static HashReturn Init(hashState *state, int hashbitlen); -static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen); -static HashReturn Final(hashState *state, BitSequence *hashval); -HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval); +static HashReturn Init(hashState* state, int hashbitlen); +static HashReturn Update(hashState* state, const BitSequence* data, DataLength 
databitlen); +static HashReturn Final(hashState* state, BitSequence* hashval); +HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval); /*swapping bit 2i with bit 2i+1 of 64-bit x*/ -#define SWAP1(x) (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1)); +#define SWAP1(x) (x) = ((((x)&0x5555555555555555ULL) << 1) | (((x)&0xaaaaaaaaaaaaaaaaULL) >> 1)); /*swapping bits 4i||4i+1 with bits 4i+2||4i+3 of 64-bit x*/ -#define SWAP2(x) (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2)); +#define SWAP2(x) (x) = ((((x)&0x3333333333333333ULL) << 2) | (((x)&0xccccccccccccccccULL) >> 2)); /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of 64-bit x*/ -#define SWAP4(x) (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4)); +#define SWAP4(x) (x) = ((((x)&0x0f0f0f0f0f0f0f0fULL) << 4) | (((x)&0xf0f0f0f0f0f0f0f0ULL) >> 4)); /*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 64-bit x*/ -#define SWAP8(x) (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8)); +#define SWAP8(x) (x) = ((((x)&0x00ff00ff00ff00ffULL) << 8) | (((x)&0xff00ff00ff00ff00ULL) >> 8)); /*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 64-bit x*/ -#define SWAP16(x) (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16)); +#define SWAP16(x) (x) = ((((x)&0x0000ffff0000ffffULL) << 16) | (((x)&0xffff0000ffff0000ULL) >> 16)); /*swapping bits 64i||64i+1||......||64i+31 with bits 64i+32||64i+33||......||64i+63 of 64-bit x*/ -#define SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); +#define SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); /*The MDS transform*/ -#define L(m0,m1,m2,m3,m4,m5,m6,m7) \ - (m4) ^= (m1); \ - (m5) ^= (m2); \ - (m6) ^= (m0) ^ (m3); \ - (m7) ^= (m0); \ - (m0) ^= (m5); \ - (m1) ^= (m6); \ - (m2) ^= (m4) ^ (m7); \ - (m3) 
^= (m4); +#define L(m0, m1, m2, m3, m4, m5, m6, m7) \ + (m4) ^= (m1); \ + (m5) ^= (m2); \ + (m6) ^= (m0) ^ (m3); \ + (m7) ^= (m0); \ + (m0) ^= (m5); \ + (m1) ^= (m6); \ + (m2) ^= (m4) ^ (m7); \ + (m3) ^= (m4); /*Two Sboxes are computed in parallel, each Sbox implements S0 and S1, selected by a constant bit*/ /*The reason to compute two Sboxes in parallel is to try to fully utilize the parallel processing power*/ -#define SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1) \ - m3 = ~(m3); \ - m7 = ~(m7); \ - m0 ^= ((~(m2)) & (cc0)); \ - m4 ^= ((~(m6)) & (cc1)); \ - temp0 = (cc0) ^ ((m0) & (m1));\ - temp1 = (cc1) ^ ((m4) & (m5));\ - m0 ^= ((m2) & (m3)); \ - m4 ^= ((m6) & (m7)); \ - m3 ^= ((~(m1)) & (m2)); \ - m7 ^= ((~(m5)) & (m6)); \ - m1 ^= ((m0) & (m2)); \ - m5 ^= ((m4) & (m6)); \ - m2 ^= ((m0) & (~(m3))); \ - m6 ^= ((m4) & (~(m7))); \ - m0 ^= ((m1) | (m3)); \ - m4 ^= ((m5) | (m7)); \ - m3 ^= ((m1) & (m2)); \ - m7 ^= ((m5) & (m6)); \ - m1 ^= (temp0 & (m0)); \ - m5 ^= (temp1 & (m4)); \ - m2 ^= temp0; \ - m6 ^= temp1; +#define SS(m0, m1, m2, m3, m4, m5, m6, m7, cc0, cc1) \ + m3 = ~(m3); \ + m7 = ~(m7); \ + m0 ^= ((~(m2)) & (cc0)); \ + m4 ^= ((~(m6)) & (cc1)); \ + temp0 = (cc0) ^ ((m0) & (m1)); \ + temp1 = (cc1) ^ ((m4) & (m5)); \ + m0 ^= ((m2) & (m3)); \ + m4 ^= ((m6) & (m7)); \ + m3 ^= ((~(m1)) & (m2)); \ + m7 ^= ((~(m5)) & (m6)); \ + m1 ^= ((m0) & (m2)); \ + m5 ^= ((m4) & (m6)); \ + m2 ^= ((m0) & (~(m3))); \ + m6 ^= ((m4) & (~(m7))); \ + m0 ^= ((m1) | (m3)); \ + m4 ^= ((m5) | (m7)); \ + m3 ^= ((m1) & (m2)); \ + m7 ^= ((m5) & (m6)); \ + m1 ^= (temp0 & (m0)); \ + m5 ^= (temp1 & (m4)); \ + m2 ^= temp0; \ + m6 ^= temp1; /*The bijective function E8, in bitslice form*/ -static void E8(hashState *state) +static void E8(hashState* state) { - uint64 i,roundnumber,temp0,temp1; - - for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) { - /*round 7*roundnumber+0: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - 
SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP1(state->x[1][i]); SWAP1(state->x[3][i]); SWAP1(state->x[5][i]); SWAP1(state->x[7][i]); - } - - /*round 7*roundnumber+1: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP2(state->x[1][i]); SWAP2(state->x[3][i]); SWAP2(state->x[5][i]); SWAP2(state->x[7][i]); - } - - /*round 7*roundnumber+2: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP4(state->x[1][i]); SWAP4(state->x[3][i]); SWAP4(state->x[5][i]); SWAP4(state->x[7][i]); - } - - /*round 7*roundnumber+3: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i+2] ); - 
L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP8(state->x[1][i]); SWAP8(state->x[3][i]); SWAP8(state->x[5][i]); SWAP8(state->x[7][i]); - } - - /*round 7*roundnumber+4: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP16(state->x[1][i]); SWAP16(state->x[3][i]); SWAP16(state->x[5][i]); SWAP16(state->x[7][i]); - } - - /*round 7*roundnumber+5: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP32(state->x[1][i]); SWAP32(state->x[3][i]); SWAP32(state->x[5][i]); SWAP32(state->x[7][i]); - } - - /*round 7*roundnumber+6: Sbox and MDS layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - } - /*round 7*roundnumber+6: swapping layer*/ - for (i = 1; i < 8; i = i+2) { - temp0 = state->x[i][0]; state->x[i][0] = state->x[i][1]; state->x[i][1] = temp0; - } - } - + uint64 i, roundnumber, temp0, temp1; + + 
for(roundnumber = 0; roundnumber < 42; roundnumber = roundnumber + 7) + { + /*round 7*roundnumber+0: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 0])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 0])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP1(state->x[1][i]); + SWAP1(state->x[3][i]); + SWAP1(state->x[5][i]); + SWAP1(state->x[7][i]); + } + + /*round 7*roundnumber+1: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 1])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 1])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP2(state->x[1][i]); + SWAP2(state->x[3][i]); + SWAP2(state->x[5][i]); + SWAP2(state->x[7][i]); + } + + /*round 7*roundnumber+2: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 2])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 2])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP4(state->x[1][i]); + SWAP4(state->x[3][i]); + SWAP4(state->x[5][i]); + SWAP4(state->x[7][i]); + } + + /*round 7*roundnumber+3: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], 
state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 3])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 3])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP8(state->x[1][i]); + SWAP8(state->x[3][i]); + SWAP8(state->x[5][i]); + SWAP8(state->x[7][i]); + } + + /*round 7*roundnumber+4: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 4])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 4])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP16(state->x[1][i]); + SWAP16(state->x[3][i]); + SWAP16(state->x[5][i]); + SWAP16(state->x[7][i]); + } + + /*round 7*roundnumber+5: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 5])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 5])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP32(state->x[1][i]); + SWAP32(state->x[3][i]); + SWAP32(state->x[5][i]); + SWAP32(state->x[7][i]); + } + + /*round 7*roundnumber+6: Sbox and MDS layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 6])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 6])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], 
state->x[3][i], state->x[5][i], state->x[7][i]); + } + /*round 7*roundnumber+6: swapping layer*/ + for(i = 1; i < 8; i = i + 2) + { + temp0 = state->x[i][0]; + state->x[i][0] = state->x[i][1]; + state->x[i][1] = temp0; + } + } } /*The compression function F8 */ -static void F8(hashState *state) +static void F8(hashState* state) { - uint64 i; + uint64 i; - /*xor the 512-bit message with the fist half of the 1024-bit hash state*/ - for (i = 0; i < 8; i++) state->x[i >> 1][i & 1] ^= ((uint64*)state->buffer)[i]; + /*xor the 512-bit message with the fist half of the 1024-bit hash state*/ + for(i = 0; i < 8; i++) + state->x[i >> 1][i & 1] ^= ((uint64*)state->buffer)[i]; - /*the bijective function E8 */ - E8(state); + /*the bijective function E8 */ + E8(state); - /*xor the 512-bit message with the second half of the 1024-bit hash state*/ - for (i = 0; i < 8; i++) state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64*)state->buffer)[i]; + /*xor the 512-bit message with the second half of the 1024-bit hash state*/ + for(i = 0; i < 8; i++) + state->x[(8 + i) >> 1][(8 + i) & 1] ^= ((uint64*)state->buffer)[i]; } /*before hashing a message, initialize the hash state as H0 */ -static HashReturn Init(hashState *state, int hashbitlen) +static HashReturn Init(hashState* state, int hashbitlen) { - state->databitlen = 0; - state->datasize_in_buffer = 0; - - /*initialize the initial hash value of JH*/ - state->hashbitlen = hashbitlen; - - /*load the initial hash value into state*/ - switch (hashbitlen) - { - case 224: memcpy(state->x,JH224_H0,128); break; - case 256: memcpy(state->x,JH256_H0,128); break; - case 384: memcpy(state->x,JH384_H0,128); break; - case 512: memcpy(state->x,JH512_H0,128); break; - } - - return(SUCCESS); + state->databitlen = 0; + state->datasize_in_buffer = 0; + + /*initialize the initial hash value of JH*/ + state->hashbitlen = hashbitlen; + + /*load the initial hash value into state*/ + switch(hashbitlen) + { + case 224: + memcpy(state->x, JH224_H0, 128); + break; + 
case 256: + memcpy(state->x, JH256_H0, 128); + break; + case 384: + memcpy(state->x, JH384_H0, 128); + break; + case 512: + memcpy(state->x, JH512_H0, 128); + break; + } + + return (SUCCESS); } - /*hash each 512-bit message block, except the last partial block*/ -static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen) +static HashReturn Update(hashState* state, const BitSequence* data, DataLength databitlen) { - DataLength index; /*the starting address of the data to be compressed*/ - - state->databitlen += databitlen; - index = 0; - - /*if there is remaining data in the buffer, fill it to a full message block first*/ - /*we assume that the size of the data in the buffer is the multiple of 8 bits if it is not at the end of a message*/ - - /*There is data in the buffer, but the incoming data is insufficient for a full block*/ - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) { - if ( (databitlen & 7) == 0 ) { - memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)) ; - } - else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1) ; - state->datasize_in_buffer += databitlen; - databitlen = 0; - } - - /*There is data in the buffer, and the incoming data is sufficient for a full block*/ - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) { - memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ; - index = 64-(state->datasize_in_buffer >> 3); - databitlen = databitlen - (512 - state->datasize_in_buffer); - F8(state); - state->datasize_in_buffer = 0; - } - - /*hash the remaining full message blocks*/ - for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) { - memcpy(state->buffer, data+index, 64); - F8(state); - } - - /*store the partial block into buffer, assume that -- if part of the 
last byte is not part of the message, then that part consists of 0 bits*/ - if ( databitlen > 0) { - if ((databitlen & 7) == 0) - memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3); - else - memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1); - state->datasize_in_buffer = databitlen; - } - - return(SUCCESS); + DataLength index; /*the starting address of the data to be compressed*/ + + state->databitlen += databitlen; + index = 0; + + /*if there is remaining data in the buffer, fill it to a full message block first*/ + /*we assume that the size of the data in the buffer is the multiple of 8 bits if it is not at the end of a message*/ + + /*There is data in the buffer, but the incoming data is insufficient for a full block*/ + if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) < 512)) + { + if((databitlen & 7) == 0) + { + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3)); + } + else + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3) + 1); + state->datasize_in_buffer += databitlen; + databitlen = 0; + } + + /*There is data in the buffer, and the incoming data is sufficient for a full block*/ + if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) >= 512)) + { + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3)); + index = 64 - (state->datasize_in_buffer >> 3); + databitlen = databitlen - (512 - state->datasize_in_buffer); + F8(state); + state->datasize_in_buffer = 0; + } + + /*hash the remaining full message blocks*/ + for(; databitlen >= 512; index = index + 64, databitlen = databitlen - 512) + { + memcpy(state->buffer, data + index, 64); + F8(state); + } + + /*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/ + if(databitlen > 0) + { + 
if((databitlen & 7) == 0) + memcpy(state->buffer, data + index, (databitlen & 0x1ff) >> 3); + else + memcpy(state->buffer, data + index, ((databitlen & 0x1ff) >> 3) + 1); + state->datasize_in_buffer = databitlen; + } + + return (SUCCESS); } /*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/ -static HashReturn Final(hashState *state, BitSequence *hashval) +static HashReturn Final(hashState* state, BitSequence* hashval) { - unsigned int i; - - if ( (state->databitlen & 0x1ff) == 0 ) { - /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ - memset(state->buffer, 0, 64); - state->buffer[0] = 0x80; - state->buffer[63] = state->databitlen & 0xff; - state->buffer[62] = (state->databitlen >> 8) & 0xff; - state->buffer[61] = (state->databitlen >> 16) & 0xff; - state->buffer[60] = (state->databitlen >> 24) & 0xff; - state->buffer[59] = (state->databitlen >> 32) & 0xff; - state->buffer[58] = (state->databitlen >> 40) & 0xff; - state->buffer[57] = (state->databitlen >> 48) & 0xff; - state->buffer[56] = (state->databitlen >> 56) & 0xff; - F8(state); - } - else { - /*set the rest of the bytes in the buffer to 0*/ - if ( (state->datasize_in_buffer & 7) == 0) - for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0; - else - for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) state->buffer[i] = 0; - - /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ - state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7)); - - F8(state); - memset(state->buffer, 0, 64); - state->buffer[63] = state->databitlen & 0xff; - state->buffer[62] = (state->databitlen >> 8) & 0xff; - state->buffer[61] = (state->databitlen >> 16) & 0xff; - state->buffer[60] = (state->databitlen >> 24) & 0xff; - state->buffer[59] = (state->databitlen >> 32) & 0xff; - state->buffer[58] = (state->databitlen >> 40) & 
0xff; - state->buffer[57] = (state->databitlen >> 48) & 0xff; - state->buffer[56] = (state->databitlen >> 56) & 0xff; - F8(state); - } - - /*truncating the final hash value to generate the message digest*/ - switch(state->hashbitlen) { - case 224: memcpy(hashval,(unsigned char*)state->x+64+36,28); break; - case 256: memcpy(hashval,(unsigned char*)state->x+64+32,32); break; - case 384: memcpy(hashval,(unsigned char*)state->x+64+16,48); break; - case 512: memcpy(hashval,(unsigned char*)state->x+64,64); break; - } - - return(SUCCESS); + unsigned int i; + + if((state->databitlen & 0x1ff) == 0) + { + /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ + memset(state->buffer, 0, 64); + state->buffer[0] = 0x80; + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen >> 32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + F8(state); + } + else + { + /*set the rest of the bytes in the buffer to 0*/ + if((state->datasize_in_buffer & 7) == 0) + for(i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) + state->buffer[i] = 0; + else + for(i = ((state->databitlen & 0x1ff) >> 3) + 1; i < 64; i++) + state->buffer[i] = 0; + + /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ + state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7 - (state->databitlen & 7)); + + F8(state); + memset(state->buffer, 0, 64); + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen 
>> 32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + F8(state); + } + + /*truncating the final hash value to generate the message digest*/ + switch(state->hashbitlen) + { + case 224: + memcpy(hashval, (unsigned char*)state->x + 64 + 36, 28); + break; + case 256: + memcpy(hashval, (unsigned char*)state->x + 64 + 32, 32); + break; + case 384: + memcpy(hashval, (unsigned char*)state->x + 64 + 16, 48); + break; + case 512: + memcpy(hashval, (unsigned char*)state->x + 64, 64); + break; + } + + return (SUCCESS); } /* hash a message, three inputs: message digest size in bits (hashbitlen); message (data); message length in bits (databitlen) one output: message digest (hashval) */ -HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval) +HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval) { - hashState state; - - if ( hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512 ) { - Init(&state, hashbitlen); - Update(&state, data, databitlen); - Final(&state, hashval); - return SUCCESS; - } - else - return(BAD_HASHLEN); + hashState state; + + if(hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512) + { + Init(&state, hashbitlen); + Update(&state, data, databitlen); + Final(&state, hashval); + return SUCCESS; + } + else + return (BAD_HASHLEN); } diff --git a/xmrstak/backend/cpu/crypto/c_jh.h b/xmrstak/backend/cpu/crypto/c_jh.h index d10d40fe5..34d30e6b4 100644 --- a/xmrstak/backend/cpu/crypto/c_jh.h +++ b/xmrstak/backend/cpu/crypto/c_jh.h @@ -16,4 +16,4 @@ #include "hash.h" -HashReturn jh_hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval); +HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval); diff --git 
a/xmrstak/backend/cpu/crypto/c_keccak.c b/xmrstak/backend/cpu/crypto/c_keccak.c index 63c16147d..0af6b02ef 100644 --- a/xmrstak/backend/cpu/crypto/c_keccak.c +++ b/xmrstak/backend/cpu/crypto/c_keccak.c @@ -2,8 +2,8 @@ // 19-Nov-11 Markku-Juhani O. Saarinen // A baseline Keccak (3rd round) implementation. -#include #include +#include #define HASH_DATA_AREA 136 #define KECCAK_ROUNDS 24 @@ -13,16 +13,15 @@ #endif const uint64_t keccakf_rndc[24] = -{ - 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, - 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, - 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, - 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, - 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, - 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, - 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, - 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 -}; + { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008}; // update the state with given number of rounds @@ -31,7 +30,8 @@ void keccakf(uint64_t st[25], int rounds) int i, j, round; uint64_t t, bc[5]; - for (round = 0; round < rounds; ++round) { + for(round = 0; round < rounds; ++round) + { // Theta bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; @@ -40,10 +40,11 @@ void keccakf(uint64_t st[25], int rounds) bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; - for (i = 0; i < 5; ++i) { + for(i = 0; i < 5; ++i) + { t = bc[(i + 4) % 5] ^ ROTL64(bc[(i 
+ 1) % 5], 1); - st[i ] ^= t; - st[i + 5] ^= t; + st[i] ^= t; + st[i + 5] ^= t; st[i + 10] ^= t; st[i + 15] ^= t; st[i + 20] ^= t; @@ -51,81 +52,81 @@ void keccakf(uint64_t st[25], int rounds) // Rho Pi t = st[1]; - st[ 1] = ROTL64(st[ 6], 44); - st[ 6] = ROTL64(st[ 9], 20); - st[ 9] = ROTL64(st[22], 61); + st[1] = ROTL64(st[6], 44); + st[6] = ROTL64(st[9], 20); + st[9] = ROTL64(st[22], 61); st[22] = ROTL64(st[14], 39); st[14] = ROTL64(st[20], 18); - st[20] = ROTL64(st[ 2], 62); - st[ 2] = ROTL64(st[12], 43); + st[20] = ROTL64(st[2], 62); + st[2] = ROTL64(st[12], 43); st[12] = ROTL64(st[13], 25); - st[13] = ROTL64(st[19], 8); + st[13] = ROTL64(st[19], 8); st[19] = ROTL64(st[23], 56); st[23] = ROTL64(st[15], 41); - st[15] = ROTL64(st[ 4], 27); - st[ 4] = ROTL64(st[24], 14); - st[24] = ROTL64(st[21], 2); - st[21] = ROTL64(st[ 8], 55); - st[ 8] = ROTL64(st[16], 45); - st[16] = ROTL64(st[ 5], 36); - st[ 5] = ROTL64(st[ 3], 28); - st[ 3] = ROTL64(st[18], 21); + st[15] = ROTL64(st[4], 27); + st[4] = ROTL64(st[24], 14); + st[24] = ROTL64(st[21], 2); + st[21] = ROTL64(st[8], 55); + st[8] = ROTL64(st[16], 45); + st[16] = ROTL64(st[5], 36); + st[5] = ROTL64(st[3], 28); + st[3] = ROTL64(st[18], 21); st[18] = ROTL64(st[17], 15); st[17] = ROTL64(st[11], 10); - st[11] = ROTL64(st[ 7], 6); - st[ 7] = ROTL64(st[10], 3); + st[11] = ROTL64(st[7], 6); + st[7] = ROTL64(st[10], 3); st[10] = ROTL64(t, 1); // Chi // unrolled loop, where only last iteration is different j = 0; - bc[0] = st[j ]; + bc[0] = st[j]; bc[1] = st[j + 1]; - st[j ] ^= (~st[j + 1]) & st[j + 2]; + st[j] ^= (~st[j + 1]) & st[j + 2]; st[j + 1] ^= (~st[j + 2]) & st[j + 3]; st[j + 2] ^= (~st[j + 3]) & st[j + 4]; st[j + 3] ^= (~st[j + 4]) & bc[0]; st[j + 4] ^= (~bc[0]) & bc[1]; j = 5; - bc[0] = st[j ]; + bc[0] = st[j]; bc[1] = st[j + 1]; - st[j ] ^= (~st[j + 1]) & st[j + 2]; + st[j] ^= (~st[j + 1]) & st[j + 2]; st[j + 1] ^= (~st[j + 2]) & st[j + 3]; st[j + 2] ^= (~st[j + 3]) & st[j + 4]; st[j + 3] ^= (~st[j + 4]) & bc[0]; 
st[j + 4] ^= (~bc[0]) & bc[1]; j = 10; - bc[0] = st[j ]; + bc[0] = st[j]; bc[1] = st[j + 1]; - st[j ] ^= (~st[j + 1]) & st[j + 2]; + st[j] ^= (~st[j + 1]) & st[j + 2]; st[j + 1] ^= (~st[j + 2]) & st[j + 3]; st[j + 2] ^= (~st[j + 3]) & st[j + 4]; st[j + 3] ^= (~st[j + 4]) & bc[0]; st[j + 4] ^= (~bc[0]) & bc[1]; j = 15; - bc[0] = st[j ]; + bc[0] = st[j]; bc[1] = st[j + 1]; - st[j ] ^= (~st[j + 1]) & st[j + 2]; + st[j] ^= (~st[j + 1]) & st[j + 2]; st[j + 1] ^= (~st[j + 2]) & st[j + 3]; st[j + 2] ^= (~st[j + 3]) & st[j + 4]; st[j + 3] ^= (~st[j + 4]) & bc[0]; st[j + 4] ^= (~bc[0]) & bc[1]; j = 20; - bc[0] = st[j ]; + bc[0] = st[j]; bc[1] = st[j + 1]; bc[2] = st[j + 2]; bc[3] = st[j + 3]; bc[4] = st[j + 4]; - st[j ] ^= (~bc[1]) & bc[2]; + st[j] ^= (~bc[1]) & bc[2]; st[j + 1] ^= (~bc[2]) & bc[3]; st[j + 2] ^= (~bc[3]) & bc[4]; st[j + 3] ^= (~bc[4]) & bc[0]; @@ -139,7 +140,7 @@ void keccakf(uint64_t st[25], int rounds) // compute a keccak hash (md) of given byte length from "in" typedef uint64_t state_t[25]; -void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen) +void keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen) { state_t st; uint8_t temp[144]; @@ -150,9 +151,10 @@ void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen) memset(st, 0, sizeof(st)); - for ( ; inlen >= rsiz; inlen -= rsiz, in += rsiz) { - for (i = 0; i < rsizw; i++) - st[i] ^= ((uint64_t *) in)[i]; + for(; inlen >= rsiz; inlen -= rsiz, in += rsiz) + { + for(i = 0; i < rsizw; i++) + st[i] ^= ((uint64_t*)in)[i]; keccakf(st, KECCAK_ROUNDS); } @@ -162,15 +164,15 @@ void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen) memset(temp + inlen, 0, rsiz - inlen); temp[rsiz - 1] |= 0x80; - for (i = 0; i < rsizw; i++) - st[i] ^= ((uint64_t *) temp)[i]; + for(i = 0; i < rsizw; i++) + st[i] ^= ((uint64_t*)temp)[i]; keccakf(st, KECCAK_ROUNDS); memcpy(md, st, mdlen); } -void keccak1600(const uint8_t *in, int inlen, uint8_t *md) +void keccak1600(const uint8_t* in, int inlen, 
uint8_t* md) { keccak(in, inlen, md, sizeof(state_t)); } diff --git a/xmrstak/backend/cpu/crypto/c_keccak.h b/xmrstak/backend/cpu/crypto/c_keccak.h index 4f7f85729..b7a26065e 100644 --- a/xmrstak/backend/cpu/crypto/c_keccak.h +++ b/xmrstak/backend/cpu/crypto/c_keccak.h @@ -16,11 +16,11 @@ #endif // compute a keccak hash (md) of given byte length from "in" -int keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen); +int keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen); // update the state void keccakf(uint64_t st[25], int norounds); -void keccak1600(const uint8_t *in, int inlen, uint8_t *md); +void keccak1600(const uint8_t* in, int inlen, uint8_t* md); #endif diff --git a/xmrstak/backend/cpu/crypto/c_skein.c b/xmrstak/backend/cpu/crypto/c_skein.c index e2d54425f..4b8cbb388 100644 --- a/xmrstak/backend/cpu/crypto/c_skein.c +++ b/xmrstak/backend/cpu/crypto/c_skein.c @@ -8,11 +8,11 @@ ** ************************************************************************/ -#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */ +#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */ -#include /* get size_t definition */ -#include /* get the memcpy/memset functions */ -#include "c_skein.h" /* get the Skein API definitions */ +#include "c_skein.h" /* get the Skein API definitions */ +#include /* get size_t definition */ +#include /* get the memcpy/memset functions */ #define DISABLE_UNUSED 0 @@ -24,72 +24,72 @@ #define SKEIN_512_NIST_MAX_HASHBITS (512) #endif -#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ +#define SKEIN_MODIFIER_WORDS (2) /* number of modifier (tweak) words */ -#define SKEIN_256_STATE_WORDS ( 4) -#define SKEIN_512_STATE_WORDS ( 8) -#define SKEIN1024_STATE_WORDS (16) -#define SKEIN_MAX_STATE_WORDS (16) +#define SKEIN_256_STATE_WORDS (4) +#define SKEIN_512_STATE_WORDS (8) +#define SKEIN1024_STATE_WORDS (16) +#define SKEIN_MAX_STATE_WORDS (16) -#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) 
-#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) +#define SKEIN_256_STATE_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES (8 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES (8 * SKEIN1024_STATE_WORDS) -#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) -#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS) +#define SKEIN_256_STATE_BITS (64 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS (64 * SKEIN1024_STATE_WORDS) -#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) -#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) +#define SKEIN_256_BLOCK_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES (8 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES (8 * SKEIN1024_STATE_WORDS) -#define SKEIN_RND_SPECIAL (1000u) -#define SKEIN_RND_KEY_INITIAL (SKEIN_RND_SPECIAL+0u) -#define SKEIN_RND_KEY_INJECT (SKEIN_RND_SPECIAL+1u) -#define SKEIN_RND_FEED_FWD (SKEIN_RND_SPECIAL+2u) +#define SKEIN_RND_SPECIAL (1000u) +#define SKEIN_RND_KEY_INITIAL (SKEIN_RND_SPECIAL + 0u) +#define SKEIN_RND_KEY_INJECT (SKEIN_RND_SPECIAL + 1u) +#define SKEIN_RND_FEED_FWD (SKEIN_RND_SPECIAL + 2u) typedef struct { - size_t hashBitLen; /* size of hash result, in bits */ - size_t bCnt; /* current byte count in buffer b[] */ - u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */ + size_t hashBitLen; /* size of hash result, in bits */ + size_t bCnt; /* current byte count in buffer b[] */ + u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */ } Skein_Ctxt_Hdr_t; -typedef struct /* 256-bit Skein hash context structure */ +typedef struct /* 256-bit Skein hash context structure */ { - Skein_Ctxt_Hdr_t h; /* common header context variables */ - u64b_t 
X[SKEIN_256_STATE_WORDS]; /* chaining variables */ - u08b_t b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_256_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ } Skein_256_Ctxt_t; -typedef struct /* 512-bit Skein hash context structure */ +typedef struct /* 512-bit Skein hash context structure */ { - Skein_Ctxt_Hdr_t h; /* common header context variables */ - u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ - u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ } Skein_512_Ctxt_t; -typedef struct /* 1024-bit Skein hash context structure */ +typedef struct /* 1024-bit Skein hash context structure */ { - Skein_Ctxt_Hdr_t h; /* common header context variables */ - u64b_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */ - u08b_t b[SKEIN1024_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN1024_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ } Skein1024_Ctxt_t; /* Skein APIs for (incremental) "straight hashing" */ #if SKEIN_256_NIST_MAX_HASH_BITS -static int Skein_256_Init (Skein_256_Ctxt_t *ctx, size_t hashBitLen); +static int Skein_256_Init(Skein_256_Ctxt_t* ctx, size_t hashBitLen); #endif -static int Skein_512_Init (Skein_512_Ctxt_t *ctx, size_t hashBitLen); -static int Skein1024_Init (Skein1024_Ctxt_t *ctx, size_t hashBitLen); +static int Skein_512_Init(Skein_512_Ctxt_t* ctx, size_t hashBitLen); +static int Skein1024_Init(Skein1024_Ctxt_t* ctx, size_t hashBitLen); -static int Skein_256_Update(Skein_256_Ctxt_t *ctx, 
const u08b_t *msg, size_t msgByteCnt); -static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); -static int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); +static int Skein_256_Update(Skein_256_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt); +static int Skein_512_Update(Skein_512_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt); +static int Skein1024_Update(Skein1024_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt); -static int Skein_256_Final (Skein_256_Ctxt_t *ctx, u08b_t * hashVal); -static int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); -static int Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); +static int Skein_256_Final(Skein_256_Ctxt_t* ctx, u08b_t* hashVal); +static int Skein_512_Final(Skein_512_Ctxt_t* ctx, u08b_t* hashVal); +static int Skein1024_Final(Skein1024_Ctxt_t* ctx, u08b_t* hashVal); /* ** Skein APIs for "extended" initialization: MAC keys, tree hashing. @@ -126,7 +126,7 @@ static int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t * hashVal); #define SKEIN_TREE_HASH (1) #endif #if 0 -#if SKEIN_TREE_HASH +#if SKEIN_TREE_HASH static int Skein_256_Output (Skein_256_Ctxt_t *ctx, u08b_t * hashVal); static int Skein_512_Output (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); static int Skein1024_Output (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); @@ -142,128 +142,146 @@ static int Skein1024_Output (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); ******************************************************************/ /* tweak word T[1]: bit field starting positions */ -#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ +#define SKEIN_T1_BIT(BIT) ((BIT)-64) /* offset 64 because it's the second word */ -#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree */ -#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ -#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 
120..125: type field */ -#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ -#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ +#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ /* tweak word T[1]: flag bit definition(s) */ -#define SKEIN_T1_FLAG_FIRST (((u64b_t) 1 ) << SKEIN_T1_POS_FIRST) -#define SKEIN_T1_FLAG_FINAL (((u64b_t) 1 ) << SKEIN_T1_POS_FINAL) -#define SKEIN_T1_FLAG_BIT_PAD (((u64b_t) 1 ) << SKEIN_T1_POS_BIT_PAD) +#define SKEIN_T1_FLAG_FIRST (((u64b_t)1) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_FINAL (((u64b_t)1) << SKEIN_T1_POS_FINAL) +#define SKEIN_T1_FLAG_BIT_PAD (((u64b_t)1) << SKEIN_T1_POS_BIT_PAD) /* tweak word T[1]: tree level bit field mask */ -#define SKEIN_T1_TREE_LVL_MASK (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL) -#define SKEIN_T1_TREE_LEVEL(n) (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL) +#define SKEIN_T1_TREE_LVL_MASK (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL) +#define SKEIN_T1_TREE_LEVEL(n) (((u64b_t)(n)) << SKEIN_T1_POS_TREE_LVL) /* tweak word T[1]: block type field */ -#define SKEIN_BLK_TYPE_KEY ( 0) /* key, for MAC and KDF */ -#define SKEIN_BLK_TYPE_CFG ( 4) /* configuration block */ -#define SKEIN_BLK_TYPE_PERS ( 8) /* personalization string */ -#define SKEIN_BLK_TYPE_PK (12) /* public key (for digital signature hashing) */ -#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */ -#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */ -#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ -#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ -#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */ - -#define 
SKEIN_T1_BLK_TYPE(T) (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) -#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY) /* key, for MAC and KDF */ -#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG) /* configuration block */ -#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS) /* personalization string */ -#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK) /* public key (for digital signature hashing) */ -#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF) /* key identifier for KDF */ -#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */ -#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ -#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ -#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */ - -#define SKEIN_T1_BLK_TYPE_CFG_FINAL (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL) -#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) - -#define SKEIN_VERSION (1) - -#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */ -#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/ -#endif - -#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((u64b_t) (hi32)) << 32)) -#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE) -#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) - -#define SKEIN_CFG_STR_LEN (4*8) +#define SKEIN_BLK_TYPE_KEY (0) /* key, for MAC and KDF */ +#define SKEIN_BLK_TYPE_CFG (4) /* configuration block */ +#define SKEIN_BLK_TYPE_PERS (8) /* personalization string */ +#define SKEIN_BLK_TYPE_PK (12) /* public key (for digital signature hashing) */ +#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */ +#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */ +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ +#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */ + +#define SKEIN_T1_BLK_TYPE(T) 
(((u64b_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) +#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY) /* key, for MAC and KDF */ +#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG) /* configuration block */ +#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS) /* personalization string */ +#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK) /* public key (for digital signature hashing) */ +#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF) /* key identifier for KDF */ +#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE) /* nonce for PRNG */ +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ +#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */ + +#define SKEIN_T1_BLK_TYPE_CFG_FINAL (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL) +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define SKEIN_VERSION (1) + +#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */ +#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/ +#endif + +#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((u64b_t)(hi32)) << 32)) +#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION, SKEIN_ID_STRING_LE) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22) + +#define SKEIN_CFG_STR_LEN (4 * 8) /* bit field definitions in config block treeInfo word */ -#define SKEIN_CFG_TREE_LEAF_SIZE_POS ( 0) -#define SKEIN_CFG_TREE_NODE_SIZE_POS ( 8) -#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16) +#define SKEIN_CFG_TREE_LEAF_SIZE_POS (0) +#define SKEIN_CFG_TREE_NODE_SIZE_POS (8) +#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16) -#define SKEIN_CFG_TREE_LEAF_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS) -#define SKEIN_CFG_TREE_NODE_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS) -#define SKEIN_CFG_TREE_MAX_LEVEL_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS) +#define SKEIN_CFG_TREE_LEAF_SIZE_MSK 
(((u64b_t)0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS) +#define SKEIN_CFG_TREE_NODE_SIZE_MSK (((u64b_t)0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS) +#define SKEIN_CFG_TREE_MAX_LEVEL_MSK (((u64b_t)0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS) -#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl) \ - ( (((u64b_t)(leaf )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \ - (((u64b_t)(node )) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \ - (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) ) +#define SKEIN_CFG_TREE_INFO(leaf, node, maxLvl) \ + ((((u64b_t)(leaf)) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \ + (((u64b_t)(node)) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \ + (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS)) -#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */ +#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0, 0, 0) /* use as treeInfo in InitExt() call for sequential processing */ /* ** Skein macros for getting/setting tweak words, etc. ** These are useful for partial input bytes, hash tree init/update, etc. 
**/ -#define Skein_Get_Tweak(ctxPtr,TWK_NUM) ((ctxPtr)->h.T[TWK_NUM]) -#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} +#define Skein_Get_Tweak(ctxPtr, TWK_NUM) ((ctxPtr)->h.T[TWK_NUM]) +#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \ + { \ + (ctxPtr)->h.T[TWK_NUM] = (tVal); \ + } -#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr,0) -#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr,1) -#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) -#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) +#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr, 0) +#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr, 1) +#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0) +#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1) /* set both tweak words at once */ -#define Skein_Set_T0_T1(ctxPtr,T0,T1) \ -{ \ - Skein_Set_T0(ctxPtr,(T0)); \ - Skein_Set_T1(ctxPtr,(T1)); \ -} +#define Skein_Set_T0_T1(ctxPtr, T0, T1) \ + { \ + Skein_Set_T0(ctxPtr, (T0)); \ + Skein_Set_T1(ctxPtr, (T1)); \ + } -#define Skein_Set_Type(ctxPtr,BLK_TYPE) \ - Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE) +#define Skein_Set_Type(ctxPtr, BLK_TYPE) \ + Skein_Set_T1(ctxPtr, SKEIN_T1_BLK_TYPE_##BLK_TYPE) /* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */ -#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ -{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } +#define Skein_Start_New_Type(ctxPtr, BLK_TYPE) \ + { \ + Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); \ + (ctxPtr)->h.bCnt = 0; \ + } -#define Skein_Clear_First_Flag(hdr) { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; } -#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } +#define Skein_Clear_First_Flag(hdr) \ + { \ + (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; \ + } +#define Skein_Set_Bit_Pad_Flag(hdr) \ + { \ + (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \ + } -#define 
Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);} +#define Skein_Set_Tree_Level(hdr, height) \ + { \ + (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height); \ + } /***************************************************************** ** "Internal" Skein definitions for debugging and error checking ******************************************************************/ -#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr) -#define Skein_Show_Round(bits,ctx,r,X) -#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr) -#define Skein_Show_Final(bits,ctx,cnt,outPtr) -#define Skein_Show_Key(bits,ctx,key,keyBytes) - - -#ifndef SKEIN_ERR_CHECK /* run-time checks (e.g., bad params, uninitialized context)? */ -#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */ +#define Skein_Show_Block(bits, ctx, X, blkPtr, wPtr, ksEvenPtr, ksOddPtr) +#define Skein_Show_Round(bits, ctx, r, X) +#define Skein_Show_R_Ptr(bits, ctx, r, X_ptr) +#define Skein_Show_Final(bits, ctx, cnt, outPtr) +#define Skein_Show_Key(bits, ctx, key, keyBytes) + +#ifndef SKEIN_ERR_CHECK /* run-time checks (e.g., bad params, uninitialized context)? 
 */ +#define Skein_Assert(x, retCode) /* default: ignore all Asserts, for performance */ #define Skein_assert(x) -#elif defined(SKEIN_ASSERT) +#elif defined(SKEIN_ASSERT) #include <assert.h> -#define Skein_Assert(x,retCode) assert(x) -#define Skein_assert(x) assert(x) +#define Skein_Assert(x, retCode) assert(x) +#define Skein_assert(x) assert(x) #else #include <assert.h> -#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /* caller error */ -#define Skein_assert(x) assert(x) /* internal error */ +#define Skein_Assert(x, retCode) \ + { \ + if(!(x)) \ + return retCode; \ + } /* caller error */ +#define Skein_assert(x) assert(x) /* internal error */ #endif /***************************************************************** @@ -271,48 +289,135 @@ static int Skein1024_Output (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); ******************************************************************/ enum { - /* Skein_256 round rotation constants */ - R_256_0_0=14, R_256_0_1=16, - R_256_1_0=52, R_256_1_1=57, - R_256_2_0=23, R_256_2_1=40, - R_256_3_0= 5, R_256_3_1=37, - R_256_4_0=25, R_256_4_1=33, - R_256_5_0=46, R_256_5_1=12, - R_256_6_0=58, R_256_6_1=22, - R_256_7_0=32, R_256_7_1=32, - - /* Skein_512 round rotation constants */ - R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, - R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, - R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, - R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, - R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, - R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, - R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, - R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22, - - /* Skein1024 round rotation constants */ - R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37, - R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52, - R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, 
R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17, - R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25, - R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30, - R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41, - R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25, - R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20 + /* Skein_256 round rotation constants */ + R_256_0_0 = 14, + R_256_0_1 = 16, + R_256_1_0 = 52, + R_256_1_1 = 57, + R_256_2_0 = 23, + R_256_2_1 = 40, + R_256_3_0 = 5, + R_256_3_1 = 37, + R_256_4_0 = 25, + R_256_4_1 = 33, + R_256_5_0 = 46, + R_256_5_1 = 12, + R_256_6_0 = 58, + R_256_6_1 = 22, + R_256_7_0 = 32, + R_256_7_1 = 32, + + /* Skein_512 round rotation constants */ + R_512_0_0 = 46, + R_512_0_1 = 36, + R_512_0_2 = 19, + R_512_0_3 = 37, + R_512_1_0 = 33, + R_512_1_1 = 27, + R_512_1_2 = 14, + R_512_1_3 = 42, + R_512_2_0 = 17, + R_512_2_1 = 49, + R_512_2_2 = 36, + R_512_2_3 = 39, + R_512_3_0 = 44, + R_512_3_1 = 9, + R_512_3_2 = 54, + R_512_3_3 = 56, + R_512_4_0 = 39, + R_512_4_1 = 30, + R_512_4_2 = 34, + R_512_4_3 = 24, + R_512_5_0 = 13, + R_512_5_1 = 50, + R_512_5_2 = 10, + R_512_5_3 = 17, + R_512_6_0 = 25, + R_512_6_1 = 29, + R_512_6_2 = 39, + R_512_6_3 = 43, + R_512_7_0 = 8, + R_512_7_1 = 35, + R_512_7_2 = 56, + R_512_7_3 = 22, + + /* Skein1024 round rotation constants */ + R1024_0_0 = 24, + R1024_0_1 = 13, + R1024_0_2 = 8, + R1024_0_3 = 47, + R1024_0_4 = 8, + R1024_0_5 = 17, + R1024_0_6 = 22, + R1024_0_7 = 37, + R1024_1_0 = 38, + R1024_1_1 = 19, + R1024_1_2 = 10, + R1024_1_3 = 55, + R1024_1_4 = 49, + R1024_1_5 = 18, + R1024_1_6 = 23, + R1024_1_7 = 52, + R1024_2_0 = 33, + R1024_2_1 = 4, + R1024_2_2 = 51, + R1024_2_3 = 13, + 
R1024_2_4 = 34, + R1024_2_5 = 41, + R1024_2_6 = 59, + R1024_2_7 = 17, + R1024_3_0 = 5, + R1024_3_1 = 20, + R1024_3_2 = 48, + R1024_3_3 = 41, + R1024_3_4 = 47, + R1024_3_5 = 28, + R1024_3_6 = 16, + R1024_3_7 = 25, + R1024_4_0 = 41, + R1024_4_1 = 9, + R1024_4_2 = 37, + R1024_4_3 = 31, + R1024_4_4 = 12, + R1024_4_5 = 47, + R1024_4_6 = 44, + R1024_4_7 = 30, + R1024_5_0 = 16, + R1024_5_1 = 34, + R1024_5_2 = 56, + R1024_5_3 = 51, + R1024_5_4 = 4, + R1024_5_5 = 53, + R1024_5_6 = 42, + R1024_5_7 = 41, + R1024_6_0 = 31, + R1024_6_1 = 44, + R1024_6_2 = 47, + R1024_6_3 = 46, + R1024_6_4 = 19, + R1024_6_5 = 42, + R1024_6_6 = 44, + R1024_6_7 = 25, + R1024_7_0 = 9, + R1024_7_1 = 48, + R1024_7_2 = 35, + R1024_7_3 = 52, + R1024_7_4 = 23, + R1024_7_5 = 31, + R1024_7_6 = 37, + R1024_7_7 = 20 }; #ifndef SKEIN_ROUNDS -#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */ +#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */ #define SKEIN_512_ROUNDS_TOTAL (72) #define SKEIN1024_ROUNDS_TOTAL (80) -#else /* allow command-line define in range 8*(5..14) */ -#define SKEIN_256_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/100) + 5) % 10) + 5)) -#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5)) -#define SKEIN1024_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS ) + 5) % 10) + 5)) +#else /* allow command-line define in range 8*(5..14) */ +#define SKEIN_256_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)) +#define SKEIN_512_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS / 10) + 5) % 10) + 5)) +#define SKEIN1024_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS) + 5) % 10) + 5)) #endif - /* ***************** Pre-computed Skein IVs ******************* ** @@ -332,239 +437,233 @@ enum /* blkSize = 256 bits. 
hashSize = 128 bits */ const u64b_t SKEIN_256_IV_128[] = { - MK_64(0xE1111906,0x964D7260), - MK_64(0x883DAAA7,0x7C8D811C), - MK_64(0x10080DF4,0x91960F7A), - MK_64(0xCCF7DDE5,0xB45BC1C2) - }; + MK_64(0xE1111906, 0x964D7260), + MK_64(0x883DAAA7, 0x7C8D811C), + MK_64(0x10080DF4, 0x91960F7A), + MK_64(0xCCF7DDE5, 0xB45BC1C2)}; /* blkSize = 256 bits. hashSize = 160 bits */ const u64b_t SKEIN_256_IV_160[] = { - MK_64(0x14202314,0x72825E98), - MK_64(0x2AC4E9A2,0x5A77E590), - MK_64(0xD47A5856,0x8838D63E), - MK_64(0x2DD2E496,0x8586AB7D) - }; + MK_64(0x14202314, 0x72825E98), + MK_64(0x2AC4E9A2, 0x5A77E590), + MK_64(0xD47A5856, 0x8838D63E), + MK_64(0x2DD2E496, 0x8586AB7D)}; /* blkSize = 256 bits. hashSize = 224 bits */ const u64b_t SKEIN_256_IV_224[] = { - MK_64(0xC6098A8C,0x9AE5EA0B), - MK_64(0x876D5686,0x08C5191C), - MK_64(0x99CB88D7,0xD7F53884), - MK_64(0x384BDDB1,0xAEDDB5DE) - }; + MK_64(0xC6098A8C, 0x9AE5EA0B), + MK_64(0x876D5686, 0x08C5191C), + MK_64(0x99CB88D7, 0xD7F53884), + MK_64(0x384BDDB1, 0xAEDDB5DE)}; /* blkSize = 256 bits. hashSize = 256 bits */ const u64b_t SKEIN_256_IV_256[] = { - MK_64(0xFC9DA860,0xD048B449), - MK_64(0x2FCA6647,0x9FA7D833), - MK_64(0xB33BC389,0x6656840F), - MK_64(0x6A54E920,0xFDE8DA69) - }; + MK_64(0xFC9DA860, 0xD048B449), + MK_64(0x2FCA6647, 0x9FA7D833), + MK_64(0xB33BC389, 0x6656840F), + MK_64(0x6A54E920, 0xFDE8DA69)}; /* blkSize = 512 bits. hashSize = 128 bits */ const u64b_t SKEIN_512_IV_128[] = { - MK_64(0xA8BC7BF3,0x6FBF9F52), - MK_64(0x1E9872CE,0xBD1AF0AA), - MK_64(0x309B1790,0xB32190D3), - MK_64(0xBCFBB854,0x3F94805C), - MK_64(0x0DA61BCD,0x6E31B11B), - MK_64(0x1A18EBEA,0xD46A32E3), - MK_64(0xA2CC5B18,0xCE84AA82), - MK_64(0x6982AB28,0x9D46982D) - }; + MK_64(0xA8BC7BF3, 0x6FBF9F52), + MK_64(0x1E9872CE, 0xBD1AF0AA), + MK_64(0x309B1790, 0xB32190D3), + MK_64(0xBCFBB854, 0x3F94805C), + MK_64(0x0DA61BCD, 0x6E31B11B), + MK_64(0x1A18EBEA, 0xD46A32E3), + MK_64(0xA2CC5B18, 0xCE84AA82), + MK_64(0x6982AB28, 0x9D46982D)}; /* blkSize = 512 bits. 
hashSize = 160 bits */ const u64b_t SKEIN_512_IV_160[] = { - MK_64(0x28B81A2A,0xE013BD91), - MK_64(0xC2F11668,0xB5BDF78F), - MK_64(0x1760D8F3,0xF6A56F12), - MK_64(0x4FB74758,0x8239904F), - MK_64(0x21EDE07F,0x7EAF5056), - MK_64(0xD908922E,0x63ED70B8), - MK_64(0xB8EC76FF,0xECCB52FA), - MK_64(0x01A47BB8,0xA3F27A6E) - }; + MK_64(0x28B81A2A, 0xE013BD91), + MK_64(0xC2F11668, 0xB5BDF78F), + MK_64(0x1760D8F3, 0xF6A56F12), + MK_64(0x4FB74758, 0x8239904F), + MK_64(0x21EDE07F, 0x7EAF5056), + MK_64(0xD908922E, 0x63ED70B8), + MK_64(0xB8EC76FF, 0xECCB52FA), + MK_64(0x01A47BB8, 0xA3F27A6E)}; /* blkSize = 512 bits. hashSize = 224 bits */ const u64b_t SKEIN_512_IV_224[] = { - MK_64(0xCCD06162,0x48677224), - MK_64(0xCBA65CF3,0xA92339EF), - MK_64(0x8CCD69D6,0x52FF4B64), - MK_64(0x398AED7B,0x3AB890B4), - MK_64(0x0F59D1B1,0x457D2BD0), - MK_64(0x6776FE65,0x75D4EB3D), - MK_64(0x99FBC70E,0x997413E9), - MK_64(0x9E2CFCCF,0xE1C41EF7) - }; + MK_64(0xCCD06162, 0x48677224), + MK_64(0xCBA65CF3, 0xA92339EF), + MK_64(0x8CCD69D6, 0x52FF4B64), + MK_64(0x398AED7B, 0x3AB890B4), + MK_64(0x0F59D1B1, 0x457D2BD0), + MK_64(0x6776FE65, 0x75D4EB3D), + MK_64(0x99FBC70E, 0x997413E9), + MK_64(0x9E2CFCCF, 0xE1C41EF7)}; /* blkSize = 512 bits. hashSize = 256 bits */ const u64b_t SKEIN_512_IV_256[] = { - MK_64(0xCCD044A1,0x2FDB3E13), - MK_64(0xE8359030,0x1A79A9EB), - MK_64(0x55AEA061,0x4F816E6F), - MK_64(0x2A2767A4,0xAE9B94DB), - MK_64(0xEC06025E,0x74DD7683), - MK_64(0xE7A436CD,0xC4746251), - MK_64(0xC36FBAF9,0x393AD185), - MK_64(0x3EEDBA18,0x33EDFC13) - }; + MK_64(0xCCD044A1, 0x2FDB3E13), + MK_64(0xE8359030, 0x1A79A9EB), + MK_64(0x55AEA061, 0x4F816E6F), + MK_64(0x2A2767A4, 0xAE9B94DB), + MK_64(0xEC06025E, 0x74DD7683), + MK_64(0xE7A436CD, 0xC4746251), + MK_64(0xC36FBAF9, 0x393AD185), + MK_64(0x3EEDBA18, 0x33EDFC13)}; /* blkSize = 512 bits. 
hashSize = 384 bits */ const u64b_t SKEIN_512_IV_384[] = { - MK_64(0xA3F6C6BF,0x3A75EF5F), - MK_64(0xB0FEF9CC,0xFD84FAA4), - MK_64(0x9D77DD66,0x3D770CFE), - MK_64(0xD798CBF3,0xB468FDDA), - MK_64(0x1BC4A666,0x8A0E4465), - MK_64(0x7ED7D434,0xE5807407), - MK_64(0x548FC1AC,0xD4EC44D6), - MK_64(0x266E1754,0x6AA18FF8) - }; + MK_64(0xA3F6C6BF, 0x3A75EF5F), + MK_64(0xB0FEF9CC, 0xFD84FAA4), + MK_64(0x9D77DD66, 0x3D770CFE), + MK_64(0xD798CBF3, 0xB468FDDA), + MK_64(0x1BC4A666, 0x8A0E4465), + MK_64(0x7ED7D434, 0xE5807407), + MK_64(0x548FC1AC, 0xD4EC44D6), + MK_64(0x266E1754, 0x6AA18FF8)}; /* blkSize = 512 bits. hashSize = 512 bits */ const u64b_t SKEIN_512_IV_512[] = { - MK_64(0x4903ADFF,0x749C51CE), - MK_64(0x0D95DE39,0x9746DF03), - MK_64(0x8FD19341,0x27C79BCE), - MK_64(0x9A255629,0xFF352CB1), - MK_64(0x5DB62599,0xDF6CA7B0), - MK_64(0xEABE394C,0xA9D5C3F4), - MK_64(0x991112C7,0x1A75B523), - MK_64(0xAE18A40B,0x660FCC33) - }; + MK_64(0x4903ADFF, 0x749C51CE), + MK_64(0x0D95DE39, 0x9746DF03), + MK_64(0x8FD19341, 0x27C79BCE), + MK_64(0x9A255629, 0xFF352CB1), + MK_64(0x5DB62599, 0xDF6CA7B0), + MK_64(0xEABE394C, 0xA9D5C3F4), + MK_64(0x991112C7, 0x1A75B523), + MK_64(0xAE18A40B, 0x660FCC33)}; /* blkSize = 1024 bits. 
hashSize = 384 bits */ const u64b_t SKEIN1024_IV_384[] = { - MK_64(0x5102B6B8,0xC1894A35), - MK_64(0xFEEBC9E3,0xFE8AF11A), - MK_64(0x0C807F06,0xE32BED71), - MK_64(0x60C13A52,0xB41A91F6), - MK_64(0x9716D35D,0xD4917C38), - MK_64(0xE780DF12,0x6FD31D3A), - MK_64(0x797846B6,0xC898303A), - MK_64(0xB172C2A8,0xB3572A3B), - MK_64(0xC9BC8203,0xA6104A6C), - MK_64(0x65909338,0xD75624F4), - MK_64(0x94BCC568,0x4B3F81A0), - MK_64(0x3EBBF51E,0x10ECFD46), - MK_64(0x2DF50F0B,0xEEB08542), - MK_64(0x3B5A6530,0x0DBC6516), - MK_64(0x484B9CD2,0x167BBCE1), - MK_64(0x2D136947,0xD4CBAFEA) - }; + MK_64(0x5102B6B8, 0xC1894A35), + MK_64(0xFEEBC9E3, 0xFE8AF11A), + MK_64(0x0C807F06, 0xE32BED71), + MK_64(0x60C13A52, 0xB41A91F6), + MK_64(0x9716D35D, 0xD4917C38), + MK_64(0xE780DF12, 0x6FD31D3A), + MK_64(0x797846B6, 0xC898303A), + MK_64(0xB172C2A8, 0xB3572A3B), + MK_64(0xC9BC8203, 0xA6104A6C), + MK_64(0x65909338, 0xD75624F4), + MK_64(0x94BCC568, 0x4B3F81A0), + MK_64(0x3EBBF51E, 0x10ECFD46), + MK_64(0x2DF50F0B, 0xEEB08542), + MK_64(0x3B5A6530, 0x0DBC6516), + MK_64(0x484B9CD2, 0x167BBCE1), + MK_64(0x2D136947, 0xD4CBAFEA)}; /* blkSize = 1024 bits. 
hashSize = 512 bits */ const u64b_t SKEIN1024_IV_512[] = { - MK_64(0xCAEC0E5D,0x7C1B1B18), - MK_64(0xA01B0E04,0x5F03E802), - MK_64(0x33840451,0xED912885), - MK_64(0x374AFB04,0xEAEC2E1C), - MK_64(0xDF25A0E2,0x813581F7), - MK_64(0xE4004093,0x8B12F9D2), - MK_64(0xA662D539,0xC2ED39B6), - MK_64(0xFA8B85CF,0x45D8C75A), - MK_64(0x8316ED8E,0x29EDE796), - MK_64(0x053289C0,0x2E9F91B8), - MK_64(0xC3F8EF1D,0x6D518B73), - MK_64(0xBDCEC3C4,0xD5EF332E), - MK_64(0x549A7E52,0x22974487), - MK_64(0x67070872,0x5B749816), - MK_64(0xB9CD28FB,0xF0581BD1), - MK_64(0x0E2940B8,0x15804974) - }; + MK_64(0xCAEC0E5D, 0x7C1B1B18), + MK_64(0xA01B0E04, 0x5F03E802), + MK_64(0x33840451, 0xED912885), + MK_64(0x374AFB04, 0xEAEC2E1C), + MK_64(0xDF25A0E2, 0x813581F7), + MK_64(0xE4004093, 0x8B12F9D2), + MK_64(0xA662D539, 0xC2ED39B6), + MK_64(0xFA8B85CF, 0x45D8C75A), + MK_64(0x8316ED8E, 0x29EDE796), + MK_64(0x053289C0, 0x2E9F91B8), + MK_64(0xC3F8EF1D, 0x6D518B73), + MK_64(0xBDCEC3C4, 0xD5EF332E), + MK_64(0x549A7E52, 0x22974487), + MK_64(0x67070872, 0x5B749816), + MK_64(0xB9CD28FB, 0xF0581BD1), + MK_64(0x0E2940B8, 0x15804974)}; /* blkSize = 1024 bits. 
hashSize = 1024 bits */ const u64b_t SKEIN1024_IV_1024[] = { - MK_64(0xD593DA07,0x41E72355), - MK_64(0x15B5E511,0xAC73E00C), - MK_64(0x5180E5AE,0xBAF2C4F0), - MK_64(0x03BD41D3,0xFCBCAFAF), - MK_64(0x1CAEC6FD,0x1983A898), - MK_64(0x6E510B8B,0xCDD0589F), - MK_64(0x77E2BDFD,0xC6394ADA), - MK_64(0xC11E1DB5,0x24DCB0A3), - MK_64(0xD6D14AF9,0xC6329AB5), - MK_64(0x6A9B0BFC,0x6EB67E0D), - MK_64(0x9243C60D,0xCCFF1332), - MK_64(0x1A1F1DDE,0x743F02D4), - MK_64(0x0996753C,0x10ED0BB8), - MK_64(0x6572DD22,0xF2B4969A), - MK_64(0x61FD3062,0xD00A579A), - MK_64(0x1DE0536E,0x8682E539) - }; - + MK_64(0xD593DA07, 0x41E72355), + MK_64(0x15B5E511, 0xAC73E00C), + MK_64(0x5180E5AE, 0xBAF2C4F0), + MK_64(0x03BD41D3, 0xFCBCAFAF), + MK_64(0x1CAEC6FD, 0x1983A898), + MK_64(0x6E510B8B, 0xCDD0589F), + MK_64(0x77E2BDFD, 0xC6394ADA), + MK_64(0xC11E1DB5, 0x24DCB0A3), + MK_64(0xD6D14AF9, 0xC6329AB5), + MK_64(0x6A9B0BFC, 0x6EB67E0D), + MK_64(0x9243C60D, 0xCCFF1332), + MK_64(0x1A1F1DDE, 0x743F02D4), + MK_64(0x0996753C, 0x10ED0BB8), + MK_64(0x6572DD22, 0xF2B4969A), + MK_64(0x61FD3062, 0xD00A579A), + MK_64(0x1DE0536E, 0x8682E539)}; #ifndef SKEIN_USE_ASM -#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */ +#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */ #endif #ifndef SKEIN_LOOP -#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */ +#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */ #endif -#define BLK_BITS (WCNT*64) /* some useful definitions for code here */ -#define KW_TWK_BASE (0) -#define KW_KEY_BASE (3) -#define ks (kw + KW_KEY_BASE) -#define ts (kw + KW_TWK_BASE) +#define BLK_BITS (WCNT * 64) /* some useful definitions for code here */ +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) #ifdef SKEIN_DEBUG -#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } +#define DebugSaveTweak(ctx) \ + { \ + ctx->h.T[0] = ts[0]; \ + ctx->h.T[1] = ts[1]; \ + } #else 
#define DebugSaveTweak(ctx) #endif /***************************** Skein_256 ******************************/ #if !(SKEIN_USE_ASM & 256) -static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) - { /* do it in C */ +static void Skein_256_Process_Block(Skein_256_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd) +{ /* do it in C */ enum - { + { WCNT = SKEIN_256_STATE_WORDS - }; -#undef RCNT -#define RCNT (SKEIN_256_ROUNDS_TOTAL/8) + }; +#undef RCNT +#define RCNT (SKEIN_256_ROUNDS_TOTAL / 8) -#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ -#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10) +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10) #else #define SKEIN_UNROLL_256 (0) #endif #if SKEIN_UNROLL_256 -#if (RCNT % SKEIN_UNROLL_256) +#if(RCNT % SKEIN_UNROLL_256) #error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */ #endif - size_t r; - u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ + size_t r; + u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/ #else - u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ + u64b_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ #endif - u64b_t X0,X1,X2,X3; /* local copy of context vars, for speed */ - u64b_t w [WCNT]; /* local copy of input block */ + u64b_t X0, X1, X2, X3; /* local copy of context vars, for speed */ + u64b_t w[WCNT]; /* local copy of input block */ #ifdef SKEIN_DEBUG - const u64b_t *Xptr[4]; /* use for debugging (help compiler put Xn in registers) */ - Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3; + const u64b_t* Xptr[4]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X0; + Xptr[1] = &X1; + Xptr[2] = &X2; + Xptr[3] = &X3; #endif - Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; - do { + do + { /* this implementation only supports 2**64 input bytes (no carry out here) */ - ts[0] += byteCntAdd; /* update processed length */ + ts[0] += byteCntAdd; /* update processed length */ /* precompute the key schedule for this block */ ks[0] = ctx->X[0]; @@ -575,114 +674,118 @@ static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,s ts[2] = ts[0] ^ ts[1]; - Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */ DebugSaveTweak(ctx); - Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); - X0 = w[0] + ks[0]; /* do the first full key injection */ + X0 = w[0] + ks[0]; /* do the first full key injection */ X1 = w[1] + ks[1] + ts[0]; X2 = w[2] + ks[2] + ts[1]; X3 = w[3] + ks[3]; - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); /* show starting state values */ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr); /* show starting state values */ blkPtr += SKEIN_256_BLOCK_BYTES; /* run the rounds */ -#define Round256(p0,p1,p2,p3,ROT,rNum) \ - X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ - X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ +#define Round256(p0, p1, p2, p3, ROT, rNum) \ + X##p0 += X##p1; \ + X##p1 = RotL_64(X##p1, ROT##_0); \ + X##p1 ^= X##p0; \ + X##p2 += X##p3; \ + X##p3 = RotL_64(X##p3, ROT##_1); \ + X##p3 ^= X##p2; #if SKEIN_UNROLL_256 == 0 -#define R256(p0,p1,p2,p3,ROT,rNum) /* fully unrolled */ \ - Round256(p0,p1,p2,p3,ROT,rNum) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr); - -#define I256(R) \ - X0 += ks[((R)+1) % 5]; /* inject the key schedule value */ \ - X1 += ks[((R)+2) % 5] + ts[((R)+1) % 3]; \ - X2 += ks[((R)+3) % 5] + ts[((R)+2) % 3]; \ - X3 += 
ks[((R)+4) % 5] + (R)+1; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); -#else /* looping version */ -#define R256(p0,p1,p2,p3,ROT,rNum) \ - Round256(p0,p1,p2,p3,ROT,rNum) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr); - -#define I256(R) \ - X0 += ks[r+(R)+0]; /* inject the key schedule value */ \ - X1 += ks[r+(R)+1] + ts[r+(R)+0]; \ - X2 += ks[r+(R)+2] + ts[r+(R)+1]; \ - X3 += ks[r+(R)+3] + r+(R) ; \ - ks[r + (R)+4 ] = ks[r+(R)-1]; /* rotate key schedule */\ - ts[r + (R)+2 ] = ts[r+(R)-1]; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); - - for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_256) /* loop thru it */ +#define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \ + Round256(p0, p1, p2, p3, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); + +#define I256(R) \ + X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \ + X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \ + X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \ + X3 += ks[((R) + 4) % 5] + (R) + 1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version */ +#define R256(p0, p1, p2, p3, ROT, rNum) \ + Round256(p0, p1, p2, p3, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); + +#define I256(R) \ + X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \ + X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \ + X3 += ks[r + (R) + 3] + r + (R); \ + ks[r + (R) + 4] = ks[r + (R)-1]; /* rotate key schedule */ \ + ts[r + (R) + 2] = ts[r + (R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + for(r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256) /* loop thru it */ #endif { -#define R256_8_rounds(R) \ - R256(0,1,2,3,R_256_0,8*(R) + 1); \ - R256(0,3,2,1,R_256_1,8*(R) + 2); \ - R256(0,1,2,3,R_256_2,8*(R) + 3); \ - R256(0,3,2,1,R_256_3,8*(R) + 4); \ - I256(2*(R)); \ - R256(0,1,2,3,R_256_4,8*(R) + 5); \ - 
R256(0,3,2,1,R_256_5,8*(R) + 6); \ - R256(0,1,2,3,R_256_6,8*(R) + 7); \ - R256(0,3,2,1,R_256_7,8*(R) + 8); \ - I256(2*(R)+1); - - R256_8_rounds( 0); - -#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN))) - - #if R256_Unroll_R( 1) - R256_8_rounds( 1); - #endif - #if R256_Unroll_R( 2) - R256_8_rounds( 2); - #endif - #if R256_Unroll_R( 3) - R256_8_rounds( 3); - #endif - #if R256_Unroll_R( 4) - R256_8_rounds( 4); - #endif - #if R256_Unroll_R( 5) - R256_8_rounds( 5); - #endif - #if R256_Unroll_R( 6) - R256_8_rounds( 6); - #endif - #if R256_Unroll_R( 7) - R256_8_rounds( 7); - #endif - #if R256_Unroll_R( 8) - R256_8_rounds( 8); - #endif - #if R256_Unroll_R( 9) - R256_8_rounds( 9); - #endif - #if R256_Unroll_R(10) - R256_8_rounds(10); - #endif - #if R256_Unroll_R(11) - R256_8_rounds(11); - #endif - #if R256_Unroll_R(12) - R256_8_rounds(12); - #endif - #if R256_Unroll_R(13) - R256_8_rounds(13); - #endif - #if R256_Unroll_R(14) - R256_8_rounds(14); - #endif - #if (SKEIN_UNROLL_256 > 14) -#error "need more unrolling in Skein_256_Process_Block" - #endif +#define R256_8_rounds(R) \ + R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \ + R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \ + R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \ + R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \ + I256(2 * (R)); \ + R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \ + R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \ + R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \ + R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \ + I256(2 * (R) + 1); + + R256_8_rounds(0); + +#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_256 > (NN))) + +#if R256_Unroll_R(1) + R256_8_rounds(1); +#endif +#if R256_Unroll_R(2) + R256_8_rounds(2); +#endif +#if R256_Unroll_R(3) + R256_8_rounds(3); +#endif +#if R256_Unroll_R(4) + R256_8_rounds(4); +#endif +#if R256_Unroll_R(5) + R256_8_rounds(5); +#endif +#if R256_Unroll_R(6) + R256_8_rounds(6); +#endif +#if 
R256_Unroll_R(7) + R256_8_rounds(7); +#endif +#if R256_Unroll_R(8) + R256_8_rounds(8); +#endif +#if R256_Unroll_R(9) + R256_8_rounds(9); +#endif +#if R256_Unroll_R(10) + R256_8_rounds(10); +#endif +#if R256_Unroll_R(11) + R256_8_rounds(11); +#endif +#if R256_Unroll_R(12) + R256_8_rounds(12); +#endif +#if R256_Unroll_R(13) + R256_8_rounds(13); +#endif +#if R256_Unroll_R(14) + R256_8_rounds(14); +#endif +#if(SKEIN_UNROLL_256 > 14) +#error "need more unrolling in Skein_256_Process_Block" +#endif } /* do the final "feedforward" xor, update context chaining vars */ ctx->X[0] = X0 ^ w[0]; @@ -690,68 +793,74 @@ static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,s ctx->X[2] = X2 ^ w[2]; ctx->X[3] = X3 ^ w[3]; - Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); ts[1] &= ~SKEIN_T1_FLAG_FIRST; - } - while (--blkCnt); + } while(--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; - } +} #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t Skein_256_Process_Block_CodeSize(void) - { - return ((u08b_t *) Skein_256_Process_Block_CodeSize) - - ((u08b_t *) Skein_256_Process_Block); - } +{ + return ((u08b_t*)Skein_256_Process_Block_CodeSize) - + ((u08b_t*)Skein_256_Process_Block); +} static uint_t Skein_256_Unroll_Cnt(void) - { +{ return SKEIN_UNROLL_256; - } +} #endif #endif /***************************** Skein_512 ******************************/ #if !(SKEIN_USE_ASM & 512) -static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) - { /* do it in C */ +static void Skein_512_Process_Block(Skein_512_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd) +{ /* do it in C */ enum - { + { WCNT = SKEIN_512_STATE_WORDS - }; -#undef RCNT -#define RCNT (SKEIN_512_ROUNDS_TOTAL/8) + }; +#undef RCNT +#define RCNT (SKEIN_512_ROUNDS_TOTAL / 8) -#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ 
-#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10) +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10) #else #define SKEIN_UNROLL_512 (0) #endif #if SKEIN_UNROLL_512 -#if (RCNT % SKEIN_UNROLL_512) +#if(RCNT % SKEIN_UNROLL_512) #error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */ #endif - size_t r; - u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ + size_t r; + u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/ #else - u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ + u64b_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ #endif - u64b_t X0,X1,X2,X3,X4,X5,X6,X7; /* local copy of vars, for speed */ - u64b_t w [WCNT]; /* local copy of input block */ + u64b_t X0, X1, X2, X3, X4, X5, X6, X7; /* local copy of vars, for speed */ + u64b_t w[WCNT]; /* local copy of input block */ #ifdef SKEIN_DEBUG - const u64b_t *Xptr[8]; /* use for debugging (help compiler put Xn in registers) */ - Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3; - Xptr[4] = &X4; Xptr[5] = &X5; Xptr[6] = &X6; Xptr[7] = &X7; + const u64b_t* Xptr[8]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X0; + Xptr[1] = &X1; + Xptr[2] = &X2; + Xptr[3] = &X3; + Xptr[4] = &X4; + Xptr[5] = &X5; + Xptr[6] = &X6; + Xptr[7] = &X7; #endif - Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; - do { + do + { /* this implementation only supports 2**64 input bytes (no carry out here) */ - ts[0] += byteCntAdd; /* update processed length */ + ts[0] += byteCntAdd; /* update processed length */ /* precompute the key schedule for this block */ ks[0] = ctx->X[0]; @@ -767,126 +876,134 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s ts[2] = ts[0] ^ ts[1]; - Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */ DebugSaveTweak(ctx); - Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); - X0 = w[0] + ks[0]; /* do the first full key injection */ - X1 = w[1] + ks[1]; - X2 = w[2] + ks[2]; - X3 = w[3] + ks[3]; - X4 = w[4] + ks[4]; - X5 = w[5] + ks[5] + ts[0]; - X6 = w[6] + ks[6] + ts[1]; - X7 = w[7] + ks[7]; + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; blkPtr += SKEIN_512_BLOCK_BYTES; - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr); /* run the rounds */ -#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ - X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ - X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ - X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ - X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ +#define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + X##p0 += X##p1; \ + X##p1 = RotL_64(X##p1, ROT##_0); \ + X##p1 ^= X##p0; \ + X##p2 += X##p3; \ + X##p3 = RotL_64(X##p3, ROT##_1); \ + X##p3 ^= X##p2; \ + X##p4 += X##p5; \ + X##p5 = RotL_64(X##p5, ROT##_2); \ + 
X##p5 ^= X##p4; \ + X##p6 += X##p7; \ + X##p7 = RotL_64(X##p7, ROT##_3); \ + X##p7 ^= X##p6; #if SKEIN_UNROLL_512 == 0 -#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* unrolled */ \ - Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr); - -#define I512(R) \ - X0 += ks[((R)+1) % 9]; /* inject the key schedule value */ \ - X1 += ks[((R)+2) % 9]; \ - X2 += ks[((R)+3) % 9]; \ - X3 += ks[((R)+4) % 9]; \ - X4 += ks[((R)+5) % 9]; \ - X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ - X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ - X7 += ks[((R)+8) % 9] + (R)+1; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); -#else /* looping version */ -#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ - Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr); - -#define I512(R) \ - X0 += ks[r+(R)+0]; /* inject the key schedule value */ \ - X1 += ks[r+(R)+1]; \ - X2 += ks[r+(R)+2]; \ - X3 += ks[r+(R)+3]; \ - X4 += ks[r+(R)+4]; \ - X5 += ks[r+(R)+5] + ts[r+(R)+0]; \ - X6 += ks[r+(R)+6] + ts[r+(R)+1]; \ - X7 += ks[r+(R)+7] + r+(R) ; \ - ks[r + (R)+8] = ks[r+(R)-1]; /* rotate key schedule */ \ - ts[r + (R)+2] = ts[r+(R)-1]; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); - - for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512) /* loop thru it */ -#endif /* end of looped code definitions */ +#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \ + Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); + +#define I512(R) \ + X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */ \ + X1 += ks[((R) + 2) % 9]; \ + X2 += ks[((R) + 3) % 9]; \ + X3 += ks[((R) + 4) % 9]; \ + X4 += ks[((R) + 5) % 9]; \ + X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \ + X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \ + X7 += ks[((R) + 8) % 9] + (R) + 1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version 
*/ +#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); + +#define I512(R) \ + X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X1 += ks[r + (R) + 1]; \ + X2 += ks[r + (R) + 2]; \ + X3 += ks[r + (R) + 3]; \ + X4 += ks[r + (R) + 4]; \ + X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \ + X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \ + X7 += ks[r + (R) + 7] + r + (R); \ + ks[r + (R) + 8] = ks[r + (R)-1]; /* rotate key schedule */ \ + ts[r + (R) + 2] = ts[r + (R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + for(r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512) /* loop thru it */ +#endif /* end of looped code definitions */ { -#define R512_8_rounds(R) /* do 8 full rounds */ \ - R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ - R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ - R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ - R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ - I512(2*(R)); \ - R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ - R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ - R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ - R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ - I512(2*(R)+1); /* and key injection */ - - R512_8_rounds( 0); - -#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN))) - - #if R512_Unroll_R( 1) - R512_8_rounds( 1); - #endif - #if R512_Unroll_R( 2) - R512_8_rounds( 2); - #endif - #if R512_Unroll_R( 3) - R512_8_rounds( 3); - #endif - #if R512_Unroll_R( 4) - R512_8_rounds( 4); - #endif - #if R512_Unroll_R( 5) - R512_8_rounds( 5); - #endif - #if R512_Unroll_R( 6) - R512_8_rounds( 6); - #endif - #if R512_Unroll_R( 7) - R512_8_rounds( 7); - #endif - #if R512_Unroll_R( 8) - R512_8_rounds( 8); - #endif - #if R512_Unroll_R( 9) - R512_8_rounds( 9); - #endif - #if R512_Unroll_R(10) - R512_8_rounds(10); - #endif - #if R512_Unroll_R(11) - R512_8_rounds(11); - #endif - #if 
R512_Unroll_R(12) - R512_8_rounds(12); - #endif - #if R512_Unroll_R(13) - R512_8_rounds(13); - #endif - #if R512_Unroll_R(14) - R512_8_rounds(14); - #endif - #if (SKEIN_UNROLL_512 > 14) -#error "need more unrolling in Skein_512_Process_Block" - #endif +#define R512_8_rounds(R) /* do 8 full rounds */ \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \ + I512(2 * (R)); \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \ + I512(2 * (R) + 1); /* and key injection */ + + R512_8_rounds(0); + +#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_512 > (NN))) + +#if R512_Unroll_R(1) + R512_8_rounds(1); +#endif +#if R512_Unroll_R(2) + R512_8_rounds(2); +#endif +#if R512_Unroll_R(3) + R512_8_rounds(3); +#endif +#if R512_Unroll_R(4) + R512_8_rounds(4); +#endif +#if R512_Unroll_R(5) + R512_8_rounds(5); +#endif +#if R512_Unroll_R(6) + R512_8_rounds(6); +#endif +#if R512_Unroll_R(7) + R512_8_rounds(7); +#endif +#if R512_Unroll_R(8) + R512_8_rounds(8); +#endif +#if R512_Unroll_R(9) + R512_8_rounds(9); +#endif +#if R512_Unroll_R(10) + R512_8_rounds(10); +#endif +#if R512_Unroll_R(11) + R512_8_rounds(11); +#endif +#if R512_Unroll_R(12) + R512_8_rounds(12); +#endif +#if R512_Unroll_R(13) + R512_8_rounds(13); +#endif +#if R512_Unroll_R(14) + R512_8_rounds(14); +#endif +#if(SKEIN_UNROLL_512 > 14) +#error "need more unrolling in Skein_512_Process_Block" +#endif } /* do the final "feedforward" xor, update context chaining vars */ @@ -898,256 +1015,284 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s ctx->X[5] = X5 ^ w[5]; ctx->X[6] = X6 ^ w[6]; ctx->X[7] = X7 
^ w[7]; - Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); ts[1] &= ~SKEIN_T1_FLAG_FIRST; - } - while (--blkCnt); + } while(--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; - } +} #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t Skein_512_Process_Block_CodeSize(void) - { - return ((u08b_t *) Skein_512_Process_Block_CodeSize) - - ((u08b_t *) Skein_512_Process_Block); - } +{ + return ((u08b_t*)Skein_512_Process_Block_CodeSize) - + ((u08b_t*)Skein_512_Process_Block); +} static uint_t Skein_512_Unroll_Cnt(void) - { +{ return SKEIN_UNROLL_512; - } +} #endif #endif /***************************** Skein1024 ******************************/ #if !(SKEIN_USE_ASM & 1024) -static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) - { /* do it in C, always looping (unrolled is bigger AND slower!) */ +static void Skein1024_Process_Block(Skein1024_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd) +{ /* do it in C, always looping (unrolled is bigger AND slower!) 
*/ enum - { + { WCNT = SKEIN1024_STATE_WORDS - }; -#undef RCNT -#define RCNT (SKEIN1024_ROUNDS_TOTAL/8) + }; +#undef RCNT +#define RCNT (SKEIN1024_ROUNDS_TOTAL / 8) -#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ -#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10) +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_1024 ((SKEIN_LOOP) % 10) #else #define SKEIN_UNROLL_1024 (0) #endif -#if (SKEIN_UNROLL_1024 != 0) -#if (RCNT % SKEIN_UNROLL_1024) +#if(SKEIN_UNROLL_1024 != 0) +#if(RCNT % SKEIN_UNROLL_1024) #error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */ #endif - size_t r; - u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ + size_t r; + u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/ #else - u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ + u64b_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ #endif - u64b_t X00,X01,X02,X03,X04,X05,X06,X07, /* local copy of vars, for speed */ - X08,X09,X10,X11,X12,X13,X14,X15; - u64b_t w [WCNT]; /* local copy of input block */ + u64b_t X00, X01, X02, X03, X04, X05, X06, X07, /* local copy of vars, for speed */ + X08, X09, X10, X11, X12, X13, X14, X15; + u64b_t w[WCNT]; /* local copy of input block */ #ifdef SKEIN_DEBUG - const u64b_t *Xptr[16]; /* use for debugging (help compiler put Xn in registers) */ - Xptr[ 0] = &X00; Xptr[ 1] = &X01; Xptr[ 2] = &X02; Xptr[ 3] = &X03; - Xptr[ 4] = &X04; Xptr[ 5] = &X05; Xptr[ 6] = &X06; Xptr[ 7] = &X07; - Xptr[ 8] = &X08; Xptr[ 9] = &X09; Xptr[10] = &X10; Xptr[11] = &X11; - Xptr[12] = &X12; Xptr[13] = &X13; Xptr[14] = &X14; Xptr[15] = &X15; + const u64b_t* Xptr[16]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X00; + Xptr[1] = &X01; + Xptr[2] = &X02; + Xptr[3] = &X03; + Xptr[4] = &X04; + Xptr[5] = &X05; + Xptr[6] = &X06; + Xptr[7] = &X07; + Xptr[8] = &X08; + Xptr[9] = &X09; + Xptr[10] = &X10; 
+ Xptr[11] = &X11; + Xptr[12] = &X12; + Xptr[13] = &X13; + Xptr[14] = &X14; + Xptr[15] = &X15; #endif - Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; - do { + do + { /* this implementation only supports 2**64 input bytes (no carry out here) */ - ts[0] += byteCntAdd; /* update processed length */ + ts[0] += byteCntAdd; /* update processed length */ /* precompute the key schedule for this block */ - ks[ 0] = ctx->X[ 0]; - ks[ 1] = ctx->X[ 1]; - ks[ 2] = ctx->X[ 2]; - ks[ 3] = ctx->X[ 3]; - ks[ 4] = ctx->X[ 4]; - ks[ 5] = ctx->X[ 5]; - ks[ 6] = ctx->X[ 6]; - ks[ 7] = ctx->X[ 7]; - ks[ 8] = ctx->X[ 8]; - ks[ 9] = ctx->X[ 9]; + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ctx->X[8]; + ks[9] = ctx->X[9]; ks[10] = ctx->X[10]; ks[11] = ctx->X[11]; ks[12] = ctx->X[12]; ks[13] = ctx->X[13]; ks[14] = ctx->X[14]; ks[15] = ctx->X[15]; - ks[16] = ks[ 0] ^ ks[ 1] ^ ks[ 2] ^ ks[ 3] ^ - ks[ 4] ^ ks[ 5] ^ ks[ 6] ^ ks[ 7] ^ - ks[ 8] ^ ks[ 9] ^ ks[10] ^ ks[11] ^ + ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ + ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^ ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY; - ts[2] = ts[0] ^ ts[1]; + ts[2] = ts[0] ^ ts[1]; - Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */ DebugSaveTweak(ctx); - Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); - - X00 = w[ 0] + ks[ 0]; /* do the first full key injection */ - X01 = w[ 1] + ks[ 1]; - X02 = w[ 2] + ks[ 2]; - X03 = w[ 3] + ks[ 3]; - X04 = w[ 4] + ks[ 4]; - X05 = w[ 5] + ks[ 5]; - X06 = w[ 6] + ks[ 6]; - X07 = w[ 7] + ks[ 7]; - X08 = w[ 8] + ks[ 8]; - X09 = w[ 9] + ks[ 9]; - X10 = w[10] + ks[10]; - X11 = w[11] 
+ ks[11]; - X12 = w[12] + ks[12]; - X13 = w[13] + ks[13] + ts[0]; - X14 = w[14] + ks[14] + ts[1]; - X15 = w[15] + ks[15]; - - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); - -#define Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rNum) \ - X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ - X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ - X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ - X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ - X##p8 += X##p9; X##p9 = RotL_64(X##p9,ROT##_4); X##p9 ^= X##p8; \ - X##pA += X##pB; X##pB = RotL_64(X##pB,ROT##_5); X##pB ^= X##pA; \ - X##pC += X##pD; X##pD = RotL_64(X##pD,ROT##_6); X##pD ^= X##pC; \ - X##pE += X##pF; X##pF = RotL_64(X##pF,ROT##_7); X##pF ^= X##pE; \ + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); + + X00 = w[0] + ks[0]; /* do the first full key injection */ + X01 = w[1] + ks[1]; + X02 = w[2] + ks[2]; + X03 = w[3] + ks[3]; + X04 = w[4] + ks[4]; + X05 = w[5] + ks[5]; + X06 = w[6] + ks[6]; + X07 = w[7] + ks[7]; + X08 = w[8] + ks[8]; + X09 = w[9] + ks[9]; + X10 = w[10] + ks[10]; + X11 = w[11] + ks[11]; + X12 = w[12] + ks[12]; + X13 = w[13] + ks[13] + ts[0]; + X14 = w[14] + ks[14] + ts[1]; + X15 = w[15] + ks[15]; + + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr); + +#define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rNum) \ + X##p0 += X##p1; \ + X##p1 = RotL_64(X##p1, ROT##_0); \ + X##p1 ^= X##p0; \ + X##p2 += X##p3; \ + X##p3 = RotL_64(X##p3, ROT##_1); \ + X##p3 ^= X##p2; \ + X##p4 += X##p5; \ + X##p5 = RotL_64(X##p5, ROT##_2); \ + X##p5 ^= X##p4; \ + X##p6 += X##p7; \ + X##p7 = RotL_64(X##p7, ROT##_3); \ + X##p7 ^= X##p6; \ + X##p8 += X##p9; \ + X##p9 = RotL_64(X##p9, ROT##_4); \ + X##p9 ^= X##p8; \ + X##pA += X##pB; \ + X##pB = RotL_64(X##pB, ROT##_5); \ + X##pB ^= X##pA; \ + X##pC += X##pD; \ + X##pD = RotL_64(X##pD, ROT##_6); \ + X##pD ^= 
X##pC; \ + X##pE += X##pF; \ + X##pF = RotL_64(X##pF, ROT##_7); \ + X##pF ^= X##pE; #if SKEIN_UNROLL_1024 == 0 -#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ - Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rn,Xptr); - -#define I1024(R) \ - X00 += ks[((R)+ 1) % 17]; /* inject the key schedule value */ \ - X01 += ks[((R)+ 2) % 17]; \ - X02 += ks[((R)+ 3) % 17]; \ - X03 += ks[((R)+ 4) % 17]; \ - X04 += ks[((R)+ 5) % 17]; \ - X05 += ks[((R)+ 6) % 17]; \ - X06 += ks[((R)+ 7) % 17]; \ - X07 += ks[((R)+ 8) % 17]; \ - X08 += ks[((R)+ 9) % 17]; \ - X09 += ks[((R)+10) % 17]; \ - X10 += ks[((R)+11) % 17]; \ - X11 += ks[((R)+12) % 17]; \ - X12 += ks[((R)+13) % 17]; \ - X13 += ks[((R)+14) % 17] + ts[((R)+1) % 3]; \ - X14 += ks[((R)+15) % 17] + ts[((R)+2) % 3]; \ - X15 += ks[((R)+16) % 17] + (R)+1; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); -#else /* looping version */ -#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ - Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rn,Xptr); - -#define I1024(R) \ - X00 += ks[r+(R)+ 0]; /* inject the key schedule value */ \ - X01 += ks[r+(R)+ 1]; \ - X02 += ks[r+(R)+ 2]; \ - X03 += ks[r+(R)+ 3]; \ - X04 += ks[r+(R)+ 4]; \ - X05 += ks[r+(R)+ 5]; \ - X06 += ks[r+(R)+ 6]; \ - X07 += ks[r+(R)+ 7]; \ - X08 += ks[r+(R)+ 8]; \ - X09 += ks[r+(R)+ 9]; \ - X10 += ks[r+(R)+10]; \ - X11 += ks[r+(R)+11]; \ - X12 += ks[r+(R)+12]; \ - X13 += ks[r+(R)+13] + ts[r+(R)+0]; \ - X14 += ks[r+(R)+14] + ts[r+(R)+1]; \ - X15 += ks[r+(R)+15] + r+(R) ; \ - ks[r + (R)+16] = ks[r+(R)-1]; /* rotate key schedule */ \ - ts[r + (R)+ 2] = ts[r+(R)-1]; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); - - for (r=1;r <= 2*RCNT;r+=2*SKEIN_UNROLL_1024) /* loop thru it */ +#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \ + Round1024(p0, 
p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr); + +#define I1024(R) \ + X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */ \ + X01 += ks[((R) + 2) % 17]; \ + X02 += ks[((R) + 3) % 17]; \ + X03 += ks[((R) + 4) % 17]; \ + X04 += ks[((R) + 5) % 17]; \ + X05 += ks[((R) + 6) % 17]; \ + X06 += ks[((R) + 7) % 17]; \ + X07 += ks[((R) + 8) % 17]; \ + X08 += ks[((R) + 9) % 17]; \ + X09 += ks[((R) + 10) % 17]; \ + X10 += ks[((R) + 11) % 17]; \ + X11 += ks[((R) + 12) % 17]; \ + X12 += ks[((R) + 13) % 17]; \ + X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \ + X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \ + X15 += ks[((R) + 16) % 17] + (R) + 1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version */ +#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \ + Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr); + +#define I1024(R) \ + X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X01 += ks[r + (R) + 1]; \ + X02 += ks[r + (R) + 2]; \ + X03 += ks[r + (R) + 3]; \ + X04 += ks[r + (R) + 4]; \ + X05 += ks[r + (R) + 5]; \ + X06 += ks[r + (R) + 6]; \ + X07 += ks[r + (R) + 7]; \ + X08 += ks[r + (R) + 8]; \ + X09 += ks[r + (R) + 9]; \ + X10 += ks[r + (R) + 10]; \ + X11 += ks[r + (R) + 11]; \ + X12 += ks[r + (R) + 12]; \ + X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \ + X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \ + X15 += ks[r + (R) + 15] + r + (R); \ + ks[r + (R) + 16] = ks[r + (R)-1]; /* rotate key schedule */ \ + ts[r + (R) + 2] = ts[r + (R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + for(r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024) /* loop thru it */ #endif { -#define R1024_8_rounds(R) /* do 8 full rounds */ \ - 
R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_0,8*(R) + 1); \ - R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_1,8*(R) + 2); \ - R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_2,8*(R) + 3); \ - R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_3,8*(R) + 4); \ - I1024(2*(R)); \ - R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_4,8*(R) + 5); \ - R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_5,8*(R) + 6); \ - R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_6,8*(R) + 7); \ - R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_7,8*(R) + 8); \ - I1024(2*(R)+1); - - R1024_8_rounds( 0); - -#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN))) - - #if R1024_Unroll_R( 1) - R1024_8_rounds( 1); - #endif - #if R1024_Unroll_R( 2) - R1024_8_rounds( 2); - #endif - #if R1024_Unroll_R( 3) - R1024_8_rounds( 3); - #endif - #if R1024_Unroll_R( 4) - R1024_8_rounds( 4); - #endif - #if R1024_Unroll_R( 5) - R1024_8_rounds( 5); - #endif - #if R1024_Unroll_R( 6) - R1024_8_rounds( 6); - #endif - #if R1024_Unroll_R( 7) - R1024_8_rounds( 7); - #endif - #if R1024_Unroll_R( 8) - R1024_8_rounds( 8); - #endif - #if R1024_Unroll_R( 9) - R1024_8_rounds( 9); - #endif - #if R1024_Unroll_R(10) - R1024_8_rounds(10); - #endif - #if R1024_Unroll_R(11) - R1024_8_rounds(11); - #endif - #if R1024_Unroll_R(12) - R1024_8_rounds(12); - #endif - #if R1024_Unroll_R(13) - R1024_8_rounds(13); - #endif - #if R1024_Unroll_R(14) - R1024_8_rounds(14); - #endif - #if (SKEIN_UNROLL_1024 > 14) -#error "need more unrolling in Skein_1024_Process_Block" - #endif +#define R1024_8_rounds(R) /* do 8 full rounds */ \ + R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_0, 8 * (R) + 1); \ + R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_1, 8 * (R) + 2); \ + R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 
10, 09, R1024_2, 8 * (R) + 3); \ + R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_3, 8 * (R) + 4); \ + I1024(2 * (R)); \ + R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_4, 8 * (R) + 5); \ + R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_5, 8 * (R) + 6); \ + R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 10, 09, R1024_6, 8 * (R) + 7); \ + R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_7, 8 * (R) + 8); \ + I1024(2 * (R) + 1); + + R1024_8_rounds(0); + +#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_1024 > (NN))) + +#if R1024_Unroll_R(1) + R1024_8_rounds(1); +#endif +#if R1024_Unroll_R(2) + R1024_8_rounds(2); +#endif +#if R1024_Unroll_R(3) + R1024_8_rounds(3); +#endif +#if R1024_Unroll_R(4) + R1024_8_rounds(4); +#endif +#if R1024_Unroll_R(5) + R1024_8_rounds(5); +#endif +#if R1024_Unroll_R(6) + R1024_8_rounds(6); +#endif +#if R1024_Unroll_R(7) + R1024_8_rounds(7); +#endif +#if R1024_Unroll_R(8) + R1024_8_rounds(8); +#endif +#if R1024_Unroll_R(9) + R1024_8_rounds(9); +#endif +#if R1024_Unroll_R(10) + R1024_8_rounds(10); +#endif +#if R1024_Unroll_R(11) + R1024_8_rounds(11); +#endif +#if R1024_Unroll_R(12) + R1024_8_rounds(12); +#endif +#if R1024_Unroll_R(13) + R1024_8_rounds(13); +#endif +#if R1024_Unroll_R(14) + R1024_8_rounds(14); +#endif +#if(SKEIN_UNROLL_1024 > 14) +#error "need more unrolling in Skein_1024_Process_Block" +#endif } /* do the final "feedforward" xor, update context chaining vars */ - ctx->X[ 0] = X00 ^ w[ 0]; - ctx->X[ 1] = X01 ^ w[ 1]; - ctx->X[ 2] = X02 ^ w[ 2]; - ctx->X[ 3] = X03 ^ w[ 3]; - ctx->X[ 4] = X04 ^ w[ 4]; - ctx->X[ 5] = X05 ^ w[ 5]; - ctx->X[ 6] = X06 ^ w[ 6]; - ctx->X[ 7] = X07 ^ w[ 7]; - ctx->X[ 8] = X08 ^ w[ 8]; - ctx->X[ 9] = X09 ^ w[ 9]; + ctx->X[0] = X00 ^ w[0]; + ctx->X[1] = X01 ^ w[1]; + ctx->X[2] = X02 ^ w[2]; + ctx->X[3] = X03 ^ w[3]; + 
ctx->X[4] = X04 ^ w[4]; + ctx->X[5] = X05 ^ w[5]; + ctx->X[6] = X06 ^ w[6]; + ctx->X[7] = X07 ^ w[7]; + ctx->X[8] = X08 ^ w[8]; + ctx->X[9] = X09 ^ w[9]; ctx->X[10] = X10 ^ w[10]; ctx->X[11] = X11 ^ w[11]; ctx->X[12] = X12 ^ w[12]; @@ -1155,30 +1300,28 @@ static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,s ctx->X[14] = X14 ^ w[14]; ctx->X[15] = X15 ^ w[15]; - Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); ts[1] &= ~SKEIN_T1_FLAG_FIRST; blkPtr += SKEIN1024_BLOCK_BYTES; - } - while (--blkCnt); + } while(--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; - } +} #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t Skein1024_Process_Block_CodeSize(void) - { - return ((u08b_t *) Skein1024_Process_Block_CodeSize) - - ((u08b_t *) Skein1024_Process_Block); - } +{ + return ((u08b_t*)Skein1024_Process_Block_CodeSize) - + ((u08b_t*)Skein1024_Process_Block); +} static uint_t Skein1024_Unroll_Cnt(void) - { +{ return SKEIN_UNROLL_1024; - } +} #endif #endif - #if 0 /*****************************************************************/ /* 256-bit Skein */ @@ -1289,93 +1432,93 @@ static int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* process the input bytes */ -static int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) - { +static int Skein_256_Update(Skein_256_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt) +{ size_t n; - Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ /* process full blocks, if any */ - if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) + if(msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) + { + if(ctx->h.bCnt) /* finish up any buffered message data */ { - if 
(ctx->h.bCnt) /* finish up any buffered message data */ + n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if(n) { - n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ - if (n) - { - Skein_assert(n < msgByteCnt); /* check on our logic here */ - memcpy(&ctx->b[ctx->h.bCnt],msg,n); - msgByteCnt -= n; - msg += n; + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt], msg, n); + msgByteCnt -= n; + msg += n; ctx->h.bCnt += n; - } + } Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES); - Skein_256_Process_Block(ctx,ctx->b,1,SKEIN_256_BLOCK_BYTES); + Skein_256_Process_Block(ctx, ctx->b, 1, SKEIN_256_BLOCK_BYTES); ctx->h.bCnt = 0; - } + } /* now process any remaining full blocks, directly from input message data */ - if (msgByteCnt > SKEIN_256_BLOCK_BYTES) - { - n = (msgByteCnt-1) / SKEIN_256_BLOCK_BYTES; /* number of full blocks to process */ - Skein_256_Process_Block(ctx,msg,n,SKEIN_256_BLOCK_BYTES); + if(msgByteCnt > SKEIN_256_BLOCK_BYTES) + { + n = (msgByteCnt - 1) / SKEIN_256_BLOCK_BYTES; /* number of full blocks to process */ + Skein_256_Process_Block(ctx, msg, n, SKEIN_256_BLOCK_BYTES); msgByteCnt -= n * SKEIN_256_BLOCK_BYTES; - msg += n * SKEIN_256_BLOCK_BYTES; - } - Skein_assert(ctx->h.bCnt == 0); + msg += n * SKEIN_256_BLOCK_BYTES; } + Skein_assert(ctx->h.bCnt == 0); + } /* copy any remaining source message data bytes into b[] */ - if (msgByteCnt) - { + if(msgByteCnt) + { Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES); - memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt); ctx->h.bCnt += msgByteCnt; - } + } return SKEIN_SUCCESS; - } +} /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* finalize the hash computation and output the result */ -static int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal) - { - size_t i,n,byteCnt; +static int Skein_256_Final(Skein_256_Ctxt_t* ctx, u08b_t* hashVal) 
+{ + size_t i, n, byteCnt; u64b_t X[SKEIN_256_STATE_WORDS]; - Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ - ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ - if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */ - memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if(ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); - Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); /* process the final block */ /* now output the result */ - byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ /* run Threefish in "counter mode" to generate output */ - memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ - memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ - for (i=0;i < byteCnt;i += SKEIN_256_BLOCK_BYTES) - { - ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ - Skein_Start_New_Type(ctx,OUT_FINAL); - Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ - n = byteCnt - i; /* number of output bytes left to go */ - if (n >= SKEIN_256_BLOCK_BYTES) - n = SKEIN_256_BLOCK_BYTES; - Skein_Put64_LSB_First(hashVal+i,ctx->X,n); /* "output" the ctr mode bytes */ - Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES); - memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ - } - return SKEIN_SUCCESS; + memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X, ctx->X, sizeof(X)); /* 
keep a local copy of counter mode "key" */ + for(i = 0; i < byteCnt; i += SKEIN_256_BLOCK_BYTES) + { + ((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* build the counter block */ + Skein_Start_New_Type(ctx, OUT_FINAL); + Skein_256_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i; /* number of output bytes left to go */ + if(n >= SKEIN_256_BLOCK_BYTES) + n = SKEIN_256_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i, ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256, &ctx->h, n, hashVal + i * SKEIN_256_BLOCK_BYTES); + memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */ } + return SKEIN_SUCCESS; +} #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t Skein_256_API_CodeSize(void) - { - return ((u08b_t *) Skein_256_API_CodeSize) - - ((u08b_t *) Skein_256_Init); - } +{ + return ((u08b_t*)Skein_256_API_CodeSize) - + ((u08b_t*)Skein_256_Init); +} #endif /*****************************************************************/ @@ -1384,47 +1527,54 @@ static size_t Skein_256_API_CodeSize(void) /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* init the context for a straight hashing operation */ -static int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen) - { - union - { - u08b_t b[SKEIN_512_STATE_BYTES]; - u64b_t w[SKEIN_512_STATE_WORDS]; - } cfg; /* config block */ +static int Skein_512_Init(Skein_512_Ctxt_t* ctx, size_t hashBitLen) +{ + union { + u08b_t b[SKEIN_512_STATE_BYTES]; + u64b_t w[SKEIN_512_STATE_WORDS]; + } cfg; /* config block */ - Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); - ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ - switch (hashBitLen) - { /* use pre-computed values, where available */ + switch(hashBitLen) + { /* use pre-computed values, where available */ #ifndef SKEIN_NO_PRECOMP - case 512: 
memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X)); break; - case 384: memcpy(ctx->X,SKEIN_512_IV_384,sizeof(ctx->X)); break; - case 256: memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X)); break; - case 224: memcpy(ctx->X,SKEIN_512_IV_224,sizeof(ctx->X)); break; + case 512: + memcpy(ctx->X, SKEIN_512_IV_512, sizeof(ctx->X)); + break; + case 384: + memcpy(ctx->X, SKEIN_512_IV_384, sizeof(ctx->X)); + break; + case 256: + memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X)); + break; + case 224: + memcpy(ctx->X, SKEIN_512_IV_224, sizeof(ctx->X)); + break; #endif - default: - /* here if there is no precomputed IV value available */ - /* build/process the config block, type == CONFIG (could be precomputed) */ - Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ - - cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ - cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ - cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); - memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */ - - /* compute the initial chaining values from config block */ - memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */ - Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); - break; - } + default: + /* here if there is no precomputed IV value available */ + /* build/process the config block, type == CONFIG (could be precomputed) */ + Skein_Start_New_Type(ctx, CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ + + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + memset(&cfg.w[3], 0, sizeof(cfg) - 3 * sizeof(cfg.w[0])); /* zero pad config block */ + + /* compute the initial chaining values from config block */ + memset(ctx->X, 0, sizeof(ctx->X)); /* zero the chaining variables */ + Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + 
break; + } /* The chaining vars ctx->X are now initialized for the given hashBitLen. */ /* Set up to process the data message portion of the hash (default) */ - Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */ return SKEIN_SUCCESS; - } +} #if 0 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ @@ -1489,93 +1639,93 @@ static int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* process the input bytes */ -static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) - { +static int Skein_512_Update(Skein_512_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt) +{ size_t n; - Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ /* process full blocks, if any */ - if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + if(msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + { + if(ctx->h.bCnt) /* finish up any buffered message data */ { - if (ctx->h.bCnt) /* finish up any buffered message data */ + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if(n) { - n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ - if (n) - { - Skein_assert(n < msgByteCnt); /* check on our logic here */ - memcpy(&ctx->b[ctx->h.bCnt],msg,n); - msgByteCnt -= n; - msg += n; + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt], msg, n); + msgByteCnt -= n; + msg += n; ctx->h.bCnt += n; - } + } Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES); - Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + Skein_512_Process_Block(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES); ctx->h.bCnt = 0; - } + } /* now process any remaining full blocks, directly from input message data 
*/ - if (msgByteCnt > SKEIN_512_BLOCK_BYTES) - { - n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process */ - Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES); + if(msgByteCnt > SKEIN_512_BLOCK_BYTES) + { + n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process */ + Skein_512_Process_Block(ctx, msg, n, SKEIN_512_BLOCK_BYTES); msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; - msg += n * SKEIN_512_BLOCK_BYTES; - } - Skein_assert(ctx->h.bCnt == 0); + msg += n * SKEIN_512_BLOCK_BYTES; } + Skein_assert(ctx->h.bCnt == 0); + } /* copy any remaining source message data bytes into b[] */ - if (msgByteCnt) - { + if(msgByteCnt) + { Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES); - memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt); ctx->h.bCnt += msgByteCnt; - } + } return SKEIN_SUCCESS; - } +} /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* finalize the hash computation and output the result */ -static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) - { - size_t i,n,byteCnt; +static int Skein_512_Final(Skein_512_Ctxt_t* ctx, u08b_t* hashVal) +{ + size_t i, n, byteCnt; u64b_t X[SKEIN_512_STATE_WORDS]; - Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ - ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ - if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ - memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if(ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); - Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + Skein_512_Process_Block(ctx, 
ctx->b, 1, ctx->h.bCnt); /* process the final block */ /* now output the result */ - byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ /* run Threefish in "counter mode" to generate output */ - memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ - memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ - for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) - { - ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ - Skein_Start_New_Type(ctx,OUT_FINAL); - Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ - n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ - if (n >= SKEIN_512_BLOCK_BYTES) - n = SKEIN_512_BLOCK_BYTES; - Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ - Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES); - memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ - } - return SKEIN_SUCCESS; + memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X, ctx->X, sizeof(X)); /* keep a local copy of counter mode "key" */ + for(i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) + { + ((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* build the counter block */ + Skein_Start_New_Type(ctx, OUT_FINAL); + Skein_512_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i * SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ + if(n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES, ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(512, &ctx->h, n, hashVal + i * SKEIN_512_BLOCK_BYTES); + memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */ } + return SKEIN_SUCCESS; +} #if 
defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t Skein_512_API_CodeSize(void) - { - return ((u08b_t *) Skein_512_API_CodeSize) - - ((u08b_t *) Skein_512_Init); - } +{ + return ((u08b_t*)Skein_512_API_CodeSize) - + ((u08b_t*)Skein_512_Init); +} #endif /*****************************************************************/ @@ -1583,46 +1733,51 @@ static size_t Skein_512_API_CodeSize(void) /*****************************************************************/ /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* init the context for a straight hashing operation */ -static int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen) - { - union - { - u08b_t b[SKEIN1024_STATE_BYTES]; - u64b_t w[SKEIN1024_STATE_WORDS]; - } cfg; /* config block */ +static int Skein1024_Init(Skein1024_Ctxt_t* ctx, size_t hashBitLen) +{ + union { + u08b_t b[SKEIN1024_STATE_BYTES]; + u64b_t w[SKEIN1024_STATE_WORDS]; + } cfg; /* config block */ - Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); - ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ - switch (hashBitLen) - { /* use pre-computed values, where available */ + switch(hashBitLen) + { /* use pre-computed values, where available */ #ifndef SKEIN_NO_PRECOMP - case 512: memcpy(ctx->X,SKEIN1024_IV_512 ,sizeof(ctx->X)); break; - case 384: memcpy(ctx->X,SKEIN1024_IV_384 ,sizeof(ctx->X)); break; - case 1024: memcpy(ctx->X,SKEIN1024_IV_1024,sizeof(ctx->X)); break; + case 512: + memcpy(ctx->X, SKEIN1024_IV_512, sizeof(ctx->X)); + break; + case 384: + memcpy(ctx->X, SKEIN1024_IV_384, sizeof(ctx->X)); + break; + case 1024: + memcpy(ctx->X, SKEIN1024_IV_1024, sizeof(ctx->X)); + break; #endif - default: - /* here if there is no precomputed IV value available */ - /* build/process the config block, type == CONFIG (could be precomputed) */ - Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | 
FINAL */ - - cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ - cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ - cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); - memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */ - - /* compute the initial chaining values from config block */ - memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */ - Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); - break; - } + default: + /* here if there is no precomputed IV value available */ + /* build/process the config block, type == CONFIG (could be precomputed) */ + Skein_Start_New_Type(ctx, CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ + + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + memset(&cfg.w[3], 0, sizeof(cfg) - 3 * sizeof(cfg.w[0])); /* zero pad config block */ + + /* compute the initial chaining values from config block */ + memset(ctx->X, 0, sizeof(ctx->X)); /* zero the chaining variables */ + Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + break; + } /* The chaining vars ctx->X are now initialized for the given hashBitLen. 
*/ /* Set up to process the data message portion of the hash (default) */ - Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */ return SKEIN_SUCCESS; - } +} #if 0 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ @@ -1687,93 +1842,93 @@ static int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* process the input bytes */ -static int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) - { +static int Skein1024_Update(Skein1024_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt) +{ size_t n; - Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ /* process full blocks, if any */ - if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) + if(msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) + { + if(ctx->h.bCnt) /* finish up any buffered message data */ { - if (ctx->h.bCnt) /* finish up any buffered message data */ + n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if(n) { - n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ - if (n) - { - Skein_assert(n < msgByteCnt); /* check on our logic here */ - memcpy(&ctx->b[ctx->h.bCnt],msg,n); - msgByteCnt -= n; - msg += n; + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt], msg, n); + msgByteCnt -= n; + msg += n; ctx->h.bCnt += n; - } + } Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES); - Skein1024_Process_Block(ctx,ctx->b,1,SKEIN1024_BLOCK_BYTES); + Skein1024_Process_Block(ctx, ctx->b, 1, SKEIN1024_BLOCK_BYTES); ctx->h.bCnt = 0; - } + } /* now process any remaining full blocks, directly from input message data */ - if (msgByteCnt > SKEIN1024_BLOCK_BYTES) - { - n = (msgByteCnt-1) / 
SKEIN1024_BLOCK_BYTES; /* number of full blocks to process */ - Skein1024_Process_Block(ctx,msg,n,SKEIN1024_BLOCK_BYTES); + if(msgByteCnt > SKEIN1024_BLOCK_BYTES) + { + n = (msgByteCnt - 1) / SKEIN1024_BLOCK_BYTES; /* number of full blocks to process */ + Skein1024_Process_Block(ctx, msg, n, SKEIN1024_BLOCK_BYTES); msgByteCnt -= n * SKEIN1024_BLOCK_BYTES; - msg += n * SKEIN1024_BLOCK_BYTES; - } - Skein_assert(ctx->h.bCnt == 0); + msg += n * SKEIN1024_BLOCK_BYTES; } + Skein_assert(ctx->h.bCnt == 0); + } /* copy any remaining source message data bytes into b[] */ - if (msgByteCnt) - { + if(msgByteCnt) + { Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES); - memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt); ctx->h.bCnt += msgByteCnt; - } + } return SKEIN_SUCCESS; - } +} /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* finalize the hash computation and output the result */ -static int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) - { - size_t i,n,byteCnt; +static int Skein1024_Final(Skein1024_Ctxt_t* ctx, u08b_t* hashVal) +{ + size_t i, n, byteCnt; u64b_t X[SKEIN1024_STATE_WORDS]; - Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ - ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ - if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */ - memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if(ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt], 0, SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); - Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); /* process the final block */ /* now output the 
result */ - byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ /* run Threefish in "counter mode" to generate output */ - memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ - memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ - for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++) - { - ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ - Skein_Start_New_Type(ctx,OUT_FINAL); - Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ - n = byteCnt - i*SKEIN1024_BLOCK_BYTES; /* number of output bytes left to go */ - if (n >= SKEIN1024_BLOCK_BYTES) - n = SKEIN1024_BLOCK_BYTES; - Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ - Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES); - memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ - } - return SKEIN_SUCCESS; + memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X, ctx->X, sizeof(X)); /* keep a local copy of counter mode "key" */ + for(i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++) + { + ((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* build the counter block */ + Skein_Start_New_Type(ctx, OUT_FINAL); + Skein1024_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i * SKEIN1024_BLOCK_BYTES; /* number of output bytes left to go */ + if(n >= SKEIN1024_BLOCK_BYTES) + n = SKEIN1024_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES, ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(1024, &ctx->h, n, hashVal + i * SKEIN1024_BLOCK_BYTES); + memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */ } + return SKEIN_SUCCESS; +} #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t 
Skein1024_API_CodeSize(void) - { - return ((u08b_t *) Skein1024_API_CodeSize) - - ((u08b_t *) Skein1024_Init); - } +{ + return ((u08b_t*)Skein1024_API_CodeSize) - + ((u08b_t*)Skein1024_Init); +} #endif /**************** Functions to support MAC/tree hashing ***************/ @@ -1828,7 +1983,6 @@ static int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) return SKEIN_SUCCESS; } - #if SKEIN_TREE_HASH /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* just do the OUTPUT stage */ @@ -1921,116 +2075,126 @@ static int Skein1024_Output(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) typedef struct { - uint_t statebits; /* 256, 512, or 1024 */ - union - { - Skein_Ctxt_Hdr_t h; /* common header "overlay" */ - Skein_256_Ctxt_t ctx_256; - Skein_512_Ctxt_t ctx_512; - Skein1024_Ctxt_t ctx1024; - } u; -} -hashState; + uint_t statebits; /* 256, 512, or 1024 */ + union { + Skein_Ctxt_Hdr_t h; /* common header "overlay" */ + Skein_256_Ctxt_t ctx_256; + Skein_512_Ctxt_t ctx_512; + Skein1024_Ctxt_t ctx1024; + } u; +} hashState; /* "incremental" hashing API */ -static SkeinHashReturn Init (hashState *state, int hashbitlen); -static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen); -static SkeinHashReturn Final (hashState *state, SkeinBitSequence *hashval); +static SkeinHashReturn Init(hashState* state, int hashbitlen); +static SkeinHashReturn Update(hashState* state, const SkeinBitSequence* data, SkeinDataLength databitlen); +static SkeinHashReturn Final(hashState* state, SkeinBitSequence* hashval); /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* select the context size and init the context */ -static SkeinHashReturn Init(hashState *state, int hashbitlen) +static SkeinHashReturn Init(hashState* state, int hashbitlen) { #if SKEIN_256_NIST_MAX_HASH_BITS - if (hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS) - { - Skein_Assert(hashbitlen > 0,BAD_HASHLEN); - state->statebits = 
64*SKEIN_256_STATE_WORDS; - return Skein_256_Init(&state->u.ctx_256,(size_t) hashbitlen); - } -#endif - if (hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS) - { - state->statebits = 64*SKEIN_512_STATE_WORDS; - return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen); - } - else - { - state->statebits = 64*SKEIN1024_STATE_WORDS; - return Skein1024_Init(&state->u.ctx1024,(size_t) hashbitlen); - } + if(hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS) + { + Skein_Assert(hashbitlen > 0, BAD_HASHLEN); + state->statebits = 64 * SKEIN_256_STATE_WORDS; + return Skein_256_Init(&state->u.ctx_256, (size_t)hashbitlen); + } +#endif + if(hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS) + { + state->statebits = 64 * SKEIN_512_STATE_WORDS; + return Skein_512_Init(&state->u.ctx_512, (size_t)hashbitlen); + } + else + { + state->statebits = 64 * SKEIN1024_STATE_WORDS; + return Skein1024_Init(&state->u.ctx1024, (size_t)hashbitlen); + } } /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* process data to be hashed */ -static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen) +static SkeinHashReturn Update(hashState* state, const SkeinBitSequence* data, SkeinDataLength databitlen) { - /* only the final Update() call is allowed do partial bytes, else assert an error */ - Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, SKEIN_FAIL); + /* only the final Update() call is allowed do partial bytes, else assert an error */ + Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, SKEIN_FAIL); - Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,SKEIN_FAIL); - if ((databitlen & 7) == 0) /* partial bytes? */ - { - switch ((state->statebits >> 8) & 3) + Skein_Assert(state->statebits % 256 == 0 && (state->statebits - 256) < 1024, SKEIN_FAIL); + if((databitlen & 7) == 0) /* partial bytes? 
*/ { - case 2: return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3); - case 1: return Skein_256_Update(&state->u.ctx_256,data,databitlen >> 3); - case 0: return Skein1024_Update(&state->u.ctx1024,data,databitlen >> 3); - default: return SKEIN_FAIL; + switch((state->statebits >> 8) & 3) + { + case 2: + return Skein_512_Update(&state->u.ctx_512, data, databitlen >> 3); + case 1: + return Skein_256_Update(&state->u.ctx_256, data, databitlen >> 3); + case 0: + return Skein1024_Update(&state->u.ctx1024, data, databitlen >> 3); + default: + return SKEIN_FAIL; + } } - } - else - { /* handle partial final byte */ - size_t bCnt = (databitlen >> 3) + 1; /* number of bytes to handle (nonzero here!) */ - u08b_t b,mask; + else + { /* handle partial final byte */ + size_t bCnt = (databitlen >> 3) + 1; /* number of bytes to handle (nonzero here!) */ + u08b_t b, mask; - mask = (u08b_t) (1u << (7 - (databitlen & 7))); /* partial byte bit mask */ - b = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask); /* apply bit padding on final byte */ + mask = (u08b_t)(1u << (7 - (databitlen & 7))); /* partial byte bit mask */ + b = (u08b_t)((data[bCnt - 1] & (0 - mask)) | mask); /* apply bit padding on final byte */ - switch ((state->statebits >> 8) & 3) - { - case 2: Skein_512_Update(&state->u.ctx_512,data,bCnt-1); /* process all but the final byte */ - Skein_512_Update(&state->u.ctx_512,&b , 1 ); /* process the (masked) partial byte */ - break; - case 1: Skein_256_Update(&state->u.ctx_256,data,bCnt-1); /* process all but the final byte */ - Skein_256_Update(&state->u.ctx_256,&b , 1 ); /* process the (masked) partial byte */ - break; - case 0: Skein1024_Update(&state->u.ctx1024,data,bCnt-1); /* process all but the final byte */ - Skein1024_Update(&state->u.ctx1024,&b , 1 ); /* process the (masked) partial byte */ - break; - default: return SKEIN_FAIL; - } - Skein_Set_Bit_Pad_Flag(state->u.h); /* set tweak flag for the final call */ + switch((state->statebits >> 8) & 3) + { + case 2: + 
Skein_512_Update(&state->u.ctx_512, data, bCnt - 1); /* process all but the final byte */ + Skein_512_Update(&state->u.ctx_512, &b, 1); /* process the (masked) partial byte */ + break; + case 1: + Skein_256_Update(&state->u.ctx_256, data, bCnt - 1); /* process all but the final byte */ + Skein_256_Update(&state->u.ctx_256, &b, 1); /* process the (masked) partial byte */ + break; + case 0: + Skein1024_Update(&state->u.ctx1024, data, bCnt - 1); /* process all but the final byte */ + Skein1024_Update(&state->u.ctx1024, &b, 1); /* process the (masked) partial byte */ + break; + default: + return SKEIN_FAIL; + } + Skein_Set_Bit_Pad_Flag(state->u.h); /* set tweak flag for the final call */ - return SKEIN_SUCCESS; - } + return SKEIN_SUCCESS; + } } /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* finalize hash computation and output the result (hashbitlen bits) */ -static SkeinHashReturn Final(hashState *state, SkeinBitSequence *hashval) +static SkeinHashReturn Final(hashState* state, SkeinBitSequence* hashval) { - Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL); - switch ((state->statebits >> 8) & 3) - { - case 2: return Skein_512_Final(&state->u.ctx_512,hashval); - case 1: return Skein_256_Final(&state->u.ctx_256,hashval); - case 0: return Skein1024_Final(&state->u.ctx1024,hashval); - default: return SKEIN_FAIL; - } + Skein_Assert(state->statebits % 256 == 0 && (state->statebits - 256) < 1024, FAIL); + switch((state->statebits >> 8) & 3) + { + case 2: + return Skein_512_Final(&state->u.ctx_512, hashval); + case 1: + return Skein_256_Final(&state->u.ctx_256, hashval); + case 0: + return Skein1024_Final(&state->u.ctx1024, hashval); + default: + return SKEIN_FAIL; + } } /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* all-in-one hash function */ -SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence *data, /* all-in-one call */ - SkeinDataLength databitlen,SkeinBitSequence *hashval) 
+SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence* data, /* all-in-one call */ + SkeinDataLength databitlen, SkeinBitSequence* hashval) { - hashState state; - SkeinHashReturn r = Init(&state,hashbitlen); - if (r == SKEIN_SUCCESS) - { /* these calls do not fail when called properly */ - r = Update(&state,data,databitlen); - Final(&state,hashval); - } - return r; + hashState state; + SkeinHashReturn r = Init(&state, hashbitlen); + if(r == SKEIN_SUCCESS) + { /* these calls do not fail when called properly */ + r = Update(&state, data, databitlen); + Final(&state, hashval); + } + return r; } diff --git a/xmrstak/backend/cpu/crypto/c_skein.h b/xmrstak/backend/cpu/crypto/c_skein.h index 1aa11dea3..52f359e82 100644 --- a/xmrstak/backend/cpu/crypto/c_skein.h +++ b/xmrstak/backend/cpu/crypto/c_skein.h @@ -1,5 +1,5 @@ #ifndef _SKEIN_H_ -#define _SKEIN_H_ 1 +#define _SKEIN_H_ 1 /************************************************************************** ** ** Interface declarations and internal definitions for Skein hashing. 
@@ -27,21 +27,20 @@ ** 1: return SKEIN_FAIL to flag errors ** ***************************************************************************/ -#include "skein_port.h" /* get platform-specific definitions */ +#include "skein_port.h" /* get platform-specific definitions */ typedef enum { - SKEIN_SUCCESS = 0, /* return codes from Skein calls */ - SKEIN_FAIL = 1, - SKEIN_BAD_HASHLEN = 2 -} -SkeinHashReturn; + SKEIN_SUCCESS = 0, /* return codes from Skein calls */ + SKEIN_FAIL = 1, + SKEIN_BAD_HASHLEN = 2 +} SkeinHashReturn; -typedef uint32_t SkeinDataLength; /* bit count type */ -typedef u08b_t SkeinBitSequence; /* bit stream type */ +typedef uint32_t SkeinDataLength; /* bit count type */ +typedef u08b_t SkeinBitSequence; /* bit stream type */ /* "all-in-one" call */ -SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence *data, - SkeinDataLength databitlen, SkeinBitSequence *hashval); +SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence* data, + SkeinDataLength databitlen, SkeinBitSequence* hashval); -#endif /* ifndef _SKEIN_H_ */ +#endif /* ifndef _SKEIN_H_ */ diff --git a/xmrstak/backend/cpu/crypto/cn_gpu.hpp b/xmrstak/backend/cpu/crypto/cn_gpu.hpp index 5844d3814..2d333d118 100644 --- a/xmrstak/backend/cpu/crypto/cn_gpu.hpp +++ b/xmrstak/backend/cpu/crypto/cn_gpu.hpp @@ -4,8 +4,8 @@ #include #if defined(_WIN32) || defined(_WIN64) -#include #include +#include #define HAS_WIN_INTRIN_API #endif diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp index 79b38373a..efded74c8 100644 --- a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp +++ b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp @@ -1,12 +1,12 @@ -#include "cn_gpu.hpp" #include "../../cryptonight.hpp" +#include "cn_gpu.hpp" -#pragma GCC target ("avx2") +#pragma GCC target("avx2") #ifndef _mm256_bslli_epi128 - #define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count)) +#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count)) #endif 
#ifndef _mm256_bsrli_epi128 - #define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count)) +#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count)) #endif inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01) @@ -67,7 +67,7 @@ inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2, // 112×4 = 448 template inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, - float lcnt, float hcnt, const __m256& rnd_c, __m256& sum) + float lcnt, float hcnt, const __m256& rnd_c, __m256& sum) { __m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1); __m256 r = _mm256_setzero_ps(); @@ -92,7 +92,7 @@ inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& template inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, - float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out) + float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out) { __m256i r = double_comupte(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum); if(rot != 0) @@ -101,9 +101,7 @@ inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256 out = _mm256_xor_si256(out, r); } - -inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m256i*>(lpad + (idx & mask) + n*16); } - +inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m256i*>(lpad + (idx & mask) + n * 16); } void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo) { @@ -155,7 +153,7 @@ void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& al sum1 = _mm256_add_ps(suma, sumb); out2 = _mm256_xor_si256(out2, out); - out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2,out2,0x41), out2); + out2 = 
_mm256_xor_si256(_mm256_permute2x128_si256(out2, out2, 0x41), out2); suma = _mm256_permute2f128_ps(sum0, sum1, 0x30); sumb = _mm256_permute2f128_ps(sum0, sum1, 0x21); sum0 = _mm256_add_ps(suma, sumb); diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp index c8627d8b8..d65d9651e 100644 --- a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp +++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp @@ -1,7 +1,7 @@ -#include "cn_gpu.hpp" #include "../../cryptonight.hpp" +#include "cn_gpu.hpp" -#pragma GCC target ("sse2") +#pragma GCC target("sse2") inline void prep_dv(__m128i* idx, __m128i& v, __m128& n) { @@ -21,13 +21,13 @@ inline void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, { n1 = _mm_add_ps(n1, c); __m128 nn = _mm_mul_ps(n0, c); - nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn, nn)); nn = fma_break(nn); n = _mm_add_ps(n, nn); n3 = _mm_sub_ps(n3, c); __m128 dd = _mm_mul_ps(n2, c); - dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd, dd)); dd = fma_break(dd); d = _mm_add_ps(d, dd); @@ -57,12 +57,12 @@ inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 d = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFF7FFFFF)), d); d = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), d); - r =_mm_add_ps(r, _mm_div_ps(n,d)); + r = _mm_add_ps(r, _mm_div_ps(n, d)); } // 112×4 = 448 -template -inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) +template +inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) { __m128 c = _mm_set1_ps(cnt); __m128 r = _mm_setzero_ps(); @@ -85,8 +85,8 @@ inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, floa return _mm_cvttps_epi32(r); } -template -inline 
void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) +template +inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) { __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); if(rot != 0) @@ -94,7 +94,7 @@ inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, flo out = _mm_xor_si128(out, r); } -inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m128i*>(lpad + (idx & mask) + n*16); } +inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m128i*>(lpad + (idx & mask) + n * 16); } void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo) { diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h index 488805ec0..2a91269f8 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight.h +++ b/xmrstak/backend/cpu/crypto/cryptonight.h @@ -1,6 +1,6 @@ #pragma once -#include #include +#include #include "variant4_random_math.h" @@ -12,8 +12,8 @@ struct cryptonight_ctx; -typedef void (*cn_mainloop_fun)(cryptonight_ctx *ctx); -typedef void (*cn_double_mainloop_fun)(cryptonight_ctx*, cryptonight_ctx*); +typedef void (*cn_mainloop_fun)(cryptonight_ctx* ctx); +typedef void (*cn_double_mainloop_fun)(cryptonight_ctx*, cryptonight_ctx*); typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size); @@ -36,8 +36,7 @@ struct cryptonight_ctx int asm_version = 0; xmrstak_algo last_algo = invalid_algo; - union - { + union { extra_ctx_r cn_r_ctx; }; @@ -51,5 +50,3 @@ struct alloc_msg size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t 
use_mlock, alloc_msg* msg); void cryptonight_free_ctx(cryptonight_ctx* ctx); - - diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index d7316b25e..6c9e3390c 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -15,22 +15,24 @@ */ #pragma once -#include "cryptonight.h" -#include "xmrstak/backend/cryptonight.hpp" #include "../../miner_work.hpp" #include "cn_gpu.hpp" +#include "cryptonight.h" +#include "xmrstak/backend/cryptonight.hpp" +#include #include #include -#include #include #ifdef _WIN64 -# include -# include -# include -# include +#include +// this comment disable clang include reordering +#include +#include +// this comment disable clang include reordering for windows.h +#include #else -# include +#include #endif #ifdef __GNUC__ @@ -54,9 +56,9 @@ static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi) extern "C" { - void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen); + void keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen); void keccakf(uint64_t st[25], int rounds); - extern void(*const extra_hashes[4])(const void *, uint32_t, char *); + extern void (*const extra_hashes[4])(const void*, uint32_t, char*); } // This will shift and xor tmp1 into itself as 4 32-bit vals such as @@ -73,7 +75,7 @@ static inline __m128i sl_xor(__m128i tmp1) return tmp1; } -template +template static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2) { __m128i xout1 = _mm_aeskeygenassist_si128(*xout2, rcon); @@ -98,14 +100,14 @@ static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t r *xout2 = _mm_xor_si128(*xout2, xout1); } -template +template static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) { __m128i xout0, xout2; xout0 = _mm_load_si128(memory); - xout2 = 
_mm_load_si128(memory+1); + xout2 = _mm_load_si128(memory + 1); *k0 = xout0; *k1 = xout2; @@ -175,7 +177,7 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3 x7 = _mm_xor_si128(x7, tmp0); } -template +template void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo) { constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast; @@ -197,7 +199,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ if(HEAVY_MIX) { - for(size_t i=0; i < 16; i++) + for(size_t i = 0; i < 16; i++) { if(SOFT_AES) { @@ -230,7 +232,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ } const size_t MEM = algo.Mem(); - for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { if(SOFT_AES) { @@ -277,29 +279,29 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ } } -template +template void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output, const xmrstak_algo& algo) { constexpr size_t hash_size = 200; // 25x8 bytes alignas(128) uint64_t hash[25]; const size_t mem = algo.Mem(); - for (uint64_t i = 0; i < mem / 512; i++) + for(uint64_t i = 0; i < mem / 512; i++) { memcpy(hash, input, hash_size); hash[0] ^= i; keccakf(hash, 24); memcpy(output, hash, 160); - output+=160; + output += 160; keccakf(hash, 24); memcpy(output, hash, 176); - output+=176; + output += 176; keccakf(hash, 24); memcpy(output, hash, 176); - output+=176; + output += 176; if(PREFETCH) { @@ -311,11 +313,11 @@ void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output, const xmrs } } -template +template void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo) { constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || - ALGO == cryptonight_bittube2 
|| ALGO == cryptonight_superfast || ALGO == cryptonight_gpu; + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu; // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; @@ -333,7 +335,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ xout7 = _mm_load_si128(output + 11); const size_t MEM = algo.Mem(); - for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { if(PREFETCH) _mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA); @@ -384,7 +386,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ if(HEAVY_MIX) { - for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { if(PREFETCH) _mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA); @@ -433,7 +435,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } - for(size_t i=0; i < 16; i++) + for(size_t i = 0; i < 16; i++) { if(SOFT_AES) { @@ -494,7 +496,8 @@ inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0) #else // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence // Fallback to simpler code - if (x2 < n0) ++r; + if(x2 < n0) + ++r; #endif return r; } @@ -505,7 +508,7 @@ inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) alignas(16) uint32_t x[4]; _mm_store_si128((__m128i*)k, key); _mm_store_si128((__m128i*)x, _mm_xor_si128(val, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))); // x = ~val - #define BYTE(p, i) ((unsigned char*)&p)[i] +#define BYTE(p, i) ((unsigned char*)&p)[i] k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)]; x[0] ^= 
k[0]; k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)]; @@ -513,11 +516,11 @@ inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)]; x[2] ^= k[2]; k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; - #undef BYTE +#undef BYTE return _mm_load_si128((__m128i*)k); } -template +template inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) { mem_out[0] = _mm_cvtsi128_si64(tmp); @@ -541,7 +544,6 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) mem_out[1] = vh; } - } /** optimal type for sqrt @@ -550,18 +552,18 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) * * @tparam N number of hashes per thread */ -template +template struct GetOptimalSqrtType { using type = __m128i; }; -template<> +template <> struct GetOptimalSqrtType<1u> { using type = uint64_t; }; -template +template using GetOptimalSqrtType_t = typename GetOptimalSqrtType::type; /** assign a value and convert if necessary @@ -625,273 +627,275 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc)); } -#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) \ - /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ +#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) \ + /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ - { \ - const uint64_t idx1 = idx0 & MASK; \ - const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ - const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ - const __m128i chunk3 = 
_mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ - if (ALGO == cryptonight_r) \ - cx = _mm_xor_si128(_mm_xor_si128(cx, chunk3), _mm_xor_si128(chunk1, chunk2)); \ - } \ - if(ALGO == cryptonight_v8_reversewaltz) \ - { \ - const uint64_t idx1 = idx0 & MASK; \ - const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ - const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ - const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]); \ + const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]); \ + const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + if(ALGO == cryptonight_r) \ + cx = _mm_xor_si128(_mm_xor_si128(cx, chunk3), _mm_xor_si128(chunk1, chunk2)); \ + } \ + if(ALGO == cryptonight_v8_reversewaltz) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]); \ + const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]); \ + const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); 
\ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ } -#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \ - /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ - if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r_wow) \ - { \ - const uint64_t idx1 = idx0 & MASK; \ - const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ - const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ - hi ^= ((uint64_t*)&chunk2)[0]; \ - lo ^= ((uint64_t*)&chunk2)[1]; \ - const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ - } \ - if(ALGO == cryptonight_v8_reversewaltz) \ - { \ - const uint64_t idx1 = idx0 & MASK; \ - const __m128i chunk3 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ - const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ - hi ^= ((uint64_t*)&chunk2)[0]; \ - lo ^= ((uint64_t*)&chunk2)[1]; \ - const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ +#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \ + /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r_wow) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ + const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]); \ + 
hi ^= ((uint64_t*)&chunk2)[0]; \ + lo ^= ((uint64_t*)&chunk2)[1]; \ + const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } \ + if(ALGO == cryptonight_v8_reversewaltz) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk3 = _mm_xor_si128(_mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ + const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]); \ + hi ^= ((uint64_t*)&chunk2)[0]; \ + lo ^= ((uint64_t*)&chunk2)[1]; \ + const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ } -#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ - if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \ - { \ - uint64_t sqrt_result_tmp; \ - assign(sqrt_result_tmp, sqrt_result); \ - /* Use division and square root results from the _previous_ iteration to hide the latency */ \ - const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \ - cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \ - const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \ - /* Most and least significant bits in the divisor are set to 1 \ - * to make sure we don't divide by a small or even number, \ - * so there are no shortcuts for such cases \ - * \ - * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \ - * We drop the highest bit to fit both quotient and remainder in 32 bits \ - */ \ - /* Compiler will optimize it to a single div instruction */ \ - const uint64_t cx_s = 
_mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ - const uint64_t division_result = static_cast(cx_s / d) + ((cx_s % d) << 32); \ - division_result_xmm = _mm_cvtsi64_si128(static_cast(division_result)); \ +#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \ + { \ + uint64_t sqrt_result_tmp; \ + assign(sqrt_result_tmp, sqrt_result); \ + /* Use division and square root results from the _previous_ iteration to hide the latency */ \ + const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \ + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \ + const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \ + /* Most and least significant bits in the divisor are set to 1 \ + * to make sure we don't divide by a small or even number, \ + * so there are no shortcuts for such cases \ + * \ + * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \ + * We drop the highest bit to fit both quotient and remainder in 32 bits \ + */ \ + /* Compiler will optimize it to a single div instruction */ \ + const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + const uint64_t division_result = static_cast(cx_s / d) + ((cx_s % d) << 32); \ + division_result_xmm = _mm_cvtsi64_si128(static_cast(division_result)); \ /* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \ - assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \ + assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \ } -#define CN_R_RANDOM_MATH(n, al, ah, cl, bx0, bx1, cn_r_data) \ - if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ - { \ +#define CN_R_RANDOM_MATH(n, al, ah, cl, bx0, bx1, cn_r_data) \ + if(ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ + { \ cl ^= (cn_r_data[0] + cn_r_data[1]) | ((uint64_t)(cn_r_data[2] + cn_r_data[3]) << 32); \ - 
cn_r_data[4] = static_cast(al); \ - cn_r_data[5] = static_cast(ah); \ - cn_r_data[6] = static_cast(_mm_cvtsi128_si32(bx0)); \ - cn_r_data[7] = static_cast(_mm_cvtsi128_si32(bx1)); \ - cn_r_data[8] = static_cast(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \ - v4_random_math(ctx[n]->cn_r_ctx.code, cn_r_data); \ - } \ - if (ALGO == cryptonight_r) \ - { \ - al ^= cn_r_data[2] | ((uint64_t)(cn_r_data[3]) << 32); \ - ah ^= cn_r_data[0] | ((uint64_t)(cn_r_data[1]) << 32); \ + cn_r_data[4] = static_cast(al); \ + cn_r_data[5] = static_cast(ah); \ + cn_r_data[6] = static_cast(_mm_cvtsi128_si32(bx0)); \ + cn_r_data[7] = static_cast(_mm_cvtsi128_si32(bx1)); \ + cn_r_data[8] = static_cast(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \ + v4_random_math(ctx[n]->cn_r_ctx.code, cn_r_data); \ + } \ + if(ALGO == cryptonight_r) \ + { \ + al ^= cn_r_data[2] | ((uint64_t)(cn_r_data[3]) << 32); \ + ah ^= cn_r_data[0] | ((uint64_t)(cn_r_data[1]) << 32); \ } -#define CN_INIT_SINGLE \ +#define CN_INIT_SINGLE \ if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \ - { \ - memset(output, 0, 32 * N); \ - return; \ + { \ + memset(output, 0, 32 * N); \ + return; \ } -#define CN_INIT(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data) \ - keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ - uint64_t monero_const; \ +#define CN_INIT(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data) \ + keccak((const uint8_t*)input + len * n, len, ctx[n]->hash_state, 200); \ + uint64_t monero_const; \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - { \ - monero_const = *reinterpret_cast(reinterpret_cast(input) + len 
* n + 35); \ - monero_const ^= *(reinterpret_cast(ctx[n]->hash_state) + 24); \ - } \ - /* Optim - 99% time boundary */ \ - cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state, algo); \ - \ - __m128i ax0; \ - uint64_t idx0; \ - __m128i bx0; \ - uint8_t* l0 = ctx[n]->long_state; \ - /* BEGIN cryptonight_monero_v8 variables */ \ - __m128i bx1; \ - __m128i division_result_xmm; \ - __m128 conc_var; \ - if(ALGO == cryptonight_conceal) \ - {\ - set_float_rounding_mode_nearest(); \ - conc_var = _mm_setzero_ps(); \ - }\ - GetOptimalSqrtType_t sqrt_result; \ - uint32_t cn_r_data[9]; \ - /* END cryptonight_monero_v8 variables */ \ - { \ - uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ - idx0 = h0[0] ^ h0[4]; \ - ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ - bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ - if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \ - { \ - bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ - division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ - assign(sqrt_result, h0[13]); \ - set_float_rounding_mode(); \ - } \ - if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ - { \ - bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ - cn_r_data[0] = (uint32_t)(h0[12]); \ - cn_r_data[1] = (uint32_t)(h0[12] >> 32); \ - cn_r_data[2] = (uint32_t)(h0[13]); \ - cn_r_data[3] = (uint32_t)(h0[13] >> 32); \ - } \ - } \ - __m128i *ptr0 + { \ + monero_const = *reinterpret_cast(reinterpret_cast(input) + len * n + 35); \ + monero_const ^= *(reinterpret_cast(ctx[n]->hash_state) + 24); \ + } \ + /* Optim - 99% time boundary */ \ + cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state, algo); \ + \ + __m128i ax0; \ + uint64_t idx0; \ + __m128i bx0; \ + uint8_t* l0 = ctx[n]->long_state; \ + /* BEGIN cryptonight_monero_v8 variables */ \ + __m128i bx1; \ + __m128i division_result_xmm; \ + __m128 conc_var; \ + if(ALGO == cryptonight_conceal) \ + { \ + 
set_float_rounding_mode_nearest(); \ + conc_var = _mm_setzero_ps(); \ + } \ + GetOptimalSqrtType_t sqrt_result; \ + uint32_t cn_r_data[9]; \ + /* END cryptonight_monero_v8 variables */ \ + { \ + uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ + idx0 = h0[0] ^ h0[4]; \ + ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ + bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \ + { \ + bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ + division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ + assign(sqrt_result, h0[13]); \ + set_float_rounding_mode(); \ + } \ + if(ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ + { \ + bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ + cn_r_data[0] = (uint32_t)(h0[12]); \ + cn_r_data[1] = (uint32_t)(h0[12] >> 32); \ + cn_r_data[2] = (uint32_t)(h0[13]); \ + cn_r_data[3] = (uint32_t)(h0[13] >> 32); \ + } \ + } \ + __m128i* ptr0 #define CN_STEP1(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1) \ - __m128i cx; \ - ptr0 = (__m128i *)&l0[idx0 & MASK]; \ - cx = _mm_load_si128(ptr0); \ - if (ALGO == cryptonight_conceal) \ - cryptonight_conceal_tweak(cx, conc_var); \ - if (ALGO == cryptonight_bittube2) \ - { \ - cx = aes_round_bittube2(cx, ax0); \ - } \ - else \ - { \ - if(SOFT_AES) \ - cx = soft_aesenc(cx, ax0); \ - else \ - cx = _mm_aesenc_si128(cx, ax0); \ - } \ + __m128i cx; \ + ptr0 = (__m128i*)&l0[idx0 & MASK]; \ + cx = _mm_load_si128(ptr0); \ + if(ALGO == cryptonight_conceal) \ + cryptonight_conceal_tweak(cx, conc_var); \ + if(ALGO == cryptonight_bittube2) \ + { \ + cx = aes_round_bittube2(cx, ax0); \ + } \ + else \ + { \ + if(SOFT_AES) \ + cx = soft_aesenc(cx, ax0); \ + else \ + cx = _mm_aesenc_si128(cx, ax0); \ + } \ CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) -#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ +#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ if(ALGO == cryptonight_monero || 
ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - cryptonight_monero_tweak((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \ - else \ - _mm_store_si128((__m128i *)ptr0, _mm_xor_si128(bx0, cx)); \ - idx0 = _mm_cvtsi128_si64(cx); \ - \ - ptr0 = (__m128i *)&l0[idx0 & MASK]; \ - if(PREFETCH) \ - _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ - if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_r && ALGO != cryptonight_r_wow && ALGO != cryptonight_v8_reversewaltz) \ - bx0 = cx + cryptonight_monero_tweak((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \ + else \ + _mm_store_si128((__m128i*)ptr0, _mm_xor_si128(bx0, cx)); \ + idx0 = _mm_cvtsi128_si64(cx); \ + \ + ptr0 = (__m128i*)&l0[idx0 & MASK]; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ + if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_r && ALGO != cryptonight_r_wow && ALGO != cryptonight_v8_reversewaltz) \ + bx0 = cx #define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data) \ - uint64_t lo, cl, ch; \ - uint64_t al0 = _mm_cvtsi128_si64(ax0); \ - uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ - cl = ((uint64_t*)ptr0)[0]; \ - ch = ((uint64_t*)ptr0)[1]; \ - CN_R_RANDOM_MATH(n, al0, ah0, cl, bx0, bx1, cn_r_data); \ - CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ - { \ - uint64_t hi; \ - lo = _umul128(idx0, cl, &hi); \ - if(ALGO == cryptonight_r) \ - { \ - CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx); \ - } \ - else \ - { \ - CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \ - } \ - ah0 += lo; \ - al0 += hi; \ - } \ - if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow || ALGO == cryptonight_v8_reversewaltz) \ - { \ - bx1 = bx0; \ - bx0 = cx; \ - } \ - ((uint64_t*)ptr0)[0] = al0; \ - if(PREFETCH) \ - _mm_prefetch((const char*)ptr0, _MM_HINT_T0) - 
-#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - { \ - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \ - ((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \ - else \ - ((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \ - } \ - else \ - ((uint64_t*)ptr0)[1] = ah0; \ - al0 ^= cl; \ - ah0 ^= ch; \ - ax0 = _mm_set_epi64x(ah0, al0); \ + uint64_t lo, cl, ch; \ + uint64_t al0 = _mm_cvtsi128_si64(ax0); \ + uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ + cl = ((uint64_t*)ptr0)[0]; \ + ch = ((uint64_t*)ptr0)[1]; \ + CN_R_RANDOM_MATH(n, al0, ah0, cl, bx0, bx1, cn_r_data); \ + CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ + { \ + uint64_t hi; \ + lo = _umul128(idx0, cl, &hi); \ + if(ALGO == cryptonight_r) \ + { \ + CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx); \ + } \ + else \ + { \ + CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \ + } \ + ah0 += lo; \ + al0 += hi; \ + } \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow || ALGO == cryptonight_v8_reversewaltz) \ + { \ + bx1 = bx0; \ + bx0 = cx; \ + } \ + ((uint64_t*)ptr0)[0] = al0; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr0, _MM_HINT_T0) + +#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ + if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ + { \ + if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \ + ((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \ + else \ + ((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \ + } \ + else \ + ((uint64_t*)ptr0)[1] = ah0; \ + al0 ^= cl; \ + ah0 ^= ch; \ + ax0 = 
_mm_set_epi64x(ah0, al0); \ idx0 = al0; -#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \ - { \ - ptr0 = (__m128i *)&l0[idx0 & MASK]; \ - int64_t u = ((int64_t*)ptr0)[0]; \ - int32_t d = ((int32_t*)ptr0)[2]; \ - int64_t q = u / (d | 0x5); \ - \ - ((int64_t*)ptr0)[0] = u ^ q; \ - idx0 = d ^ q; \ - } \ +#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ + if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \ + { \ + ptr0 = (__m128i*)&l0[idx0 & MASK]; \ + int64_t u = ((int64_t*)ptr0)[0]; \ + int32_t d = ((int32_t*)ptr0)[2]; \ + int64_t q = u / (d | 0x5); \ + \ + ((int64_t*)ptr0)[0] = u ^ q; \ + idx0 = d ^ q; \ + } \ else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast) \ - { \ - ptr0 = (__m128i *)&l0[idx0 & MASK]; \ - int64_t u = ((int64_t*)ptr0)[0]; \ - int32_t d = ((int32_t*)ptr0)[2]; \ - int64_t q = u / (d | 0x5); \ - \ - ((int64_t*)ptr0)[0] = u ^ q; \ - idx0 = (~d) ^ q; \ + { \ + ptr0 = (__m128i*)&l0[idx0 & MASK]; \ + int64_t u = ((int64_t*)ptr0)[0]; \ + int32_t d = ((int32_t*)ptr0)[2]; \ + int64_t q = u / (d | 0x5); \ + \ + ((int64_t*)ptr0)[0] = u ^ q; \ + idx0 = (~d) ^ q; \ } -#define CN_FINALIZE(n) \ - /* Optim - 90% time boundary */ \ +#define CN_FINALIZE(n) \ + /* Optim - 90% time boundary */ \ cn_implode_scratchpad((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state, algo); \ - /* Optim - 99% time boundary */ \ - keccakf((uint64_t*)ctx[n]->hash_state, 24); \ + /* Optim - 99% time boundary */ \ + keccakf((uint64_t*)ctx[n]->hash_state, 24); \ extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n) //! defer the evaluation of an macro #ifndef _MSC_VER -# define CN_DEFER(...) __VA_ARGS__ +#define CN_DEFER(...) __VA_ARGS__ #else -# define CN_EMPTY(...) -# define CN_DEFER(...) __VA_ARGS__ CN_EMPTY() +#define CN_EMPTY(...) +#define CN_DEFER(...) __VA_ARGS__ CN_EMPTY() #endif //! 
execute the macro f with the passed arguments -#define CN_EXEC(f,...) CN_DEFER(f)(__VA_ARGS__) +#define CN_EXEC(f, ...) \ + CN_DEFER(f) \ + (__VA_ARGS__) /** add append n to all arguments and keeps n as first argument * @@ -904,22 +908,22 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) * @endcode */ #define CN_ENUM_0(n, ...) n -#define CN_ENUM_1(n, x1) n, x1 ## n -#define CN_ENUM_2(n, x1, x2) n, x1 ## n, x2 ## n -#define CN_ENUM_3(n, x1, x2, x3) n, x1 ## n, x2 ## n, x3 ## n -#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n -#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n -#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n -#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n -#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n -#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n -#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n -#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n -#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n -#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n -#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 
## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n -#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n -#define CN_ENUM_16(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n, x16 ## n +#define CN_ENUM_1(n, x1) n, x1##n +#define CN_ENUM_2(n, x1, x2) n, x1##n, x2##n +#define CN_ENUM_3(n, x1, x2, x3) n, x1##n, x2##n, x3##n +#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1##n, x2##n, x3##n, x4##n +#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1##n, x2##n, x3##n, x4##n, x5##n +#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n +#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n +#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n +#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n +#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n +#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n +#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n +#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n +#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, 
x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n +#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n, x15##n +#define CN_ENUM_16(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n, x15##n, x16##n /** repeat a macro call multiple times * @@ -933,21 +937,35 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) * f(0, foo0, bar); f(1, foo1, bar1) * @endcode */ -#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)) -#define REPEAT_2(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)) -#define REPEAT_3(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)) -#define REPEAT_4(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)) -#define REPEAT_5(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(4, __VA_ARGS__)) - -template< size_t N> +#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)) +#define REPEAT_2(n, f, ...) \ + CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)) +#define REPEAT_3(n, f, ...) \ + CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__)) +#define REPEAT_4(n, f, ...) 
\ + CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(3, __VA_ARGS__)) +#define REPEAT_5(n, f, ...) \ + CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(3, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(4, __VA_ARGS__)) + +template struct Cryptonight_hash; -template< > +template <> struct Cryptonight_hash<1> { static constexpr size_t N = 1; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); @@ -971,12 +989,12 @@ struct Cryptonight_hash<1> } }; -template< > +template <> struct Cryptonight_hash<2> { static constexpr size_t N = 2; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); @@ -1000,12 +1018,12 @@ struct Cryptonight_hash<2> } }; -template< > +template <> struct Cryptonight_hash<3> { static constexpr size_t N = 3; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); @@ -1029,12 +1047,12 @@ struct Cryptonight_hash<3> } }; -template< > +template <> struct Cryptonight_hash<4> { static constexpr size_t N = 4; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); @@ -1058,12 +1076,12 @@ struct Cryptonight_hash<4> } }; -template< > +template <> struct Cryptonight_hash<5> { static constexpr size_t N = 5; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); @@ -1087,26 +1105,25 @@ 
struct Cryptonight_hash<5> } }; -extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); -extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); - -template< size_t N, size_t asm_version> +template struct Cryptonight_hash_asm { - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { for(size_t i = 0; i < N; ++i) { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + keccak((const uint8_t*)input + len * i, len, ctx[i]->hash_state, 200); cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo); } if(ALGO == cryptonight_r) { // API ATTRIBUTE is only required for cryptonight_r - typedef void ABI_ATTRIBUTE (*cn_r_mainloop_fun)(cryptonight_ctx *ctx); + typedef void ABI_ATTRIBUTE (*cn_r_mainloop_fun)(cryptonight_ctx * ctx); for(size_t i = 0; i < N; ++i) reinterpret_cast(ctx[0]->loop_fn)(ctx[i]); // use always loop_fn from ctx[0]!! 
} @@ -1126,19 +1143,19 @@ struct Cryptonight_hash_asm }; // double hash with specialized asm only for intel -template< > +template <> struct Cryptonight_hash_asm<2, 0> { static constexpr size_t N = 2; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const size_t MEM = algo.Mem(); for(size_t i = 0; i < N; ++i) { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + keccak((const uint8_t*)input + len * i, len, ctx[i]->hash_state, 200); /* Optim - 99% time boundary */ cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo); } @@ -1167,89 +1184,90 @@ struct Cryptonight_hash_asm<2, 0> namespace { -template +template static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask) { - const uint8_t* p = reinterpret_cast(src); - - // Workaround for Visual Studio placing trampoline in debug builds. -# if defined(_MSC_VER) - if (p[0] == 0xE9) { - p += *(int32_t*)(p + 1) + 5; - } -# endif - - size_t size = 0; - while (*(uint32_t*)(p + size) != 0xDEADC0DE) { - ++size; - } - size += sizeof(uint32_t); - - memcpy((void*) dst, (const void*) src, size); - - uint8_t* patched_data = reinterpret_cast(dst); - for (size_t i = 0; i + sizeof(uint32_t) <= size; ++i) { - switch (*(uint32_t*)(patched_data + i)) { - case CN_ITER: - *(uint32_t*)(patched_data + i) = iterations; - break; - - case CN_MASK: - *(uint32_t*)(patched_data + i) = mask; - break; - } - } -} + const uint8_t* p = reinterpret_cast(src); + + // Workaround for Visual Studio placing trampoline in debug builds. 
+#if defined(_MSC_VER) + if(p[0] == 0xE9) + { + p += *(int32_t*)(p + 1) + 5; + } +#endif + + size_t size = 0; + while(*(uint32_t*)(p + size) != 0xDEADC0DE) + { + ++size; + } + size += sizeof(uint32_t); + + memcpy((void*)dst, (const void*)src, size); + + uint8_t* patched_data = reinterpret_cast(dst); + for(size_t i = 0; i + sizeof(uint32_t) <= size; ++i) + { + switch(*(uint32_t*)(patched_data + i)) + { + case CN_ITER: + *(uint32_t*)(patched_data + i) = iterations; + break; + case CN_MASK: + *(uint32_t*)(patched_data + i) = mask; + break; + } + } +} void* allocateExecutableMemory(size_t size) { #ifdef _WIN64 -return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); + return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); +#else +#if defined(__APPLE__) + return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); #else -# if defined(__APPLE__) - return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); -# else - return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); -# endif + return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +#endif #endif } - -void protectExecutableMemory(void *p, size_t size) +void protectExecutableMemory(void* p, size_t size) { #ifdef _WIN64 - DWORD oldProtect; - VirtualProtect(p, size, PAGE_EXECUTE_READ, &oldProtect); + DWORD oldProtect; + VirtualProtect(p, size, PAGE_EXECUTE_READ, &oldProtect); #else - mprotect(p, size, PROT_READ | PROT_EXEC); + mprotect(p, size, PROT_READ | PROT_EXEC); #endif } -void unprotectExecutableMemory(void *p, size_t size) +void unprotectExecutableMemory(void* p, size_t size) { #ifdef _WIN64 - DWORD oldProtect; - VirtualProtect(p, size, PAGE_EXECUTE_READWRITE, &oldProtect); + DWORD oldProtect; + VirtualProtect(p, size, PAGE_EXECUTE_READWRITE, &oldProtect); #else - mprotect(p, size, PROT_WRITE | PROT_EXEC); + mprotect(p, size, 
PROT_WRITE | PROT_EXEC); #endif } - -void flushInstructionCache(void *p, size_t size) +void flushInstructionCache(void* p, size_t size) { #ifdef _WIN64 - ::FlushInstructionCache(GetCurrentProcess(), p, size); + ::FlushInstructionCache(GetCurrentProcess(), p, size); #else -# ifndef __FreeBSD__ - __builtin___clear_cache(reinterpret_cast(p), reinterpret_cast(p) + size); -# endif +#ifndef __FreeBSD__ + __builtin___clear_cache(reinterpret_cast(p), reinterpret_cast(p) + size); +#endif #endif } -template +template void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t Iter = algo.Iter(); @@ -1270,7 +1288,8 @@ void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmr if(N == 2) src_code = reinterpret_cast(cryptonight_v8_double_mainloop_sandybridge_asm); else - src_code = cryptonight_v8_mainloop_ivybridge_asm;; + src_code = cryptonight_v8_mainloop_ivybridge_asm; + ; } // supports only 1 thread per hash if(selected_asm == "amd_avx") @@ -1295,19 +1314,17 @@ void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmr flushInstructionCache(ctx[0]->fun_data, allocation_size); } } -} // namespace (anonymous) - - +} // namespace struct Cryptonight_hash_gpu { static constexpr size_t N = 1; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { set_float_rounding_mode_nearest(); - keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + keccak((const uint8_t*)input, len, ctx[0]->hash_state, 200); cn_explode_scratchpad_gpu(ctx[0]->hash_state, ctx[0]->long_state, algo); if(cngpu_check_avx2()) @@ -1321,16 +1338,15 @@ struct Cryptonight_hash_gpu } }; -template +template struct Cryptonight_R_generator { - template + template static void cn_on_new_job(const xmrstak::miner_work& work, cryptonight_ctx** ctx) { if(ctx[0]->cn_r_ctx.height == work.iBlockHeight && ctx[0]->last_algo == POW(cryptonight_r) && - 
reinterpret_cast(ctx[0]->hash_fn) == ctx[0]->fun_data - ) + reinterpret_cast(ctx[0]->hash_fn) == ctx[0]->fun_data) return; ctx[0]->last_algo = POW(cryptonight_r); @@ -1346,7 +1362,7 @@ struct Cryptonight_R_generator ctx[0]->hash_fn = Cryptonight_hash_asm::template hash; } - for(size_t i=1; i < N; i++) + for(size_t i = 1; i < N; i++) { ctx[i]->cn_r_ctx = ctx[0]->cn_r_ctx; ctx[i]->loop_fn = ctx[0]->loop_fn; diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp index a9d1c96fd..e35c7c7b8 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp +++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp @@ -23,19 +23,19 @@ extern "C" { -#include "c_groestl.h" #include "c_blake256.h" +#include "c_groestl.h" #include "c_jh.h" #include "c_skein.h" } -#include "xmrstak/backend/cryptonight.hpp" #include "cryptonight.h" #include "cryptonight_aesni.h" -#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/console.hpp" +#include #include #include -#include #ifdef __GNUC__ #include @@ -49,30 +49,35 @@ extern "C" #ifdef _WIN32 #include +// this comment avoid that clang format reorders the includes #include #else -#include #include #include +#include #endif // _WIN32 -void do_blake_hash(const void* input, uint32_t len, char* output) { +void do_blake_hash(const void* input, uint32_t len, char* output) +{ blake256_hash((uint8_t*)output, (const uint8_t*)input, len); } -void do_groestl_hash(const void* input, uint32_t len, char* output) { +void do_groestl_hash(const void* input, uint32_t len, char* output) +{ groestl((const uint8_t*)input, len * 8, (uint8_t*)output); } -void do_jh_hash(const void* input, uint32_t len, char* output) { +void do_jh_hash(const void* input, uint32_t len, char* output) +{ jh_hash(32 * 8, (const uint8_t*)input, 8 * len, (uint8_t*)output); } -void do_skein_hash(const void* input, uint32_t len, char* output) { +void 
do_skein_hash(const void* input, uint32_t len, char* output) +{ skein_hash(8 * 32, (const uint8_t*)input, 8 * len, (uint8_t*)output); } -void (* const extra_hashes[4])(const void *, uint32_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; +void (*const extra_hashes[4])(const void*, uint32_t, char*) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; #ifdef _WIN32 #include "xmrstak/misc/uac.hpp" @@ -81,21 +86,21 @@ BOOL bRebootDesirable = FALSE; //If VirtualAlloc fails, suggest a reboot BOOL AddPrivilege(TCHAR* pszPrivilege) { - HANDLE hToken; + HANDLE hToken; TOKEN_PRIVILEGES tp; - BOOL status; + BOOL status; - if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) + if(!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) return FALSE; - if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) + if(!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) return FALSE; tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); - if (!status || (GetLastError() != ERROR_SUCCESS)) + if(!status || (GetLastError() != ERROR_SUCCESS)) return FALSE; CloseHandle(hToken); @@ -107,19 +112,19 @@ BOOL AddLargePageRights() HANDLE hToken; PTOKEN_USER user = NULL; - if (OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken) == TRUE) + if(OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken) == TRUE) { TOKEN_ELEVATION Elevation; DWORD cbSize = sizeof(TOKEN_ELEVATION); BOOL bIsElevated = FALSE; - if (GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize)) + if(GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize)) bIsElevated = Elevation.TokenIsElevated; DWORD size = 0; GetTokenInformation(hToken, TokenUser, NULL, 0, &size); - if (size > 0 && bIsElevated) + if(size > 0 && bIsElevated) { 
user = (PTOKEN_USER)LocalAlloc(LPTR, size); GetTokenInformation(hToken, TokenUser, user, size, &size); @@ -128,7 +133,7 @@ BOOL AddLargePageRights() CloseHandle(hToken); } - if (!user) + if(!user) return FALSE; LSA_HANDLE handle; @@ -136,7 +141,7 @@ BOOL AddLargePageRights() ZeroMemory(&attributes, sizeof(attributes)); BOOL result = FALSE; - if (LsaOpenPolicy(NULL, &attributes, POLICY_ALL_ACCESS, &handle) == 0) + if(LsaOpenPolicy(NULL, &attributes, POLICY_ALL_ACCESS, &handle) == 0) { LSA_UNICODE_STRING lockmem; lockmem.Buffer = L"SeLockMemoryPrivilege"; @@ -146,11 +151,11 @@ BOOL AddLargePageRights() PLSA_UNICODE_STRING rights = NULL; ULONG cnt = 0; BOOL bHasRights = FALSE; - if (LsaEnumerateAccountRights(handle, user->User.Sid, &rights, &cnt) == 0) + if(LsaEnumerateAccountRights(handle, user->User.Sid, &rights, &cnt) == 0) { - for (size_t i = 0; i < cnt; i++) + for(size_t i = 0; i < cnt; i++) { - if (rights[i].Length == lockmem.Length && + if(rights[i].Length == lockmem.Length && memcmp(rights[i].Buffer, lockmem.Buffer, 42) == 0) { bHasRights = TRUE; @@ -220,7 +225,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al ptr->ctx_info[0] = 0; ptr->ctx_info[1] = 0; if(ptr->long_state == NULL) - printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte",std::to_string(hashMemSize).c_str()); + printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte", std::to_string(hashMemSize).c_str()); return ptr; } @@ -250,7 +255,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al #else //http://man7.org/linux/man-pages/man2/mmap.2.html #if defined(__APPLE__) - ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, + ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); #elif defined(__FreeBSD__) ptr->long_state = (uint8_t*)mmap(NULL, 
hashMemSize, PROT_READ | PROT_WRITE, @@ -261,7 +266,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al #else ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); - if (ptr->long_state == MAP_FAILED) + if(ptr->long_state == MAP_FAILED) { // try without MAP_HUGETLB for crappy kernels msg->warning = "mmap with HUGETLB failed, attempting without it (you should fix your kernel)"; @@ -270,7 +275,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al } #endif - if (ptr->long_state == MAP_FAILED) + if(ptr->long_state == MAP_FAILED) { _mm_free(ptr); msg->warning = "mmap failed, check attribute 'use_slow_memory' in 'config.txt'"; @@ -279,7 +284,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al ptr->ctx_info[0] = 1; - if(madvise(ptr->long_state, hashMemSize, MADV_RANDOM|MADV_WILLNEED) != 0) + if(madvise(ptr->long_state, hashMemSize, MADV_RANDOM | MADV_WILLNEED) != 0) msg->warning = "madvise failed"; ptr->ctx_info[1] = 0; diff --git a/xmrstak/backend/cpu/crypto/groestl_tables.h b/xmrstak/backend/cpu/crypto/groestl_tables.h index a23295c35..85dd25f3d 100644 --- a/xmrstak/backend/cpu/crypto/groestl_tables.h +++ b/xmrstak/backend/cpu/crypto/groestl_tables.h @@ -1,38 +1,6 @@ #ifndef __tables_h #define __tables_h - -const uint32_t T[512] = {0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc -, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5 -, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 
0xfb0bed1d -, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded -, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1 -, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441 -, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4 -, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba -, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616 -, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2 -, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c -, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de -, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7 -, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 
0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e -, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c -, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7 -, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b -, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4 -, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e -, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a -, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37 -, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86 -, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b -, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 
0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028 -, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3 -, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94 -, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836 -, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0 -, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2 -, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e -, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3 -, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e}; +const uint32_t T[512] = {0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 
0x9ab559ec, 0xec9ac3b5, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 
0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 
0x9cd2236d, 0xe03b7249, 0x49e0923b, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e}; #endif /* __tables_h */ diff --git a/xmrstak/backend/cpu/crypto/hash.h b/xmrstak/backend/cpu/crypto/hash.h index 2af330932..574581376 100644 --- a/xmrstak/backend/cpu/crypto/hash.h +++ b/xmrstak/backend/cpu/crypto/hash.h @@ -4,4 +4,9 @@ 
typedef unsigned char BitSequence; typedef uint32_t DataLength; -typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2} HashReturn; +typedef enum +{ + SUCCESS = 0, + FAIL = 1, + BAD_HASHLEN = 2 +} HashReturn; diff --git a/xmrstak/backend/cpu/crypto/int-util.h b/xmrstak/backend/cpu/crypto/int-util.h index 8748976c1..393b4f3d2 100644 --- a/xmrstak/backend/cpu/crypto/int-util.h +++ b/xmrstak/backend/cpu/crypto/int-util.h @@ -12,43 +12,51 @@ #if defined(_MSC_VER) #include -static inline uint32_t rol32(uint32_t x, int r) { +static inline uint32_t rol32(uint32_t x, int r) +{ static_assert(sizeof(uint32_t) == sizeof(unsigned int), "this code assumes 32-bit integers"); return _rotl(x, r); } -static inline uint64_t rol64(uint64_t x, int r) { +static inline uint64_t rol64(uint64_t x, int r) +{ return _rotl64(x, r); } #else -static inline uint32_t rol32(uint32_t x, int r) { +static inline uint32_t rol32(uint32_t x, int r) +{ return (x << (r & 31)) | (x >> (-r & 31)); } -static inline uint64_t rol64(uint64_t x, int r) { +static inline uint64_t rol64(uint64_t x, int r) +{ return (x << (r & 63)) | (x >> (-r & 63)); } #endif -static inline uint64_t hi_dword(uint64_t val) { +static inline uint64_t hi_dword(uint64_t val) +{ return val >> 32; } -static inline uint64_t lo_dword(uint64_t val) { +static inline uint64_t lo_dword(uint64_t val) +{ return val & 0xFFFFFFFF; } -static inline uint64_t div_with_reminder(uint64_t dividend, uint32_t divisor, uint32_t* remainder) { +static inline uint64_t div_with_reminder(uint64_t dividend, uint32_t divisor, uint32_t* remainder) +{ dividend |= ((uint64_t)*remainder) << 32; *remainder = dividend % divisor; return dividend / divisor; } // Long division with 2^32 base -static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uint32_t divisor, uint64_t* quotient_hi, uint64_t* quotient_lo) { +static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uint32_t divisor, uint64_t* quotient_hi, uint64_t* quotient_lo) 
+{ uint64_t dividend_dwords[4]; uint32_t remainder = 0; @@ -65,30 +73,35 @@ static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uin return remainder; } -#define IDENT32(x) ((uint32_t) (x)) -#define IDENT64(x) ((uint64_t) (x)) +#define IDENT32(x) ((uint32_t)(x)) +#define IDENT64(x) ((uint64_t)(x)) -#define SWAP32(x) ((((uint32_t) (x) & 0x000000ff) << 24) | \ - (((uint32_t) (x) & 0x0000ff00) << 8) | \ - (((uint32_t) (x) & 0x00ff0000) >> 8) | \ - (((uint32_t) (x) & 0xff000000) >> 24)) -#define SWAP64(x) ((((uint64_t) (x) & 0x00000000000000ff) << 56) | \ - (((uint64_t) (x) & 0x000000000000ff00) << 40) | \ - (((uint64_t) (x) & 0x0000000000ff0000) << 24) | \ - (((uint64_t) (x) & 0x00000000ff000000) << 8) | \ - (((uint64_t) (x) & 0x000000ff00000000) >> 8) | \ - (((uint64_t) (x) & 0x0000ff0000000000) >> 24) | \ - (((uint64_t) (x) & 0x00ff000000000000) >> 40) | \ - (((uint64_t) (x) & 0xff00000000000000) >> 56)) +#define SWAP32(x) ((((uint32_t)(x)&0x000000ff) << 24) | \ + (((uint32_t)(x)&0x0000ff00) << 8) | \ + (((uint32_t)(x)&0x00ff0000) >> 8) | \ + (((uint32_t)(x)&0xff000000) >> 24)) +#define SWAP64(x) ((((uint64_t)(x)&0x00000000000000ff) << 56) | \ + (((uint64_t)(x)&0x000000000000ff00) << 40) | \ + (((uint64_t)(x)&0x0000000000ff0000) << 24) | \ + (((uint64_t)(x)&0x00000000ff000000) << 8) | \ + (((uint64_t)(x)&0x000000ff00000000) >> 8) | \ + (((uint64_t)(x)&0x0000ff0000000000) >> 24) | \ + (((uint64_t)(x)&0x00ff000000000000) >> 40) | \ + (((uint64_t)(x)&0xff00000000000000) >> 56)) -static inline uint32_t ident32(uint32_t x) { return x; } +static inline uint32_t ident32(uint32_t x) +{ + return x; +} static inline uint64_t ident64(uint64_t x) { return x; } -static inline uint32_t swap32(uint32_t x) { +static inline uint32_t swap32(uint32_t x) +{ x = ((x & 0x00ff00ff) << 8) | ((x & 0xff00ff00) >> 8); return (x << 16) | (x >> 16); } -static inline uint64_t swap64(uint64_t x) { +static inline uint64_t swap64(uint64_t x) +{ x = ((x & 0x00ff00ff00ff00ff) 
<< 8) | ((x & 0xff00ff00ff00ff00) >> 8); x = ((x & 0x0000ffff0000ffff) << 16) | ((x & 0xffff0000ffff0000) >> 16); return (x << 32) | (x >> 32); @@ -99,39 +112,51 @@ static inline uint64_t swap64(uint64_t x) { #else #define UNUSED #endif -static inline void mem_inplace_ident(void *mem UNUSED, size_t n UNUSED) { } +static inline void mem_inplace_ident(void* mem UNUSED, size_t n UNUSED) +{ +} #undef UNUSED -static inline void mem_inplace_swap32(void *mem, size_t n) { +static inline void mem_inplace_swap32(void* mem, size_t n) +{ size_t i; - for (i = 0; i < n; i++) { - ((uint32_t *)mem)[i] = swap32(((const uint32_t *)mem)[i]); + for(i = 0; i < n; i++) + { + ((uint32_t*)mem)[i] = swap32(((const uint32_t*)mem)[i]); } } -static inline void mem_inplace_swap64(void *mem, size_t n) { +static inline void mem_inplace_swap64(void* mem, size_t n) +{ size_t i; - for (i = 0; i < n; i++) { - ((uint64_t *)mem)[i] = swap64(((const uint64_t *)mem)[i]); + for(i = 0; i < n; i++) + { + ((uint64_t*)mem)[i] = swap64(((const uint64_t*)mem)[i]); } } -static inline void memcpy_ident32(void *dst, const void *src, size_t n) { +static inline void memcpy_ident32(void* dst, const void* src, size_t n) +{ memcpy(dst, src, 4 * n); } -static inline void memcpy_ident64(void *dst, const void *src, size_t n) { +static inline void memcpy_ident64(void* dst, const void* src, size_t n) +{ memcpy(dst, src, 8 * n); } -static inline void memcpy_swap32(void *dst, const void *src, size_t n) { +static inline void memcpy_swap32(void* dst, const void* src, size_t n) +{ size_t i; - for (i = 0; i < n; i++) { - ((uint32_t *)dst)[i] = swap32(((const uint32_t *)src)[i]); + for(i = 0; i < n; i++) + { + ((uint32_t*)dst)[i] = swap32(((const uint32_t*)src)[i]); } } -static inline void memcpy_swap64(void *dst, const void *src, size_t n) { +static inline void memcpy_swap64(void* dst, const void* src, size_t n) +{ size_t i; - for (i = 0; i < n; i++) { - ((uint64_t *)dst)[i] = swap64(((const uint64_t *)src)[i]); + for(i = 0; i < 
n; i++) + { + ((uint64_t*)dst)[i] = swap64(((const uint64_t*)src)[i]); } } diff --git a/xmrstak/backend/cpu/crypto/skein_port.h b/xmrstak/backend/cpu/crypto/skein_port.h index 99641bcdf..1648cdc7d 100644 --- a/xmrstak/backend/cpu/crypto/skein_port.h +++ b/xmrstak/backend/cpu/crypto/skein_port.h @@ -2,38 +2,38 @@ #define _SKEIN_PORT_H_ #include -#include #include +#include #ifndef RETURN_VALUES -# define RETURN_VALUES -# if defined( DLL_EXPORT ) -# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) -# define VOID_RETURN __declspec( dllexport ) void __stdcall -# define INT_RETURN __declspec( dllexport ) int __stdcall -# elif defined( __GNUC__ ) -# define VOID_RETURN __declspec( __dllexport__ ) void -# define INT_RETURN __declspec( __dllexport__ ) int -# else -# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers -# endif -# elif defined( DLL_IMPORT ) -# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) -# define VOID_RETURN __declspec( dllimport ) void __stdcall -# define INT_RETURN __declspec( dllimport ) int __stdcall -# elif defined( __GNUC__ ) -# define VOID_RETURN __declspec( __dllimport__ ) void -# define INT_RETURN __declspec( __dllimport__ ) int -# else -# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers -# endif -# elif defined( __WATCOMC__ ) -# define VOID_RETURN void __cdecl -# define INT_RETURN int __cdecl -# else -# define VOID_RETURN void -# define INT_RETURN int -# endif +#define RETURN_VALUES +#if defined(DLL_EXPORT) +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) +#define VOID_RETURN __declspec(dllexport) void __stdcall +#define INT_RETURN __declspec(dllexport) int __stdcall +#elif defined(__GNUC__) +#define VOID_RETURN __declspec(__dllexport__) void +#define INT_RETURN __declspec(__dllexport__) int +#else +#error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +#endif +#elif defined(DLL_IMPORT) +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) 
+#define VOID_RETURN __declspec(dllimport) void __stdcall +#define INT_RETURN __declspec(dllimport) int __stdcall +#elif defined(__GNUC__) +#define VOID_RETURN __declspec(__dllimport__) void +#define INT_RETURN __declspec(__dllimport__) int +#else +#error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +#endif +#elif defined(__WATCOMC__) +#define VOID_RETURN void __cdecl +#define INT_RETURN int __cdecl +#else +#define VOID_RETURN void +#define INT_RETURN int +#endif #endif /* These defines are used to declare buffers in a way that allows @@ -52,17 +52,17 @@ variable of length 'size' bits */ -#define ui_type(size) uint##size##_t -#define dec_unit_type(size,x) typedef ui_type(size) x -#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[bsize / (size >> 3)] -#define ptr_cast(x,size) ((ui_type(size)*)(x)) +#define ui_type(size) uint##size##_t +#define dec_unit_type(size, x) typedef ui_type(size) x +#define dec_bufr_type(size, bsize, x) typedef ui_type(size) x[bsize / (size >> 3)] +#define ptr_cast(x, size) ((ui_type(size)*)(x)) -typedef unsigned int uint_t; /* native unsigned integer */ -typedef uint8_t u08b_t; /* 8-bit unsigned integer */ -typedef uint64_t u64b_t; /* 64-bit unsigned integer */ +typedef unsigned int uint_t; /* native unsigned integer */ +typedef uint8_t u08b_t; /* 8-bit unsigned integer */ +typedef uint64_t u64b_t; /* 64-bit unsigned integer */ #ifndef RotL_64 -#define RotL_64(x,N) (((x) << (N)) | ((x) >> (64-(N)))) +#define RotL_64(x, N) (((x) << (N)) | ((x) >> (64 - (N)))) #endif /* @@ -91,26 +91,25 @@ typedef uint64_t u64b_t; /* 64-bit unsigned integer */ /* special handler for IA64, which may be either endianness (?) 
*/ /* here we assume little-endian, but this may need to be changed */ #if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) -# define PLATFORM_MUST_ALIGN (1) +#define PLATFORM_MUST_ALIGN (1) #ifndef PLATFORM_BYTE_ORDER -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN #endif #endif -#ifndef PLATFORM_MUST_ALIGN -# define PLATFORM_MUST_ALIGN (0) +#ifndef PLATFORM_MUST_ALIGN +#define PLATFORM_MUST_ALIGN (0) #endif - -#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN - /* here for big-endian CPUs */ -#define SKEIN_NEED_SWAP (1) +#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN +/* here for big-endian CPUs */ +#define SKEIN_NEED_SWAP (1) #elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN - /* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */ -#define SKEIN_NEED_SWAP (0) -#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */ -#define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt) -#define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt)) +/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */ +#define SKEIN_NEED_SWAP (0) +#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */ +#define Skein_Put64_LSB_First(dst08, src64, bCnt) memcpy(dst08, src64, bCnt) +#define Skein_Get64_LSB_First(dst64, src08, wCnt) memcpy(dst64, src08, 8 * (wCnt)) #endif #else #error "Skein needs endianness setting!" @@ -123,57 +122,55 @@ typedef uint64_t u64b_t; /* 64-bit unsigned integer */ * Provide any definitions still needed. 
****************************************************************** */ -#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */ -#if SKEIN_NEED_SWAP -#define Skein_Swap64(w64) \ - ( (( ((u64b_t)(w64)) & 0xFF) << 56) | \ - (((((u64b_t)(w64)) >> 8) & 0xFF) << 48) | \ - (((((u64b_t)(w64)) >>16) & 0xFF) << 40) | \ - (((((u64b_t)(w64)) >>24) & 0xFF) << 32) | \ - (((((u64b_t)(w64)) >>32) & 0xFF) << 24) | \ - (((((u64b_t)(w64)) >>40) & 0xFF) << 16) | \ - (((((u64b_t)(w64)) >>48) & 0xFF) << 8) | \ - (((((u64b_t)(w64)) >>56) & 0xFF) ) ) +#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */ +#if SKEIN_NEED_SWAP +#define Skein_Swap64(w64) \ + (((((u64b_t)(w64)) & 0xFF) << 56) | \ + (((((u64b_t)(w64)) >> 8) & 0xFF) << 48) | \ + (((((u64b_t)(w64)) >> 16) & 0xFF) << 40) | \ + (((((u64b_t)(w64)) >> 24) & 0xFF) << 32) | \ + (((((u64b_t)(w64)) >> 32) & 0xFF) << 24) | \ + (((((u64b_t)(w64)) >> 40) & 0xFF) << 16) | \ + (((((u64b_t)(w64)) >> 48) & 0xFF) << 8) | \ + (((((u64b_t)(w64)) >> 56) & 0xFF))) #else -#define Skein_Swap64(w64) (w64) +#define Skein_Swap64(w64) (w64) #endif -#endif /* ifndef Skein_Swap64 */ - +#endif /* ifndef Skein_Swap64 */ #ifndef Skein_Put64_LSB_First -void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt) -#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ - { /* this version is fully portable (big-endian or little-endian), but slow */ +void Skein_Put64_LSB_First(u08b_t* dst, const u64b_t* src, size_t bCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here? 
*/ +{ /* this version is fully portable (big-endian or little-endian), but slow */ size_t n; - for (n=0;n>3] >> (8*(n&7))); - } + for(n = 0; n < bCnt; n++) + dst[n] = (u08b_t)(src[n >> 3] >> (8 * (n & 7))); +} #else - ; /* output only the function prototype */ + ; /* output only the function prototype */ #endif -#endif /* ifndef Skein_Put64_LSB_First */ - +#endif /* ifndef Skein_Put64_LSB_First */ #ifndef Skein_Get64_LSB_First -void Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt) -#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ - { /* this version is fully portable (big-endian or little-endian), but slow */ +void Skein_Get64_LSB_First(u64b_t* dst, const u08b_t* src, size_t wCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ +{ /* this version is fully portable (big-endian or little-endian), but slow */ size_t n; - for (n=0;n<8*wCnt;n+=8) - dst[n/8] = (((u64b_t) src[n ]) ) + - (((u64b_t) src[n+1]) << 8) + - (((u64b_t) src[n+2]) << 16) + - (((u64b_t) src[n+3]) << 24) + - (((u64b_t) src[n+4]) << 32) + - (((u64b_t) src[n+5]) << 40) + - (((u64b_t) src[n+6]) << 48) + - (((u64b_t) src[n+7]) << 56) ; - } + for(n = 0; n < 8 * wCnt; n += 8) + dst[n / 8] = (((u64b_t)src[n])) + + (((u64b_t)src[n + 1]) << 8) + + (((u64b_t)src[n + 2]) << 16) + + (((u64b_t)src[n + 3]) << 24) + + (((u64b_t)src[n + 4]) << 32) + + (((u64b_t)src[n + 5]) << 40) + + (((u64b_t)src[n + 6]) << 48) + + (((u64b_t)src[n + 7]) << 56); +} #else - ; /* output only the function prototype */ + ; /* output only the function prototype */ #endif -#endif /* ifndef Skein_Get64_LSB_First */ +#endif /* ifndef Skein_Get64_LSB_First */ -#endif /* ifndef _SKEIN_PORT_H_ */ +#endif /* ifndef _SKEIN_PORT_H_ */ diff --git a/xmrstak/backend/cpu/crypto/soft_aes.hpp b/xmrstak/backend/cpu/crypto/soft_aes.hpp index 9b4ae0ab5..3ea75c5e6 100644 --- a/xmrstak/backend/cpu/crypto/soft_aes.hpp +++ b/xmrstak/backend/cpu/crypto/soft_aes.hpp @@ -34,56 +34,58 @@ #include -#define 
saes_data(w) {\ - w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ - w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ - w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ - w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ - w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ - w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ - w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ - w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ - w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ - w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ - w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ - w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ - w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ - w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ - w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ - w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ - w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ - w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ - w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ - w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ - w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ - w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ - w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ - w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ - w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ - w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ - w(0x70), w(0x3e), w(0xb5), 
w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ - w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ - w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ - w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ - w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ - w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } +#define saes_data(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5), \ + w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76), \ + w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0), \ + w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0), \ + w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), \ + w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), \ + w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75), \ + w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0), \ + w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84), \ + w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b), \ + w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), \ + w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), \ + w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5), \ + w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2), \ + w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17), \ + w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73), \ + w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), \ + w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), \ + w(0xe0), w(0x32), w(0x3a), 
w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79), \ + w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9), \ + w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08), \ + w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6), \ + w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), \ + w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), \ + w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94), \ + w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf), \ + w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68), \ + w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } -#define SAES_WPOLY 0x011b +#define SAES_WPOLY 0x011b #define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \ - ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0)) + ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0)) -#define saes_f2(x) ((x<<1) ^ (((x>>7) & 1) * SAES_WPOLY)) -#define saes_f3(x) (saes_f2(x) ^ x) -#define saes_h0(x) (x) +#define saes_f2(x) ((x << 1) ^ (((x >> 7) & 1) * SAES_WPOLY)) +#define saes_f3(x) (saes_f2(x) ^ x) +#define saes_h0(x) (x) -#define saes_u0(p) saes_b2w(saes_f2(p), p, p, saes_f3(p)) -#define saes_u1(p) saes_b2w(saes_f3(p), saes_f2(p), p, p) -#define saes_u2(p) saes_b2w( p, saes_f3(p), saes_f2(p), p) -#define saes_u3(p) saes_b2w( p, p, saes_f3(p), saes_f2(p)) +#define saes_u0(p) saes_b2w(saes_f2(p), p, p, saes_f3(p)) +#define saes_u1(p) saes_b2w(saes_f3(p), saes_f2(p), p, p) +#define saes_u2(p) saes_b2w(p, saes_f3(p), saes_f2(p), p) +#define saes_u3(p) saes_b2w(p, p, saes_f3(p), saes_f2(p)) -alignas(16) const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) }; -alignas(16) const uint8_t 
saes_sbox[256] = saes_data(saes_h0); +alignas(16) const uint32_t saes_table[4][256] = {saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3)}; +alignas(16) const uint8_t saes_sbox[256] = saes_data(saes_h0); static inline __m128i soft_aesenc(__m128i in, __m128i key) { @@ -104,10 +106,10 @@ static inline __m128i soft_aesenc(__m128i in, __m128i key) static inline uint32_t sub_word(uint32_t key) { - return (saes_sbox[key >> 24 ] << 24) | - (saes_sbox[(key >> 16) & 0xff] << 16 ) | - (saes_sbox[(key >> 8) & 0xff] << 8 ) | - saes_sbox[key & 0xff]; + return (saes_sbox[key >> 24] << 24) | + (saes_sbox[(key >> 16) & 0xff] << 16) | + (saes_sbox[(key >> 8) & 0xff] << 8) | + saes_sbox[key & 0xff]; } #ifdef __clang__ @@ -121,5 +123,5 @@ static inline __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon) { uint32_t X1 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55))); uint32_t X3 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF))); - return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3,_rotr(X1, 8) ^ rcon, X1); + return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3, _rotr(X1, 8) ^ rcon, X1); } diff --git a/xmrstak/backend/cpu/crypto/variant4_random_math.h b/xmrstak/backend/cpu/crypto/variant4_random_math.h index 50228adf2..9fe61db51 100644 --- a/xmrstak/backend/cpu/crypto/variant4_random_math.h +++ b/xmrstak/backend/cpu/crypto/variant4_random_math.h @@ -1,12 +1,12 @@ #pragma once -#include #include "../../cryptonight.hpp" #include "xmrstak/misc/console.hpp" +#include extern "C" { - #include "c_blake256.h" +#include "c_blake256.h" } enum V4_Settings @@ -31,13 +31,13 @@ enum V4_Settings enum V4_InstructionList { - MUL, // a*b - ADD, // a+b + C, C is an unsigned 32-bit constant - SUB, // a-b - ROR, // rotate right "a" by "b & 31" bits - ROL, // rotate left "a" by "b & 31" bits - XOR, // a^b - RET, // finish execution + MUL, // a*b + ADD, // a+b + C, C is an unsigned 32-bit constant + SUB, // a-b + ROR, // rotate right "a" by "b & 31" bits + ROL, // 
rotate left "a" by "b & 31" bits + XOR, // a^b + RET, // finish execution V4_INSTRUCTION_COUNT = RET, }; @@ -87,7 +87,7 @@ struct V4_Instruction // every switch-case will point to the same destination on every iteration of Cryptonight main loop // // This is about as fast as it can get without using low-level machine code generation -template +template static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) { enum @@ -95,55 +95,55 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) REG_BITS = sizeof(v4_reg) * 8, }; -#define V4_EXEC(i) \ - { \ - const struct V4_Instruction* op = code + i; \ - const v4_reg src = r[op->src_index]; \ - v4_reg* dst = r + op->dst_index; \ - switch (op->opcode) \ - { \ - case MUL: \ - *dst *= src; \ - break; \ - case ADD: \ - *dst += src + op->C; \ - break; \ - case SUB: \ - *dst -= src; \ - break; \ - case ROR: \ - { \ - const uint32_t shift = src % REG_BITS; \ - *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ - } \ - break; \ - case ROL: \ - { \ - const uint32_t shift = src % REG_BITS; \ - *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \ - } \ - break; \ - case XOR: \ - *dst ^= src; \ - break; \ - case RET: \ - return; \ - default: \ - UNREACHABLE_CODE; \ - break; \ - } \ +#define V4_EXEC(i) \ + { \ + const struct V4_Instruction* op = code + i; \ + const v4_reg src = r[op->src_index]; \ + v4_reg* dst = r + op->dst_index; \ + switch(op->opcode) \ + { \ + case MUL: \ + *dst *= src; \ + break; \ + case ADD: \ + *dst += src + op->C; \ + break; \ + case SUB: \ + *dst -= src; \ + break; \ + case ROR: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case ROL: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case XOR: \ + *dst ^= src; \ + break; \ + case RET: \ + return; \ + 
default: \ + UNREACHABLE_CODE; \ + break; \ + } \ } #define V4_EXEC_10(j) \ - V4_EXEC(j + 0) \ - V4_EXEC(j + 1) \ - V4_EXEC(j + 2) \ - V4_EXEC(j + 3) \ - V4_EXEC(j + 4) \ - V4_EXEC(j + 5) \ - V4_EXEC(j + 6) \ - V4_EXEC(j + 7) \ - V4_EXEC(j + 8) \ + V4_EXEC(j + 0) \ + V4_EXEC(j + 1) \ + V4_EXEC(j + 2) \ + V4_EXEC(j + 3) \ + V4_EXEC(j + 4) \ + V4_EXEC(j + 5) \ + V4_EXEC(j + 6) \ + V4_EXEC(j + 7) \ + V4_EXEC(j + 8) \ V4_EXEC(j + 9) // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency @@ -161,13 +161,13 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) // 69 102 // Unroll 70 instructions here - V4_EXEC_10(0); // instructions 0-9 - V4_EXEC_10(10); // instructions 10-19 - V4_EXEC_10(20); // instructions 20-29 - V4_EXEC_10(30); // instructions 30-39 - V4_EXEC_10(40); // instructions 40-49 - V4_EXEC_10(50); // instructions 50-59 - V4_EXEC_10(60); // instructions 60-69 + V4_EXEC_10(0); // instructions 0-9 + V4_EXEC_10(10); // instructions 10-19 + V4_EXEC_10(20); // instructions 20-29 + V4_EXEC_10(30); // instructions 30-39 + V4_EXEC_10(40); // instructions 40-49 + V4_EXEC_10(50); // instructions 50-59 + V4_EXEC_10(60); // instructions 60-69 #undef V4_EXEC_10 #undef V4_EXEC @@ -176,7 +176,7 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) // If we don't have enough data available, generate more static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size) { - if (*data_index + bytes_needed > data_size) + if(*data_index + bytes_needed > data_size) { blake256_hash((uint8_t*)data, (uint8_t*)data, data_size); *data_index = 0; @@ -188,7 +188,7 @@ static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed // Generates as many random math operations as possible with given latency and ALU restrictions // "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions -template +template static int 
v4_random_math_init(struct V4_Instruction* code, const uint64_t height) { printer::inst()->print_msg(LDEBUG, "CryptonightR create random math for block %llu", height); @@ -199,13 +199,13 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same // Source: https://www.agner.org/optimize/instruction_tables.pdf - const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; + const int op_latency[V4_INSTRUCTION_COUNT] = {3, 2, 1, 2, 2, 1}; // Instruction latencies for theoretical ASIC implementation - const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; + const int asic_op_latency[V4_INSTRUCTION_COUNT] = {3, 1, 1, 1, 1, 1}; // Available ALUs for each instruction - const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + const int op_ALUs[V4_INSTRUCTION_COUNT] = {ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT}; int8_t data[32]; memset(data, 0, sizeof(data)); @@ -226,7 +226,8 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // There is a small chance (1.8%) that register R8 won't be used in the generated program // So we keep track of it and try again if it's not used bool r8_used; - do { + do + { int latency[9]; int asic_latency[9]; @@ -237,7 +238,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // // Registers R4-R8 are constant and are treated as having the same value because when we do // the same operation twice with two constant source registers, it can be optimized into a single operation - uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + uint32_t 
inst_data[9] = {0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF}; bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; bool is_rotation[V4_INSTRUCTION_COUNT]; @@ -260,11 +261,11 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // Generate random code to achieve minimal required latency for our abstract CPU // Try to get this latency for all 4 registers - while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) + while(((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) { // Fail-safe to guarantee loop termination ++total_iterations; - if (total_iterations > 256) + if(total_iterations > 256) break; check_data(&data_index, 1, data, sizeof(data)); @@ -277,12 +278,12 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // ROR/ROL = opcode 5, shift direction is selected randomly // XOR = opcodes 6-7 uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); - if (opcode == 5) + if(opcode == 5) { check_data(&data_index, 1, data, sizeof(data)); opcode = (data[data_index++] >= 0) ? ROR : ROL; } - else if (opcode >= 6) + else if(opcode >= 6) { opcode = XOR; } @@ -298,7 +299,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh int b = src_index; // Don't do ADD/SUB/XOR with the same register - if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) + if(((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) { // a is always < 4, so we don't need to check bounds here b = (ALGO == cryptonight_r_wow) ? 
(a + 4) : 8; @@ -306,7 +307,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh } // Don't do rotation with the same destination twice because it's equal to a single rotation - if (is_rotation[opcode] && rotated[a]) + if(is_rotation[opcode] && rotated[a]) { continue; } @@ -314,7 +315,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations // 2xXOR(a, b) = NOP - if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) + if((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) { continue; } @@ -322,20 +323,20 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // Find which ALU is available (and when) for this instruction int next_latency = (latency[a] > latency[b]) ? 
latency[a] : latency[b]; int alu_index = -1; - while (next_latency < TOTAL_LATENCY) + while(next_latency < TOTAL_LATENCY) { - for (int i = op_ALUs[opcode] - 1; i >= 0; --i) + for(int i = op_ALUs[opcode] - 1; i >= 0; --i) { - if (!alu_busy[next_latency][i]) + if(!alu_busy[next_latency][i]) { // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check - if ((opcode == ADD) && alu_busy[next_latency + 1][i]) + if((opcode == ADD) && alu_busy[next_latency + 1][i]) { continue; } // Rotation can only start when previous rotation is finished, so do an additional availability check - if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) + if(is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) { continue; } @@ -344,7 +345,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh break; } } - if (alu_index >= 0) + if(alu_index >= 0) { break; } @@ -352,16 +353,16 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh } // Don't generate instructions that leave some register unchanged for more than 7 cycles - if (next_latency > latency[a] + 7) + if(next_latency > latency[a] + 7) { continue; } next_latency += op_latency[opcode]; - if (next_latency <= TOTAL_LATENCY) + if(next_latency <= TOTAL_LATENCY) { - if (is_rotation[opcode]) + if(is_rotation[opcode]) { ++rotate_count; } @@ -382,12 +383,12 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh code[code_size].src_index = src_index; code[code_size].C = 0; - if (src_index == 8) + if(src_index == 8) { r8_used = true; } - if (opcode == ADD) + if(opcode == ADD) { // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; @@ -401,7 +402,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh } 
++code_size; - if (code_size >= NUM_INSTRUCTIONS_MIN) + if(code_size >= NUM_INSTRUCTIONS_MIN) { break; } @@ -416,17 +417,19 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC // Get this latency for at least 1 of the 4 registers const int prev_code_size = code_size; - while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + while((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) { int min_idx = 0; int max_idx = 0; - for (int i = 1; i < 4; ++i) + for(int i = 1; i < 4; ++i) { - if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; - if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + if(asic_latency[i] < asic_latency[min_idx]) + min_idx = i; + if(asic_latency[i] > asic_latency[max_idx]) + max_idx = i; } - const uint8_t pattern[3] = { ROR, MUL, MUL }; + const uint8_t pattern[3] = {ROR, MUL, MUL}; const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; latency[min_idx] = latency[max_idx] + op_latency[opcode]; asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; @@ -438,9 +441,9 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh ++code_size; } - // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time - // It never does more than 4 iterations for all block heights < 10,000,000 - } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all 
block heights < 10,000,000 + } while(!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here // Add final instruction to stop the interpreter diff --git a/xmrstak/backend/cpu/hwlocMemory.cpp b/xmrstak/backend/cpu/hwlocMemory.cpp index 089570fc0..067f27975 100644 --- a/xmrstak/backend/cpu/hwlocMemory.cpp +++ b/xmrstak/backend/cpu/hwlocMemory.cpp @@ -6,6 +6,24 @@ #include +static __hwloc_inline int +xmrstak_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) +{ +#if HWLOC_API_VERSION >= 0x20000 + return hwloc_set_membind( + topology, + nodeset, + policy, + flags| HWLOC_MEMBIND_BYNODESET); +#else + return hwloc_set_membind_nodeset( + topology, + nodeset, + policy, + flags); +#endif +} + /** pin memory to NUMA node * * Set the default memory policy for the current thread to bind memory to the @@ -13,7 +31,7 @@ * * @param puId core id */ -void bindMemoryToNUMANode( size_t puId ) +void bindMemoryToNUMANode(size_t puId) { int depth; hwloc_topology_t topology; @@ -30,18 +48,18 @@ void bindMemoryToNUMANode( size_t puId ) depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); - for( uint32_t i = 0; + for(uint32_t i = 0; i < hwloc_get_nbobjs_by_depth(topology, depth); - i++ ) + i++) { hwloc_obj_t pu = hwloc_get_obj_by_depth(topology, depth, i); - if( pu->os_index == puId ) + if(pu->os_index == puId) { - if( 0 > hwloc_set_membind_nodeset( - topology, - pu->nodeset, - HWLOC_MEMBIND_BIND, - HWLOC_MEMBIND_THREAD)) + if(0 > xmrstak_set_membind_nodeset( + topology, + pu->nodeset, + HWLOC_MEMBIND_BIND, + HWLOC_MEMBIND_THREAD)) { printer::inst()->print_msg(L0, "hwloc: can't bind memory"); } @@ -57,7 +75,7 @@ void bindMemoryToNUMANode( size_t puId ) } #else -void bindMemoryToNUMANode( size_t ) +void bindMemoryToNUMANode(size_t) { } diff --git a/xmrstak/backend/cpu/hwlocMemory.hpp 
b/xmrstak/backend/cpu/hwlocMemory.hpp index 2130c2ced..42fa3456f 100644 --- a/xmrstak/backend/cpu/hwlocMemory.hpp +++ b/xmrstak/backend/cpu/hwlocMemory.hpp @@ -9,4 +9,4 @@ * * @param puId core id */ -void bindMemoryToNUMANode( size_t puId ); +void bindMemoryToNUMANode(size_t puId); diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp index a14be1732..a7bb91d61 100644 --- a/xmrstak/backend/cpu/jconf.cpp +++ b/xmrstak/backend/cpu/jconf.cpp @@ -37,7 +37,6 @@ #include #endif - namespace xmrstak { namespace cpu @@ -48,9 +47,14 @@ using namespace rapidjson; /* * This enum needs to match index in oConfigValues, otherwise we will get a runtime error */ -enum configEnum { aCpuThreadsConf, sUseSlowMem }; +enum configEnum +{ + aCpuThreadsConf, + sUseSlowMem +}; -struct configVal { +struct configVal +{ configEnum iName; const char* sName; Type iType; @@ -59,10 +63,9 @@ struct configVal { // Same order as in configEnum, as per comment above // kNullType means any type configVal oConfigValues[] = { - { aCpuThreadsConf, "cpu_threads_conf", kNullType } -}; + {aCpuThreadsConf, "cpu_threads_conf", kNullType}}; -constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); +constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0])); inline bool checkType(Type have, Type want) { @@ -95,7 +98,7 @@ jconf::jconf() prv = new opaque_private(); } -bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) +bool jconf::GetThreadConfig(size_t id, thd_cfg& cfg) { if(!prv->configValues[aCpuThreadsConf]->IsArray()) return false; @@ -148,7 +151,6 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) return true; } - size_t jconf::GetThreadCount() { if(prv->configValues[aCpuThreadsConf]->IsArray()) @@ -159,22 +161,22 @@ size_t jconf::GetThreadCount() bool jconf::parse_config(const char* sFilename) { - FILE * pFile; - char * buffer; + FILE* pFile; + char* buffer; size_t flen; pFile = fopen(sFilename, "rb"); - if (pFile == NULL) + if(pFile == 
NULL) { printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); return false; } - fseek(pFile,0,SEEK_END); + fseek(pFile, 0, SEEK_END); flen = ftell(pFile); rewind(pFile); - if(flen >= 64*1024) + if(flen >= 64 * 1024) { fclose(pFile); printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); @@ -189,7 +191,7 @@ bool jconf::parse_config(const char* sFilename) } buffer = (char*)malloc(flen + 3); - if(fread(buffer+1, flen, 1, pFile) != 1) + if(fread(buffer + 1, flen, 1, pFile) != 1) { free(buffer); fclose(pFile); @@ -211,7 +213,7 @@ bool jconf::parse_config(const char* sFilename) buffer[flen] = '}'; buffer[flen + 1] = '\0'; - prv->jsonDoc.Parse(buffer, flen+2); + prv->jsonDoc.Parse(buffer, flen + 2); free(buffer); if(prv->jsonDoc.HasParseError()) @@ -251,7 +253,7 @@ bool jconf::parse_config(const char* sFilename) } thd_cfg c; - for(size_t i=0; i < GetThreadCount(); i++) + for(size_t i = 0; i < GetThreadCount(); i++) { if(!GetThreadConfig(i, c)) { diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp index 4ec9165d5..67dbd0275 100644 --- a/xmrstak/backend/cpu/jconf.hpp +++ b/xmrstak/backend/cpu/jconf.hpp @@ -12,16 +12,18 @@ namespace cpu class jconf { -public: + public: static jconf* inst() { - if (oInst == nullptr) oInst = new jconf; + if(oInst == nullptr) + oInst = new jconf; return oInst; }; bool parse_config(const char* sFilename = params::inst().configFileCPU.c_str()); - struct thd_cfg { + struct thd_cfg + { int iMultiway; bool bNoPrefetch; std::string asm_version_str; @@ -29,10 +31,10 @@ class jconf }; size_t GetThreadCount(); - bool GetThreadConfig(size_t id, thd_cfg &cfg); + bool GetThreadConfig(size_t id, thd_cfg& cfg); bool NeedsAutoconf(); -private: + private: jconf(); static jconf* oInst; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index e90b59500..43759776f 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -23,33 +23,34 @@ 
#include "crypto/cryptonight_aesni.h" -#include "xmrstak/misc/console.hpp" -#include "xmrstak/backend/iBackend.hpp" +#include "jconf.hpp" +#include "xmrstak/backend/cpu/cpuType.hpp" #include "xmrstak/backend/globalStates.hpp" +#include "xmrstak/backend/iBackend.hpp" #include "xmrstak/misc/configEditor.hpp" -#include "xmrstak/backend/cpu/cpuType.hpp" +#include "xmrstak/misc/console.hpp" #include "xmrstak/params.hpp" -#include "jconf.hpp" -#include "xmrstak/misc/executor.hpp" #include "minethd.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/executor.hpp" #include "hwlocMemory.hpp" #include "xmrstak/backend/miner_work.hpp" #ifndef CONF_NO_HWLOC -# include "autoAdjustHwloc.hpp" +#include "autoAdjustHwloc.hpp" +#include "autoAdjust.hpp" #else -# include "autoAdjust.hpp" +#include "autoAdjust.hpp" #endif #include -#include +#include #include +#include #include #include -#include #include #ifdef _WIN32 @@ -58,9 +59,9 @@ #include #if defined(__APPLE__) -#include #include -#define SYSCTL_CORE_COUNT "machdep.cpu.core_count" +#include +#define SYSCTL_CORE_COUNT "machdep.cpu.core_count" #elif defined(__FreeBSD__) #include #endif //__APPLE__ @@ -87,7 +88,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id } #elif defined(__APPLE__) thread_port_t mach_thread; - thread_affinity_policy_data_t policy = { static_cast(cpu_id) }; + thread_affinity_policy_data_t policy = {static_cast(cpu_id)}; mach_thread = pthread_mach_thread_np(h); return thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1) == KERN_SUCCESS; #elif defined(__FreeBSD__) @@ -96,8 +97,8 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id CPU_SET(cpu_id, &mn); return pthread_setaffinity_np(h, sizeof(cpuset_t), &mn) == 0; #elif defined(__OpenBSD__) - printer::inst()->print_msg(L0,"WARNING: thread pinning is not supported under OPENBSD."); - return true; + printer::inst()->print_msg(L0, "WARNING: thread pinning is not 
supported under OPENBSD."); + return true; #else cpu_set_t mn; CPU_ZERO(&mn); @@ -120,7 +121,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, std::unique_lock lck(thd_aff_set); std::future order_guard = order_fix.get_future(); - switch (iMultiway) + switch(iMultiway) { case 5: oWorkThd = std::thread(&minethd::penta_work_main, this); @@ -150,13 +151,13 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, cryptonight_ctx* minethd::minethd_alloc_ctx() { cryptonight_ctx* ctx; - alloc_msg msg = { 0 }; + alloc_msg msg = {0}; - switch (::jconf::inst()->GetSlowMemSetting()) + switch(::jconf::inst()->GetSlowMemSetting()) { case ::jconf::never_use: ctx = cryptonight_alloc_ctx(1, 1, &msg); - if (ctx == NULL) + if(ctx == NULL) printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); else { @@ -170,7 +171,7 @@ cryptonight_ctx* minethd::minethd_alloc_ctx() case ::jconf::no_mlck: ctx = cryptonight_alloc_ctx(1, 0, &msg); - if (ctx == NULL) + if(ctx == NULL) printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); else { @@ -184,12 +185,12 @@ cryptonight_ctx* minethd::minethd_alloc_ctx() case ::jconf::print_warning: ctx = cryptonight_alloc_ctx(1, 1, &msg); - if (msg.warning != NULL) + if(msg.warning != NULL) printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); - if (ctx == NULL) + if(ctx == NULL) ctx = cryptonight_alloc_ctx(0, 0, NULL); - if (ctx != NULL) + if(ctx != NULL) { ctx->hash_fn = nullptr; ctx->loop_fn = nullptr; @@ -220,11 +221,11 @@ cryptonight_ctx* minethd::minethd_alloc_ctx() static constexpr size_t MAX_N = 5; bool minethd::self_test() { - alloc_msg msg = { 0 }; + alloc_msg msg = {0}; size_t res; bool fatal = false; - switch (::jconf::inst()->GetSlowMemSetting()) + switch(::jconf::inst()->GetSlowMemSetting()) { case ::jconf::never_use: res = cryptonight_init(1, 1, &msg); @@ -255,13 +256,13 @@ bool minethd::self_test() if(res == 0 && fatal) return false; 
- cryptonight_ctx *ctx[MAX_N] = {0}; - for (int i = 0; i < MAX_N; i++) + cryptonight_ctx* ctx[MAX_N] = {0}; + for(int i = 0; i < MAX_N; i++) { - if ((ctx[i] = minethd_alloc_ctx()) == nullptr) + if((ctx[i] = minethd_alloc_ctx()) == nullptr) { printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory."); - for (int j = 0; j < i; j++) + for(int j = 0; j < i; j++) cryptonight_free_ctx(ctx[j]); return false; } @@ -279,63 +280,68 @@ bool minethd::self_test() { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test", 14, out, ctx, algo); - bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; minethd::cn_on_new_job dm; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test", 14, out, ctx, algo); - bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; func_multi_selector<2>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo); - bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + bResult = bResult 
&& memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", + 64) == 0; func_multi_selector<2>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo); - bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", + 64) == 0; func_multi_selector<3>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a testThis is a testThis is a test", 14, out, ctx, algo); - bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + 
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", + 96) == 0; func_multi_selector<4>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo); - bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", + 128) == 0; func_multi_selector<5>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo); - bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - 
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0; + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", + 160) == 0; } else if(algo == POW(cryptonight_lite)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; + bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, 
"\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; + bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; } else if(algo == POW(cryptonight_monero)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; + bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; + bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; } else if(algo == POW(cryptonight_monero_v8)) { @@ -351,61 +357,61 @@ bool minethd::self_test() { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; + bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; func_selector(ctx, 
::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; + bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; } else if(algo == POW(cryptonight_ipbc)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0; + bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0; + bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0; } else if(algo == POW(cryptonight_stellite)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; + bResult = bResult && memcmp(out, 
"\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; + bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; } else if(algo == POW(cryptonight_masari)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; + bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; + bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; } else if(algo == POW(cryptonight_heavy)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, 
"\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; + bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; + bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; } else if(algo == POW(cryptonight_haven)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; + bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; + bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; } else if(algo == POW(cryptonight_bittube2)) { @@ -415,7 +421,7 @@ bool minethd::self_test() func_selector(ctx, 
::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx, algo); - bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; + bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; ctx[0]->hash_fn("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx, algo); bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0; @@ -427,29 +433,29 @@ bool minethd::self_test() { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx, algo); - bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0; + bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0; } else if(algo == 
POW(cryptonight_gpu)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("", 0, out, ctx, algo); - bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("", 0, out, ctx, algo); - bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; } else if(algo == POW(cryptonight_conceal)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("", 0, out, ctx, algo); - bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; + bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("", 0, out, ctx, algo); - bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; + bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; } - else if (algo == POW(cryptonight_turtle)) + else if(algo == 
POW(cryptonight_turtle)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); @@ -467,7 +473,7 @@ bool minethd::self_test() work.iBlockHeight = 1806260; set_job(work, ctx); ctx[0]->hash_fn("\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xf7\x59\x58\x8a\xd5\x7e\x75\x84\x67\x29\x54\x43\xa9\xbd\x71\x49\x0a\xbf\xf8\xe9\xda\xd1\xb9\x5b\x6b\xf2\xf5\xd0\xd7\x83\x87\xbc", 32) == 0; + bResult = bResult && memcmp(out, "\xf7\x59\x58\x8a\xd5\x7e\x75\x84\x67\x29\x54\x43\xa9\xbd\x71\x49\x0a\xbf\xf8\xe9\xda\xd1\xb9\x5b\x6b\xf2\xf5\xd0\xd7\x83\x87\xbc", 32) == 0; } else if(algo == POW(cryptonight_v8_reversewaltz)) { @@ -498,7 +504,7 @@ bool minethd::self_test() "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations."); } - for (int i = 0; i < MAX_N; i++) + for(int i = 0; i < MAX_N; i++) cryptonight_free_ctx(ctx[i]); return bResult; @@ -510,9 +516,23 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work if(!configEditor::file_exist(params::inst().configFileCPU)) { +#ifndef CONF_NO_HWLOC + autoAdjustHwloc adjustHwloc; + if(!adjustHwloc.printConfig()) + { + autoAdjust adjust; + if(!adjust.printConfig()) + { + return pvThreads; + } + } +#else autoAdjust adjust; if(!adjust.printConfig()) + { return pvThreads; + } +#endif } if(!jconf::inst()->parse_config()) @@ -520,14 +540,13 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work win_exit(); } - //Launch the requested number of single and double threads, to distribute //load evenly we need to alternate single and double threads size_t i, n = jconf::inst()->GetThreadCount(); pvThreads.reserve(n); jconf::thd_cfg cfg; - for (i = 0; i < n; i++) + for(i = 0; i < n; i++) { 
jconf::inst()->GetThreadConfig(i, cfg); @@ -572,11 +591,11 @@ static std::string getAsmName(const uint32_t num_hashes) return asm_type; } -template +template void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& on_new_job, bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str) { - static_assert(N >= 1, "number of threads must be >= 1" ); + static_assert(N >= 1, "number of threads must be >= 1"); // We have two independent flag bits in the functions // therefore we will build a binary digit and select the @@ -717,21 +736,20 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - Cryptonight_hash::template hash - }; + Cryptonight_hash::template hash}; std::bitset<2> digit; digit.set(0, !bHaveAes); digit.set(1, !bNoPrefetch); - ctx[0]->hash_fn = func_table[ algv << 2 | digit.to_ulong() ]; + ctx[0]->hash_fn = func_table[algv << 2 | digit.to_ulong()]; // check for asm optimized version for cryptonight_v8 if(algo == cryptonight_monero_v8) { std::string selected_asm = asm_version_str; if(selected_asm == "auto") - selected_asm = cpu::getAsmName(N); + selected_asm = cpu::getAsmName(N); if(selected_asm != "off") { @@ -747,7 +765,7 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& { std::string selected_asm = asm_version_str; if(selected_asm == "auto") - selected_asm = cpu::getAsmName(N); + selected_asm = cpu::getAsmName(N); if(selected_asm == "off") { for(int h = 0; h < N; ++h) @@ -769,7 +787,7 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& }; auto it = on_new_job_map.find(algo.Id()); - if (it != on_new_job_map.end()) + if(it != on_new_job_map.end()) on_new_job = it->second; else on_new_job = nullptr; @@ -806,18 +824,18 @@ void minethd::penta_work_main() multiway_work_main<5u>(); } -template -void 
minethd::prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce) +template +void minethd::prep_multiway_work(uint8_t* bWorkBlob, uint32_t** piNonce) { - for (size_t i = 0; i < N; i++) + for(size_t i = 0; i < N; i++) { memcpy(bWorkBlob + oWork.iWorkSize * i, oWork.bWorkBlob, oWork.iWorkSize); - if (i > 0) + if(i > 0) piNonce[i] = (uint32_t*)(bWorkBlob + oWork.iWorkSize * i + 39); } } -template +template void minethd::multiway_work_main() { if(affinity >= 0) //-1 means no affinity @@ -825,25 +843,26 @@ void minethd::multiway_work_main() order_fix.set_value(); std::unique_lock lck(thd_aff_set); - lck.release(); + lck.unlock(); std::this_thread::yield(); - cryptonight_ctx *ctx[MAX_N]; + cryptonight_ctx* ctx[MAX_N]; uint64_t iCount = 0; - uint64_t *piHashVal[MAX_N]; - uint32_t *piNonce[MAX_N]; + uint64_t iLastCount = 0; + uint64_t* piHashVal[MAX_N]; + uint32_t* piNonce[MAX_N]; uint8_t bHashOut[MAX_N * 32]; uint8_t bWorkBlob[sizeof(miner_work::bWorkBlob) * MAX_N]; uint32_t iNonce; job_result res; - for (size_t i = 0; i < N; i++) + for(size_t i = 0; i < N; i++) { ctx[i] = minethd_alloc_ctx(); if(ctx[i] == nullptr) { printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory."); - for (int j = 0; j < i; j++) + for(int j = 0; j < i; j++) cryptonight_free_ctx(ctx[j]); win_exit(1); } @@ -863,15 +882,15 @@ void minethd::multiway_work_main() size_t lastPoolId = 0; func_multi_selector(ctx, on_new_job, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); - while (bQuit == 0) + while(bQuit == 0) { - if (oWork.bStall) + if(oWork.bStall) { /* We are stalled here because the executor didn't find a job for us yet, either because of network latency, or a socket problem. 
Since we are raison d'etre of this software it us sensible to just wait until we have something*/ - while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) std::this_thread::sleep_for(std::chrono::milliseconds(100)); globalStates::inst().consume_work(oWork, iJobNo); @@ -908,13 +927,12 @@ void minethd::multiway_work_main() if(on_new_job != nullptr) on_new_job(oWork, ctx); - while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { - if ((iCount++ & 0x7) == 0) //Store stats every 8*N hashes + if((iCount++ & 0x7) == 0) //Store stats every 8*N hashes { - uint64_t iStamp = get_timestamp_ms(); - iHashCount.store(iCount * N, std::memory_order_relaxed); - iTimestamp.store(iStamp, std::memory_order_relaxed); + updateStats((iCount - iLastCount) * N, oWork.iPoolId); + iLastCount = iCount; } nonce_ctr -= N; @@ -927,19 +945,18 @@ void minethd::multiway_work_main() break; } - for (size_t i = 0; i < N; i++) + for(size_t i = 0; i < N; i++) *piNonce[i] = iNonce++; ctx[0]->hash_fn(bWorkBlob, oWork.iWorkSize, bHashOut, ctx, miner_algo); - for (size_t i = 0; i < N; i++) + for(size_t i = 0; i < N; i++) { - if (*piHashVal[i] < oWork.iTarget) + if(*piHashVal[i] < oWork.iTarget) { executor::inst()->push_event( ex_event(job_result(oWork.sJobID, iNonce - N + i, bHashOut + 32 * i, iThreadNo, miner_algo), - oWork.iPoolId) - ); + oWork.iPoolId)); } } @@ -950,7 +967,7 @@ void minethd::multiway_work_main() prep_multiway_work(bWorkBlob, piNonce); } - for (int i = 0; i < N; i++) + for(int i = 0; i < N; i++) cryptonight_free_ctx(ctx[i]); } diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 1e25f5d4f..a5201f37a 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -1,15 +1,15 @@ #pragma once -#include 
"xmrstak/jconf.hpp" #include "crypto/cryptonight.h" -#include "xmrstak/backend/miner_work.hpp" #include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/backend/miner_work.hpp" +#include "xmrstak/jconf.hpp" +#include +#include #include #include #include -#include -#include namespace xmrstak { @@ -18,7 +18,7 @@ namespace cpu class minethd : public iBackend { -public: + public: static std::vector thread_starter(uint32_t threadOffset, miner_work& pWork); static bool self_test(); @@ -29,19 +29,18 @@ class minethd : public iBackend static cryptonight_ctx* minethd_alloc_ctx(); - template + template static void func_multi_selector(cryptonight_ctx**, minethd::cn_on_new_job& on_new_job, - bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str = "off"); + bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str = "off"); - private: - + private: minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version); - template + template void multiway_work_main(); - template - void prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce); + template + void prep_multiway_work(uint8_t* bWorkBlob, uint32_t** piNonce); void work_main(); void double_work_main(); diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index e58665922..262865ea0 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -1,9 +1,9 @@ #pragma once -#include +#include #include -#include +#include #include -#include +#include constexpr size_t start_derived_algo_id = 1000; @@ -15,10 +15,10 @@ enum xmrstak_algo_id cryptonight_monero = 3, cryptonight_heavy = 4, cryptonight_aeon = 5, - cryptonight_ipbc = 6, // equal to cryptonight_aeon with a small tweak in the miner code - cryptonight_stellite = 7, //equal to cryptonight_monero but with one tiny change - cryptonight_masari = 8, //equal to cryptonight_monero but with less 
iterations, used by masari - cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak + cryptonight_ipbc = 6, // equal to cryptonight_aeon with a small tweak in the miner code + cryptonight_stellite = 7, //equal to cryptonight_monero but with one tiny change + cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari + cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks cryptonight_monero_v8 = 11, cryptonight_superfast = 12, @@ -42,35 +42,32 @@ enum xmrstak_algo_id inline std::string get_algo_name(xmrstak_algo_id algo_id) { static std::array base_algo_names = - {{ - "invalid_algo", - "cryptonight", - "cryptonight_lite", - "cryptonight_v7", - "cryptonight_heavy", - "cryptonight_lite_v7", - "cryptonight_lite_v7_xor", - "cryptonight_v7_stellite", - "cryptonight_masari", - "cryptonight_haven", - "cryptonight_bittube2", - "cryptonight_v8", - "cryptonight_superfast", - "cryptonight_gpu", - "cryptonight_conceal", - "cryptonight_r_wow", - "cryptonight_r", - "cryptonight_v8_reversewaltz" // used by graft - }}; + {{ + "invalid_algo", + "cryptonight", + "cryptonight_lite", + "cryptonight_v7", + "cryptonight_heavy", + "cryptonight_lite_v7", + "cryptonight_lite_v7_xor", + "cryptonight_v7_stellite", + "cryptonight_masari", + "cryptonight_haven", + "cryptonight_bittube2", + "cryptonight_v8", + "cryptonight_superfast", + "cryptonight_gpu", + "cryptonight_conceal", + "cryptonight_r_wow", + "cryptonight_r", + "cryptonight_v8_reversewaltz" // used by graft + }}; static std::array derived_algo_names = - {{ - "cryptonight_turtle", - "cryptonight_v8_half", // used by masari and stellite - "cryptonight_v8_zelerius", - "cryptonight_v8_double" - }}; - + {{"cryptonight_turtle", + "cryptonight_v8_half", // used by masari and stellite + "cryptonight_v8_zelerius", + "cryptonight_v8_double"}}; if(algo_id < 
start_derived_algo_id) return base_algo_names[algo_id]; @@ -80,19 +77,35 @@ inline std::string get_algo_name(xmrstak_algo_id algo_id) struct xmrstak_algo { - xmrstak_algo(xmrstak_algo_id name_id) : algo_name(name_id), base_algo(name_id) + xmrstak_algo(xmrstak_algo_id name_id) : + algo_name(name_id), + base_algo(name_id) { } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) : algo_name(name_id), base_algo(algorithm) + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) : + algo_name(name_id), + base_algo(algorithm) { } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) : algo_name(name_id), base_algo(algorithm), iter(iteration) + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) : + algo_name(name_id), + base_algo(algorithm), + iter(iteration) { } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory) + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) : + algo_name(name_id), + base_algo(algorithm), + iter(iteration), + mem(memory) { } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory), mask(mem_mask) + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) : + algo_name(name_id), + base_algo(algorithm), + iter(iteration), + mem(memory), + mask(mem_mask) { } @@ -187,35 +200,33 @@ constexpr uint32_t CN_DOUBLE_ITER = 0x100000; inline xmrstak_algo POW(xmrstak_algo_id algo_id) { - static std::array pow = {{ - {invalid_algo, invalid_algo}, + static std::array pow = {{{invalid_algo, invalid_algo}, {cryptonight, cryptonight, CN_ITER, CN_MEMORY}, - {cryptonight_lite, cryptonight_lite, CN_ITER/2, 
CN_MEMORY/2}, + {cryptonight_lite, cryptonight_lite, CN_ITER / 2, CN_MEMORY / 2}, {cryptonight_monero, cryptonight_monero, CN_ITER, CN_MEMORY}, - {cryptonight_heavy, cryptonight_heavy, CN_ITER/2, CN_MEMORY*2}, - {cryptonight_aeon, cryptonight_aeon, CN_ITER/2, CN_MEMORY/2}, - {cryptonight_ipbc, cryptonight_ipbc, CN_ITER/2, CN_MEMORY/2}, // equal to cryptonight_aeon with a small tweak in the miner code - {cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, //equal to cryptonight_monero but with one tiny change - {cryptonight_masari, cryptonight_masari, CN_ITER/2, CN_MEMORY}, //equal to cryptonight_monero but with less iterations, used by masari - {cryptonight_haven, cryptonight_haven, CN_ITER/2, CN_MEMORY*2}, // equal to cryptonight_heavy with a small tweak - {cryptonight_bittube2, cryptonight_bittube2, CN_ITER/2, CN_MEMORY*2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks + {cryptonight_heavy, cryptonight_heavy, CN_ITER / 2, CN_MEMORY * 2}, + {cryptonight_aeon, cryptonight_aeon, CN_ITER / 2, CN_MEMORY / 2}, + {cryptonight_ipbc, cryptonight_ipbc, CN_ITER / 2, CN_MEMORY / 2}, // equal to cryptonight_aeon with a small tweak in the miner code + {cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, //equal to cryptonight_monero but with one tiny change + {cryptonight_masari, cryptonight_masari, CN_ITER / 2, CN_MEMORY}, //equal to cryptonight_monero but with less iterations, used by masari + {cryptonight_haven, cryptonight_haven, CN_ITER / 2, CN_MEMORY * 2}, // equal to cryptonight_heavy with a small tweak + {cryptonight_bittube2, cryptonight_bittube2, CN_ITER / 2, CN_MEMORY * 2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks {cryptonight_monero_v8, cryptonight_monero_v8, CN_ITER, CN_MEMORY}, - {cryptonight_superfast, cryptonight_superfast, CN_ITER/4, CN_MEMORY}, + {cryptonight_superfast, cryptonight_superfast, CN_ITER / 4, CN_MEMORY}, {cryptonight_gpu, 
cryptonight_gpu, CN_GPU_ITER, CN_MEMORY, CN_GPU_MASK}, - {cryptonight_conceal, cryptonight_conceal, CN_ITER/2, CN_MEMORY}, + {cryptonight_conceal, cryptonight_conceal, CN_ITER / 2, CN_MEMORY}, {cryptonight_r_wow, cryptonight_r_wow, CN_ITER, CN_MEMORY}, {cryptonight_r, cryptonight_r, CN_ITER, CN_MEMORY}, - {cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY} - }}; + {cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY}}}; static std::array derived_pow = - {{ - {cryptonight_turtle, cryptonight_monero_v8, CN_ITER/8, CN_MEMORY/8, CN_TURTLE_MASK}, - {cryptonight_v8_half, cryptonight_monero_v8, CN_ITER/2, CN_MEMORY}, - {cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY}, - {cryptonight_v8_double, cryptonight_monero_v8, CN_DOUBLE_ITER, CN_MEMORY} - // {cryptonight_derived} - }}; + {{ + {cryptonight_turtle, cryptonight_monero_v8, CN_ITER / 8, CN_MEMORY / 8, CN_TURTLE_MASK}, + {cryptonight_v8_half, cryptonight_monero_v8, CN_ITER / 2, CN_MEMORY}, + {cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY}, + {cryptonight_v8_double, cryptonight_monero_v8, CN_DOUBLE_ITER, CN_MEMORY} + // {cryptonight_derived} + }}; if(algo_id < start_derived_algo_id) return pow[algo_id]; diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp index 52ef3f391..5b4332ba4 100644 --- a/xmrstak/backend/globalStates.cpp +++ b/xmrstak/backend/globalStates.cpp @@ -21,15 +21,14 @@ * */ -#include "miner_work.hpp" #include "globalStates.hpp" +#include "miner_work.hpp" #include -#include #include +#include #include - namespace xmrstak { diff --git a/xmrstak/backend/globalStates.hpp b/xmrstak/backend/globalStates.hpp index d6966c4a2..a67580166 100644 --- a/xmrstak/backend/globalStates.hpp +++ b/xmrstak/backend/globalStates.hpp @@ -1,10 +1,10 @@ #pragma once #include "xmrstak/backend/miner_work.hpp" -#include "xmrstak/misc/environment.hpp" -#include "xmrstak/misc/console.hpp" 
#include "xmrstak/backend/pool_data.hpp" #include "xmrstak/cpputil/read_write_lock.h" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/environment.hpp" #include @@ -17,7 +17,11 @@ struct globalStates { auto& env = environment::inst(); if(env.pglobalStates == nullptr) - env.pglobalStates = new globalStates; + { + std::unique_lock lck(env.update); + if(env.pglobalStates == nullptr) + env.pglobalStates = new globalStates; + } return *env.pglobalStates; } @@ -32,7 +36,7 @@ struct globalStates nonce = iGlobalNonce.fetch_add(reserve_count); } - void consume_work( miner_work& threadWork, uint64_t& currentJobId); + void consume_work(miner_work& threadWork, uint64_t& currentJobId); miner_work oGlobalWork; std::atomic iGlobalJobNo; @@ -41,8 +45,11 @@ struct globalStates uint64_t iThreadCount; size_t pool_id = invalid_pool_id; -private: - globalStates() : iThreadCount(0), iGlobalJobNo(0), iConsumeCnt(0) + private: + globalStates() : + iThreadCount(0), + iGlobalJobNo(0), + iConsumeCnt(0) { } diff --git a/xmrstak/backend/iBackend.hpp b/xmrstak/backend/iBackend.hpp index 18411b79c..3ca598bdd 100644 --- a/xmrstak/backend/iBackend.hpp +++ b/xmrstak/backend/iBackend.hpp @@ -1,12 +1,13 @@ #pragma once #include "xmrstak/backend/globalStates.hpp" +#include "xmrstak/net/msgstruct.hpp" #include -#include #include -#include +#include #include +#include template constexpr std::size_t countof(T const (&)[N]) noexcept @@ -16,35 +17,66 @@ constexpr std::size_t countof(T const (&)[N]) noexcept namespace xmrstak { - struct iBackend +struct iBackend +{ + + enum BackendType : uint32_t { + UNKNOWN = 0u, + CPU = 1u, + AMD = 2u, + NVIDIA = 3u + }; - enum BackendType : uint32_t { UNKNOWN = 0u, CPU = 1u, AMD = 2u, NVIDIA = 3u }; + static const char* getName(const BackendType type) + { + const char* backendNames[] = { + "unknown", + "cpu", + "amd", + "nvidia"}; - static const char* getName(const BackendType type) - { - const char* backendNames[] = { - "unknown", - "cpu", - "amd", - 
"nvidia" - }; - - uint32_t i = static_cast(type); - if(i >= countof(backendNames)) - i = 0; - - return backendNames[i]; - } + uint32_t i = static_cast(type); + if(i >= countof(backendNames)) + i = 0; + + return backendNames[i]; + } - std::atomic iHashCount; - std::atomic iTimestamp; - uint32_t iThreadNo; - BackendType backendType = UNKNOWN; + std::atomic iHashCount; + std::atomic iTimestamp; + uint32_t iThreadNo; + uint32_t iGpuIndex; + BackendType backendType = UNKNOWN; + uint64_t iLastStamp = get_timestamp_ms(); + double avgHashPerMsec = 0.0; - iBackend() : iHashCount(0), iTimestamp(0) + void updateStats(uint64_t numNewHashes, size_t poolId) + { + uint64_t iStamp = get_timestamp_ms(); + double timeDiff = static_cast(iStamp - iLastStamp); + iLastStamp = iStamp; + + if(poolId == 0) { + // if dev pool is active interpolate the number of shares (avoid hash rate drops) + numNewHashes = static_cast(avgHashPerMsec * timeDiff); } - }; + else + { + const double hashRatePerMs = static_cast(numNewHashes) / timeDiff; + constexpr double averagingBias = 0.1; + avgHashPerMsec = avgHashPerMsec * (1.0 - averagingBias) + hashRatePerMs * averagingBias; + } + iHashCount.fetch_add(numNewHashes, std::memory_order_relaxed); + iTimestamp.store(iStamp, std::memory_order_relaxed); + } + + iBackend() : + iHashCount(0), + iTimestamp(0) + { + } +}; } // namespace xmrstak diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp index d0e5237f2..114f2db8e 100644 --- a/xmrstak/backend/miner_work.hpp +++ b/xmrstak/backend/miner_work.hpp @@ -2,95 +2,110 @@ #include "xmrstak/backend/pool_data.hpp" -#include #include -#include -#include -#include #include +#include #include +#include +#include +#include namespace xmrstak { - struct miner_work +struct miner_work +{ + char sJobID[64]; + uint8_t bWorkBlob[128]; + uint32_t iWorkSize; + uint64_t iTarget; + bool bNiceHash; + bool bStall; + size_t iPoolId; + uint64_t iBlockHeight; + uint8_t* ref_ptr; + + miner_work() : + iWorkSize(0), 
+ bNiceHash(false), + bStall(true), + iPoolId(invalid_pool_id), + ref_ptr((uint8_t*)&iBlockHeight) {} + + miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, + uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : + iWorkSize(iWorkSize), + iTarget(iTarget), + bNiceHash(bNiceHash), + bStall(false), + iPoolId(iPoolId), + iBlockHeight(iBlockHeiht), + ref_ptr((uint8_t*)&iBlockHeight) { - char sJobID[64]; - uint8_t bWorkBlob[128]; - uint32_t iWorkSize; - uint64_t iTarget; - bool bNiceHash; - bool bStall; - size_t iPoolId; - uint64_t iBlockHeight; - uint8_t* ref_ptr; - - miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(invalid_pool_id), ref_ptr((uint8_t*)&iBlockHeight) { } - - miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, - uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : iWorkSize(iWorkSize), - iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight) - { - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(this->bWorkBlob, bWork, iWorkSize); - memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); - } - - miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget), - bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), ref_ptr((uint8_t*)&iBlockHeight) - { - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); - } - - miner_work(miner_work const&) = delete; - - miner_work& operator=(miner_work&& from) - { - assert(this != &from); - - iBlockHeight = from.iBlockHeight; - iPoolId = from.iPoolId; - bStall = from.bStall; - iWorkSize = from.iWorkSize; - bNiceHash = from.bNiceHash; - iTarget = from.iTarget; - - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(sJobID, from.sJobID, sizeof(sJobID)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + 
assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(this->bWorkBlob, bWork, iWorkSize); + memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); + } + + miner_work(miner_work&& from) : + iWorkSize(from.iWorkSize), + iTarget(from.iTarget), + bStall(from.bStall), + iPoolId(from.iPoolId), + iBlockHeight(from.iBlockHeight), + ref_ptr((uint8_t*)&iBlockHeight) + { + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); + } - return *this; - } + miner_work(miner_work const&) = delete; + + miner_work& operator=(miner_work&& from) + { + assert(this != &from); - miner_work& operator=(miner_work const& from) - { - assert(this != &from); + iBlockHeight = from.iBlockHeight; + iPoolId = from.iPoolId; + bStall = from.bStall; + iWorkSize = from.iWorkSize; + bNiceHash = from.bNiceHash; + iTarget = from.iTarget; - iBlockHeight = from.iBlockHeight; - iPoolId = from.iPoolId; - bStall = from.bStall; - iWorkSize = from.iWorkSize; - bNiceHash = from.bNiceHash; - iTarget = from.iTarget; + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(sJobID, from.sJobID, sizeof(sJobID)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - if(!ref_ptr) - return *this; + return *this; + } - for(size_t i=0; i <= 7 && iPoolId; i++) - ref_ptr[i] = from.ref_ptr[7-i]; + miner_work& operator=(miner_work const& from) + { + assert(this != &from); - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(sJobID, from.sJobID, sizeof(sJobID)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + iBlockHeight = from.iBlockHeight; + iPoolId = from.iPoolId; + bStall = from.bStall; + iWorkSize = from.iWorkSize; + bNiceHash = from.bNiceHash; + iTarget = from.iTarget; + if(!ref_ptr) return *this; - } - uint8_t getVersion() const - { - return bWorkBlob[0]; - } + for(size_t i = 0; i <= 7 && iPoolId; i++) + ref_ptr[i] = from.ref_ptr[7 - i]; + + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(sJobID, from.sJobID, sizeof(sJobID)); + 
memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - }; + return *this; + } + + uint8_t getVersion() const + { + return bWorkBlob[0]; + } +}; } // namespace xmrstak diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp index f1bf75819..a7587cbe0 100644 --- a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp +++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp @@ -14,17 +14,17 @@ * */ -#include -#include -#include #include +#include #include +#include +#include #include -#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp" #include "xmrstak/backend/cpu/crypto/variant4_random_math.h" -#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp" #include "xmrstak/cpputil/read_write_lock.h" +#include "xmrstak/misc/console.hpp" namespace xmrstak { @@ -33,80 +33,82 @@ namespace nvidia static std::string get_code(const V4_Instruction* code, int code_size) { - std::stringstream s; + std::stringstream s; - for (int i = 0; i < code_size; ++i) - { - const V4_Instruction inst = code[i]; + for(int i = 0; i < code_size; ++i) + { + const V4_Instruction inst = code[i]; - const uint32_t a = inst.dst_index; - const uint32_t b = inst.src_index; + const uint32_t a = inst.dst_index; + const uint32_t b = inst.src_index; - switch (inst.opcode) - { - case MUL: - s << 'r' << a << "*=r" << b << ';'; - break; + switch(inst.opcode) + { + case MUL: + s << 'r' << a << "*=r" << b << ';'; + break; - case ADD: - s << 'r' << a << "+=r" << b << '+' << inst.C << "U;"; - break; + case ADD: + s << 'r' << a << "+=r" << b << '+' << inst.C << "U;"; + break; - case SUB: - s << 'r' << a << "-=r" << b << ';'; - break; + case SUB: + s << 'r' << a << "-=r" << b << ';'; + break; - case ROR: - s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");"; - break; + case ROR: + s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");"; + break; - case ROL: - s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");"; - 
break; + case ROL: + s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");"; + break; - case XOR: - s << 'r' << a << "^=r" << b << ';'; - break; - } + case XOR: + s << 'r' << a << "^=r" << b << ';'; + break; + } - s << '\n'; - } + s << '\n'; + } - return s.str(); + return s.str(); } struct CacheEntry { - CacheEntry(xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, const std::vector& ptx, const std::string& lowered_name) : - algo(algo), - height(height), - arch_major(arch_major), - arch_minor(arch_minor), - ptx(ptx), - lowered_name(lowered_name) - {} - - xmrstak_algo algo; - uint64_t height; - int arch_major; - int arch_minor; - std::vector ptx; - std::string lowered_name; + CacheEntry(xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, const std::vector& ptx, const std::string& lowered_name) : + algo(algo), + height(height), + arch_major(arch_major), + arch_minor(arch_minor), + ptx(ptx), + lowered_name(lowered_name) + { + } + + xmrstak_algo algo; + uint64_t height; + int arch_major; + int arch_minor; + std::vector ptx; + std::string lowered_name; }; struct BackgroundTaskBase { - virtual ~BackgroundTaskBase() {} - virtual void exec() = 0; + virtual ~BackgroundTaskBase() {} + virtual void exec() = 0; }; -template +template struct BackgroundTask : public BackgroundTaskBase { - BackgroundTask(T&& func) : m_func(std::move(func)) {} - void exec() override { m_func(); } + BackgroundTask(T&& func) : + m_func(std::move(func)) {} + void exec() override { m_func(); } - T m_func; + T m_func; }; static ::cpputil::RWLock CryptonightR_cache_mutex; @@ -119,155 +121,165 @@ static std::thread* background_thread = nullptr; static void background_thread_proc() { - std::vector tasks; - for (;;) { - tasks.clear(); - { - std::lock_guard g(background_tasks_mutex); - background_tasks.swap(tasks); - } - - for (BackgroundTaskBase* task : tasks) { - task->exec(); - delete task; - } - - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } + 
std::vector tasks; + for(;;) + { + tasks.clear(); + { + std::lock_guard g(background_tasks_mutex); + background_tasks.swap(tasks); + } + + for(BackgroundTaskBase* task : tasks) + { + task->exec(); + delete task; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } } -template +template static void background_exec(T&& func) { - BackgroundTaskBase* task = new BackgroundTask(std::move(func)); - - std::lock_guard g(background_tasks_mutex); - background_tasks.push_back(task); - if (!background_thread) { - background_thread = new std::thread(background_thread_proc); - } + BackgroundTaskBase* task = new BackgroundTask(std::move(func)); + + std::lock_guard g(background_tasks_mutex); + background_tasks.push_back(task); + if(!background_thread) + { + background_thread = new std::thread(background_thread_proc); + } } static void CryptonightR_build_program( - std::vector& ptx, - std::string& lowered_name, - const xmrstak_algo& algo, - uint64_t height, - uint32_t precompile_count, - int arch_major, - int arch_minor, - std::string source) + std::vector& ptx, + std::string& lowered_name, + const xmrstak_algo& algo, + uint64_t height, + uint32_t precompile_count, + int arch_major, + int arch_minor, + std::string source) { - { + { CryptonightR_cache_mutex.WriteLock(); - // Remove old programs from cache - for (size_t i = 0; i < CryptonightR_cache.size();) - { - const CacheEntry& entry = CryptonightR_cache[i]; - if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height)) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height); - CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); - CryptonightR_cache.pop_back(); - } - else - { - ++i; - } - } + // Remove old programs from cache + for(size_t i = 0; i < CryptonightR_cache.size();) + { + const CacheEntry& entry = CryptonightR_cache[i]; + if((entry.algo == algo) && (entry.height + 2 + precompile_count < height)) + { + 
printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height); + CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); + CryptonightR_cache.pop_back(); + } + else + { + ++i; + } + } CryptonightR_cache_mutex.UnLock(); - } + } - ptx.clear(); - ptx.reserve(65536); + ptx.clear(); + ptx.reserve(65536); - std::lock_guard g1(CryptonightR_build_mutex); - { + std::lock_guard g1(CryptonightR_build_mutex); + { CryptonightR_cache_mutex.ReadLock(); - // Check if the cache already has this program (some other thread might have added it first) - for (const CacheEntry& entry : CryptonightR_cache) - { - if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) - { - ptx = entry.ptx; - lowered_name = entry.lowered_name; + // Check if the cache already has this program (some other thread might have added it first) + for(const CacheEntry& entry : CryptonightR_cache) + { + if((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) + { + ptx = entry.ptx; + lowered_name = entry.lowered_name; CryptonightR_cache_mutex.UnLock(); - return; - } - } + return; + } + } CryptonightR_cache_mutex.UnLock(); - } - - nvrtcProgram prog; - nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.curt", 0, NULL, NULL); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcCreateProgram failed: %s", nvrtcGetErrorString(result)); - return; - } - - result = nvrtcAddNameExpression(prog, "CryptonightR_phase2"); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcAddNameExpression failed: %s", nvrtcGetErrorString(result)); - nvrtcDestroyProgram(&prog); - return; - } - - char opt0[64]; - sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor); - - char opt1[64]; - sprintf(opt1, "-DALGO=%d", static_cast(algo.Id())); - - const char* opts[2] = { opt0, 
opt1 }; - - result = nvrtcCompileProgram(prog, 2, opts); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcCompileProgram failed: %s", nvrtcGetErrorString(result)); - - size_t logSize; - if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) { - char *log = new char[logSize]; - if (nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "Program compile log: %s", log); - } - delete[]log; - } - nvrtcDestroyProgram(&prog); - return; - } - - - const char* name; - result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcGetLoweredName failed: %s", nvrtcGetErrorString(result)); - nvrtcDestroyProgram(&prog); - return; - } - - size_t ptxSize; - result = nvrtcGetPTXSize(prog, &ptxSize); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcGetPTXSize failed: %s", nvrtcGetErrorString(result)); - nvrtcDestroyProgram(&prog); - return; - } - - ptx.resize(ptxSize); - result = nvrtcGetPTX(prog, ptx.data()); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcGetPTX failed: %s", nvrtcGetErrorString(result)); - nvrtcDestroyProgram(&prog); - return; - } - - lowered_name = name; - - nvrtcDestroyProgram(&prog); - - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height); + } + + nvrtcProgram prog; + nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.curt", 0, NULL, NULL); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcCreateProgram failed: %s", nvrtcGetErrorString(result)); + return; + } + + result = nvrtcAddNameExpression(prog, "CryptonightR_phase2"); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcAddNameExpression failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + char opt0[64]; + sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor); + + 
char opt1[64]; + sprintf(opt1, "-DALGO=%d", static_cast(algo.Id())); + + const char* opts[2] = {opt0, opt1}; + + result = nvrtcCompileProgram(prog, 2, opts); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcCompileProgram failed: %s", nvrtcGetErrorString(result)); + + size_t logSize; + if(nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) + { + char* log = new char[logSize]; + if(nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "Program compile log: %s", log); + } + delete[] log; + } + nvrtcDestroyProgram(&prog); + return; + } + + const char* name; + result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcGetLoweredName failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + size_t ptxSize; + result = nvrtcGetPTXSize(prog, &ptxSize); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcGetPTXSize failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + ptx.resize(ptxSize); + result = nvrtcGetPTX(prog, ptx.data()); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcGetPTX failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + lowered_name = name; + + nvrtcDestroyProgram(&prog); + + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height); CryptonightR_cache_mutex.WriteLock(); CryptonightR_cache.emplace_back(algo, height, arch_major, arch_minor, ptx, lowered_name); @@ -276,62 +288,63 @@ static void CryptonightR_build_program( void CryptonightR_get_program(std::vector& ptx, std::string& lowered_name, const xmrstak_algo algo, uint64_t height, uint32_t precompile_count, int arch_major, int arch_minor, bool background) { - if (background) { - background_exec([=]() { std::vector tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, 
precompile_count, arch_major, arch_minor, false); }); - return; - } - - ptx.clear(); - - const char* source_code_template = - #include "nvcc_code/cuda_cryptonight_r.curt" - ; - const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; - const char* offset = strstr(source_code_template, include_name); - if (!offset) - { - printer::inst()->print_msg(L0, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cuda_cryptonight_r.curt"); - return; - } - - V4_Instruction code[256]; - int code_size; - switch (algo.Id()) - { - case cryptonight_r_wow: - code_size = v4_random_math_init(code, height); - break; - case cryptonight_r: - code_size = v4_random_math_init(code, height); - break; - printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo); - return; - } - - std::string source_code(source_code_template, offset); - source_code.append(get_code(code, code_size)); - source_code.append(offset + sizeof(include_name) - 1); - - { + if(background) + { + background_exec([=]() { std::vector tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, precompile_count, arch_major, arch_minor, false); }); + return; + } + + ptx.clear(); + + const char* source_code_template = +#include "nvcc_code/cuda_cryptonight_r.curt" + ; + const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; + const char* offset = strstr(source_code_template, include_name); + if(!offset) + { + printer::inst()->print_msg(L0, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cuda_cryptonight_r.curt"); + return; + } + + V4_Instruction code[256]; + int code_size; + switch(algo.Id()) + { + case cryptonight_r_wow: + code_size = v4_random_math_init(code, height); + break; + case cryptonight_r: + code_size = v4_random_math_init(code, height); + break; + printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo); + return; + } + + std::string source_code(source_code_template, offset); + source_code.append(get_code(code, 
code_size)); + source_code.append(offset + sizeof(include_name) - 1); + + { CryptonightR_cache_mutex.ReadLock(); - // Check if the cache has this program - for (const CacheEntry& entry : CryptonightR_cache) - { - if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height); - ptx = entry.ptx; - lowered_name = entry.lowered_name; + // Check if the cache has this program + for(const CacheEntry& entry : CryptonightR_cache) + { + if((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height); + ptx = entry.ptx; + lowered_name = entry.lowered_name; CryptonightR_cache_mutex.UnLock(); - return; - } - } + return; + } + } CryptonightR_cache_mutex.UnLock(); - } + } - CryptonightR_build_program(ptx, lowered_name, algo, height, precompile_count, arch_major, arch_minor, source_code); + CryptonightR_build_program(ptx, lowered_name, algo, height, precompile_count, arch_major, arch_minor, source_code); } +} // namespace nvidia } // namespace xmrstak -} //namespace nvidia diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp index c3d8827b0..30abf2e59 100644 --- a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp +++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp @@ -19,9 +19,8 @@ #include "xmrstak/backend/cryptonight.hpp" #include -#include #include - +#include namespace xmrstak { @@ -29,9 +28,7 @@ namespace nvidia { void CryptonightR_get_program(std::vector& ptx, std::string& lowered_name, - const xmrstak_algo algo, uint64_t height, uint32_t precompile_count, int arch_major, int arch_minor, bool background = false); - + const xmrstak_algo algo, uint64_t height, uint32_t precompile_count, int 
arch_major, int arch_minor, bool background = false); +} // namespace nvidia } // namespace xmrstak -} //namespace nvidia - diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp index 2755e03d2..bf195f768 100644 --- a/xmrstak/backend/nvidia/autoAdjust.hpp +++ b/xmrstak/backend/nvidia/autoAdjust.hpp @@ -3,17 +3,16 @@ #include "autoAdjust.hpp" -#include "nvcc_code/cryptonight.hpp" #include "jconf.hpp" -#include "xmrstak/misc/console.hpp" +#include "nvcc_code/cryptonight.hpp" #include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/misc/console.hpp" #include "xmrstak/params.hpp" -#include #include #include #include - +#include namespace xmrstak { @@ -22,11 +21,9 @@ namespace nvidia class autoAdjust { -public: - + public: autoAdjust() { - } /** print the adjusted values if needed @@ -63,45 +60,69 @@ class autoAdjust nvidCtxVec.push_back(ctx); else printer::inst()->print_msg(L0, "WARNING: NVIDIA setup failed for GPU %d.\n", i); - } generateThreadConfig(); return true; - } -private: - + private: void generateThreadConfig() { // load the template of the backend config into a char variable - const char *tpl = - #include "./config.tpl" - ; + const char* tpl = +#include "./config.tpl" + ; configEditor configTpl{}; - configTpl.set( std::string(tpl) ); + configTpl.set(std::string(tpl)); constexpr size_t byte2mib = 1024u * 1024u; std::string conf; for(auto& ctx : nvidCtxVec) { + std::string enabledGpus = params::inst().nvidiaGpus; + bool enabled = true; + if (!enabledGpus.empty()) + { + enabled = false; + std::stringstream ss(enabledGpus); + + int i = -1; + while (ss >> i) + { + if (i == ctx.device_id) + { + enabled = true; + break; + } + + while (ss.peek() == ',' || ss.peek() == ' ') + ss.ignore(); + } + } + if(ctx.device_threads * ctx.device_blocks > 0) { + if (!enabled) + conf += "/* Disabled\n"; + conf += std::string(" // gpu: ") + ctx.name + " architecture: " + std::to_string(ctx.device_arch[0] * 10 + ctx.device_arch[1]) + "\n"; - 
conf += std::string(" // memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/" + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n"; + conf += std::string(" // memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/" + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n"; conf += std::string(" // smx: ") + std::to_string(ctx.device_mpcount) + "\n"; conf += std::string(" { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" + - " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + - " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + - " \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" + - " \"mem_mode\" : 1,\n" + - " },\n"; + " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + + " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + + " \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" + + " \"mem_mode\" : 1,\n" + + " },\n"; + + if (!enabled) + conf += "*/\n"; } } - configTpl.replace("GPUCONFIG",conf); + configTpl.replace("GPUCONFIG", conf); configTpl.write(params::inst().configFileNVIDIA); printer::inst()->print_msg(L0, "NVIDIA: GPU configuration stored in file '%s'", params::inst().configFileNVIDIA.c_str()); } diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp index 6c443343b..1cd113c4d 100644 --- a/xmrstak/backend/nvidia/jconf.cpp +++ b/xmrstak/backend/nvidia/jconf.cpp @@ -22,8 +22,8 @@ */ #include "jconf.hpp" -#include "xmrstak/misc/jext.hpp" #include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/jext.hpp" #include #include @@ -36,7 +36,6 @@ #include #endif - namespace xmrstak { namespace nvidia @@ -47,9 +46,13 @@ using namespace rapidjson; /* * This enum needs to match index in oConfigValues, otherwise we will 
get a runtime error */ -enum configEnum { aGpuThreadsConf }; +enum configEnum +{ + aGpuThreadsConf +}; -struct configVal { +struct configVal +{ configEnum iName; const char* sName; Type iType; @@ -58,8 +61,7 @@ struct configVal { // Same order as in configEnum, as per comment above // kNullType means any type configVal oConfigValues[] = { - { aGpuThreadsConf, "gpu_threads_conf", kNullType } -}; + {aGpuThreadsConf, "gpu_threads_conf", kNullType}}; inline bool checkType(Type have, Type want) { @@ -75,9 +77,7 @@ inline bool checkType(Type have, Type want) return false; } -constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); - - +constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0])); struct jconf::opaque_private { @@ -89,7 +89,6 @@ struct jconf::opaque_private } }; - bool jconf::NeedsAutoconf() { return !prv->configValues[aGpuThreadsConf]->IsArray(); @@ -110,7 +109,7 @@ size_t jconf::GetGPUThreadCount() return 0; } -bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) +bool jconf::GetGPUThreadConfig(size_t id, thd_cfg& cfg) { if(!prv->configValues[aGpuThreadsConf]->IsArray()) return false; @@ -170,7 +169,6 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) return false; } - cfg.id = gid->GetInt(); cfg.blocks = blocks->GetInt(); cfg.threads = threads->GetInt(); @@ -178,7 +176,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) cfg.bsleep = bsleep->GetInt(); cfg.syncMode = syncMode->GetInt(); cfg.memMode = memMode->GetInt(); - + if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt(); else @@ -189,22 +187,22 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) bool jconf::parse_config(const char* sFilename) { - FILE * pFile; - char * buffer; + FILE* pFile; + char* buffer; size_t flen; pFile = fopen(sFilename, "rb"); - if (pFile == NULL) + if(pFile == NULL) { printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); return false; } - fseek(pFile,0,SEEK_END); + fseek(pFile, 0, 
SEEK_END); flen = ftell(pFile); rewind(pFile); - if(flen >= 64*1024) + if(flen >= 64 * 1024) { fclose(pFile); printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); @@ -219,7 +217,7 @@ bool jconf::parse_config(const char* sFilename) } buffer = (char*)malloc(flen + 3); - if(fread(buffer+1, flen, 1, pFile) != 1) + if(fread(buffer + 1, flen, 1, pFile) != 1) { free(buffer); fclose(pFile); @@ -241,7 +239,7 @@ bool jconf::parse_config(const char* sFilename) buffer[flen] = '}'; buffer[flen + 1] = '\0'; - prv->jsonDoc.Parse(buffer, flen+2); + prv->jsonDoc.Parse(buffer, flen + 2); free(buffer); if(prv->jsonDoc.HasParseError()) @@ -251,7 +249,6 @@ bool jconf::parse_config(const char* sFilename) return false; } - if(!prv->jsonDoc.IsObject()) { //This should never happen as we created the root ourselves printer::inst()->print_msg(L0, "Invalid config file '%s'. No root?", sFilename); @@ -262,7 +259,7 @@ bool jconf::parse_config(const char* sFilename) { if(oConfigValues[i].iName != i) { - printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order. %s",oConfigValues[i].sName); + printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order. 
%s", oConfigValues[i].sName); return false; } diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp index 40b72f880..e924c75a9 100644 --- a/xmrstak/backend/nvidia/jconf.hpp +++ b/xmrstak/backend/nvidia/jconf.hpp @@ -1,7 +1,7 @@ #pragma once +#include "xmrstak/params.hpp" #include #include -#include "xmrstak/params.hpp" namespace xmrstak { @@ -10,16 +10,18 @@ namespace nvidia class jconf { -public: + public: static jconf* inst() { - if (oInst == nullptr) oInst = new jconf; + if(oInst == nullptr) + oInst = new jconf; return oInst; }; bool parse_config(const char* sFilename = params::inst().configFileNVIDIA.c_str()); - struct thd_cfg { + struct thd_cfg + { uint32_t id; uint32_t blocks; uint32_t threads; @@ -36,17 +38,16 @@ class jconf size_t GetGPUThreadCount(); - bool GetGPUThreadConfig(size_t id, thd_cfg &cfg); + bool GetGPUThreadConfig(size_t id, thd_cfg& cfg); bool NeedsAutoconf(); -private: + private: jconf(); static jconf* oInst; struct opaque_private; opaque_private* prv; - }; } // namespace nvidia diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 80615d7a3..c65bba162 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -23,23 +23,23 @@ #include "minethd.hpp" #include "autoAdjust.hpp" -#include "xmrstak/misc/console.hpp" -#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" #include "xmrstak/backend/cpu/crypto/cryptonight.h" +#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" +#include "xmrstak/backend/cpu/hwlocMemory.hpp" #include "xmrstak/backend/cpu/minethd.hpp" -#include "xmrstak/params.hpp" -#include "xmrstak/misc/executor.hpp" +#include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/console.hpp" #include "xmrstak/misc/environment.hpp" -#include "xmrstak/backend/cpu/hwlocMemory.hpp" -#include "xmrstak/backend/cryptonight.hpp" +#include "xmrstak/misc/executor.hpp" #include "xmrstak/misc/utility.hpp" 
+#include "xmrstak/params.hpp" #include -#include +#include #include +#include #include -#include #include #ifndef USE_PRECOMPILED_HEADERS @@ -47,8 +47,8 @@ #include #include #else -#include #include +#include #endif #include #endif @@ -59,9 +59,9 @@ namespace nvidia { #ifdef WIN32 - HINSTANCE lib_handle; +HINSTANCE lib_handle; #else - void *lib_handle; +void* lib_handle; #endif minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) @@ -70,6 +70,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) oWork = pWork; bQuit = 0; iThreadNo = (uint8_t)iNo; + this->iGpuIndex = cfg.id; iJobNo = 0; ctx.device_id = (int)cfg.id; @@ -81,6 +82,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) ctx.memMode = cfg.memMode; this->affinity = cfg.cpu_aff; + std::unique_lock lck(thd_aff_set); std::future numa_guard = numa_promise.get_future(); thread_work_guard = thread_work_promise.get_future(); @@ -91,33 +93,32 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) * without concurrent threads (CUDA driver is less occupied). 
*/ numa_guard.wait(); -} -void minethd::start_mining() -{ - thread_work_promise.set_value(); if(this->affinity >= 0) //-1 means no affinity if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity)) printer::inst()->print_msg(L1, "WARNING setting affinity failed."); } +void minethd::start_mining() +{ + thread_work_promise.set_value(); +} bool minethd::self_test() { return true; } - extern "C" { #ifdef WIN32 -__declspec(dllexport) + __declspec(dllexport) #endif -std::vector* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env) -{ - environment::inst(&env); - return nvidia::minethd::thread_starter(threadOffset, pWork); -} + std::vector* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env) + { + environment::inst(&env); + return nvidia::minethd::thread_starter(threadOffset, pWork); + } } // extern "C" std::vector* minethd::thread_starter(uint32_t threadOffset, miner_work& pWork) @@ -141,12 +142,12 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor int deviceCount = 0; if(cuda_get_devicecount(&deviceCount) != 1) { - std::cout<<"WARNING: NVIDIA no device found"<GetGPUThreadCount(); @@ -155,7 +156,7 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor cuInit(0); jconf::thd_cfg cfg; - for (i = 0; i < n; i++) + for(i = 0; i < n; i++) { jconf::inst()->GetGPUThreadConfig(i, cfg); @@ -172,10 +173,9 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor minethd* thd = new minethd(pWork, i + threadOffset, cfg); pvThreads->push_back(thd); - } - for (i = 0; i < n; i++) + for(i = 0; i < n; i++) { static_cast((*pvThreads)[i])->start_mining(); } @@ -196,12 +196,12 @@ void minethd::work_main() // numa memory bind and gpu memory is initialized numa_promise.set_value(); - + std::unique_lock lck(thd_aff_set); + lck.unlock(); std::this_thread::yield(); // wait until all NVIDIA devices are initialized thread_work_guard.wait(); - uint64_t iCount = 0; 
cryptonight_ctx* cpu_ctx; cpu_ctx = cpu::minethd::minethd_alloc_ctx(); @@ -216,16 +216,16 @@ void minethd::work_main() uint8_t version = 0; size_t lastPoolId = 0; - while (bQuit == 0) + while(bQuit == 0) { - if (oWork.bStall) + if(oWork.bStall) { /* We are stalled here because the executor didn't find a job for us yet, * either because of network latency, or a socket problem. Since we are * raison d'etre of this software it us sensible to just wait until we have something */ - while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) std::this_thread::sleep_for(std::chrono::milliseconds(100)); globalStates::inst().consume_work(oWork, iJobNo); @@ -285,8 +285,8 @@ void minethd::work_main() for(size_t i = 0; i < foundCount; i++) { - uint8_t bWorkBlob[128]; - uint8_t bResult[32]; + uint8_t bWorkBlob[128]; + uint8_t bResult[32]; memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); memset(bResult, 0, sizeof(job_result::bResult)); @@ -294,19 +294,14 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = foundNonce[i]; cpu_ctx->hash_fn(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo); - if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) + if((*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else executor::inst()->push_event(ex_event("NVIDIA Invalid Result", ctx.device_id, oWork.iPoolId)); } - iCount += h_per_round; iNonce += h_per_round; - - using namespace std::chrono; - uint64_t iStamp = get_timestamp_ms(); - iHashCount.store(iCount, std::memory_order_relaxed); - iTimestamp.store(iStamp, std::memory_order_relaxed); + updateStats(h_per_round, oWork.iPoolId); std::this_thread::yield(); } @@ -314,5 +309,5 @@ void minethd::work_main() } } +} // namespace nvidia } // namespace xmrstak -} //namespace nvidia diff --git 
a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp index 3863c93e8..bbbc7b6ee 100644 --- a/xmrstak/backend/nvidia/minethd.hpp +++ b/xmrstak/backend/nvidia/minethd.hpp @@ -1,19 +1,18 @@ #pragma once -#include "xmrstak/jconf.hpp" #include "jconf.hpp" #include "nvcc_code/cryptonight.hpp" +#include "xmrstak/jconf.hpp" #include "xmrstak/backend/cpu/minethd.hpp" #include "xmrstak/backend/iBackend.hpp" #include "xmrstak/misc/environment.hpp" +#include +#include #include #include -#include #include -#include - namespace xmrstak { @@ -22,12 +21,11 @@ namespace nvidia class minethd : public iBackend { -public: - + public: static std::vector* thread_starter(uint32_t threadOffset, miner_work& pWork); static bool self_test(); -private: + private: typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg); @@ -44,6 +42,7 @@ class minethd : public iBackend std::promise numa_promise; std::promise thread_work_promise; + std::mutex thd_aff_set; // block thread until all NVIDIA GPUs are initialized std::future thread_work_guard; diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp index 906701893..29e29d12c 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -3,35 +3,37 @@ #include #include -#include "xmrstak/jconf.hpp" #include "xmrstak/backend/cryptonight.hpp" +#include "xmrstak/jconf.hpp" #include -typedef struct { +typedef struct +{ int device_id; - const char *device_name; + const char* device_name; int device_arch[2]; int device_mpcount; int device_blocks; int device_threads; int device_bfactor; int device_bsleep; + int device_maxThreadsPerBlock; int syncMode; bool memMode; - uint32_t *d_input; + uint32_t* d_input; uint32_t inputlen; - uint32_t *d_result_count; - uint32_t *d_result_nonce; - uint32_t *d_long_state; - uint32_t 
*d_ctx_state; - uint32_t *d_ctx_state2; - uint32_t *d_ctx_a; - uint32_t *d_ctx_b; - uint32_t *d_ctx_key1; - uint32_t *d_ctx_key2; - uint32_t *d_ctx_text; + uint32_t* d_result_count; + uint32_t* d_result_nonce; + uint32_t* d_long_state; + uint32_t* d_ctx_state; + uint32_t* d_ctx_state2; + uint32_t* d_ctx_a; + uint32_t* d_ctx_b; + uint32_t* d_ctx_key1; + uint32_t* d_ctx_key2; + uint32_t* d_ctx_text; std::string name; size_t free_device_memory; size_t total_device_memory; @@ -43,19 +45,20 @@ typedef struct { xmrstak_algo cached_algo = {xmrstak_algo_id::invalid_algo}; } nvid_ctx; -extern "C" { +extern "C" +{ -/** get device count + /** get device count * * @param deviceCount[out] cuda device count * @return error code: 0 == error is occurred, 1 == no error */ -int cuda_get_devicecount( int* deviceCount); -int cuda_get_deviceinfo(nvid_ctx *ctx); -int cryptonight_extra_cpu_init(nvid_ctx *ctx); -void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len); -void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo); -void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo); + int cuda_get_devicecount(int* deviceCount); + int cuda_get_deviceinfo(nvid_ctx* ctx); + int cryptonight_extra_cpu_init(nvid_ctx* ctx); + void cryptonight_extra_cpu_set_data(nvid_ctx* ctx, const void* data, uint32_t len); + void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo); + void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t* resnonce, const xmrstak_algo& miner_algo); } void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, uint32_t startNonce, uint64_t chain_height); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp index 
199025635..385afb9ec 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp @@ -3,8 +3,270 @@ #include -#define N_COLS 4 -#define WPOLY 0x011b +#include "cuda_extra.hpp" + +#define N_COLS 4 +#define WPOLY 0x011b + +static __constant__ uint32_t d_t_fn256[256 * 32] = +{ + 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, 0xa56363c6U, + 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, 0x847c7cf8U, + 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, 0x997777eeU, + 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, 0x8d7b7bf6U, + 
0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, 0xdf2f2ffU, + 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, 0xbd6b6bd6U, + 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, 0xb16f6fdeU, + 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, 0x54c5c591U, + 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 
0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, 0x50303060U, + 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, 0x3010102U, + 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, 0xa96767ceU, + 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, 0x7d2b2b56U, + 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, 0x19fefee7U, + 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 
0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, 0x62d7d7b5U, + 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, 0xe6abab4dU, + 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, 0x9a7676ecU, + 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, 0x45caca8fU, + 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, 0x9d82821fU, + 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 
0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, 0x40c9c989U, + 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, 0x877d7dfaU, + 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, 0x15fafaefU, + 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, 0xeb5959b2U, + 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, 0xc947478eU, + 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 
0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, 0xbf0f0fbU, + 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, 0xecadad41U, + 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, 0x67d4d4b3U, + 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, 0xfda2a25fU, + 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, 0xeaafaf45U, + 0xbf9c9c23U, 0xbf9c9c23U, 
0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, 0xbf9c9c23U, + 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, 0xf7a4a453U, + 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, 0x967272e4U, + 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, 0x5bc0c09bU, + 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 
0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, 0xc2b7b775U, + 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, 0x1cfdfde1U, + 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, 0xae93933dU, + 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, 0x6a26264cU, + 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, 0x5a36366cU, + 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 
0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, 0x413f3f7eU, + 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, 0x2f7f7f5U, + 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, 0x4fcccc83U, + 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, 0x5c343468U, + 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, 0xf4a5a551U, + 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 
0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, 0x34e5e5d1U, + 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, 0x8f1f1f9U, + 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, 0x937171e2U, + 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, 0x73d8d8abU, + 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, 0x53313162U, + 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 
0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, 0x3f15152aU, + 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, 0xc040408U, + 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, 0x52c7c795U, + 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, 0x65232346U, + 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, 0x5ec3c39dU, + 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 
0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, 0x28181830U, + 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, 0xa1969637U, + 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, 0xf05050aU, + 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, 0xb59a9a2fU, + 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, 0x907070eU, + 0x36121224U, 0x36121224U, 0x36121224U, 
0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, 0x36121224U, + 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, 0x9b80801bU, + 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, 0x3de2e2dfU, + 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, 0x26ebebcdU, + 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, 
0x6927274eU, 0x6927274eU, 0x6927274eU, 0x6927274eU, + 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, 0xcdb2b27fU, + 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, 0x9f7575eaU, + 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, 0x1b090912U, + 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, 0x9e83831dU, + 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 
0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, 0x742c2c58U, + 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, 0x2e1a1a34U, + 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, 0x2d1b1b36U, + 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, 0xb26e6edcU, + 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, 0xee5a5ab4U, + 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 
0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, 0xfba0a05bU, + 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, 0xf65252a4U, + 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, 0x4d3b3b76U, + 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, 0x61d6d6b7U, + 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, 0xceb3b37dU, + 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 
0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, 0x7b292952U, + 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, 0x3ee3e3ddU, + 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, 0x712f2f5eU, + 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, 0x97848413U, + 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, 0xf55353a6U, + 
0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, 0x68d1d1b9U, + 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, + 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, 0x2cededc1U, + 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, 0x60202040U, + 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, 0x1ffcfce3U, + 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 
0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, 0xc8b1b179U, + 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, 0xed5b5bb6U, + 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, 0xbe6a6ad4U, + 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, 0x46cbcb8dU, + 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, 0xd9bebe67U, + 0x4b393972U, 0x4b393972U, 0x4b393972U, 
0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, 0x4b393972U, + 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, 0xde4a4a94U, + 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, 0xd44c4c98U, + 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, 0xe85858b0U, + 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 
0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, 0x4acfcf85U, + 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, 0x6bd0d0bbU, + 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, 0x2aefefc5U, + 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, 0xe5aaaa4fU, + 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, 0x16fbfbedU, + 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 
0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, 0xc5434386U, + 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, 0xd74d4d9aU, + 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, 0x55333366U, + 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, 0x94858511U, + 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, 0xcf45458aU, + 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 
0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, 0x10f9f9e9U, + 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, 0x6020204U, + 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, 0x817f7ffeU, + 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, 0xf05050a0U, + 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, 0x443c3c78U, + 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 
0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, 0xba9f9f25U, + 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, 0xe3a8a84bU, + 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, 0xf35151a2U, + 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, 0xfea3a35dU, + 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, 0xc0404080U, + 0x8a8f8f05U, 0x8a8f8f05U, 
0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, 0x8a8f8f05U, + 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, 0xad92923fU, + 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, 0xbc9d9d21U, + 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, 0x48383870U, + 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, 
0x4f5f5f1U, 0x4f5f5f1U, 0x4f5f5f1U, + 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, 0xdfbcbc63U, + 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, 0xc1b6b677U, + 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, 0x75dadaafU, + 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, 0x63212142U, + 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 
0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, 0x30101020U, + 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, 0x1affffe5U, + 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, 0xef3f3fdU, + 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, 0x6dd2d2bfU, + 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, 0x4ccdcd81U, + 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 
0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, 0x140c0c18U, + 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, 0x35131326U, + 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, 0x2fececc3U, + 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, 0xe15f5fbeU, + 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, 0xa2979735U, + 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 
0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, 0xcc444488U, + 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, 0x3917172eU, + 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, 0x57c4c493U, + 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, 0xf2a7a755U, + 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, 0x827e7efcU, + 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 
0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, 0x473d3d7aU, + 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, 0xac6464c8U, + 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, 0xe75d5dbaU, + 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, 0x2b191932U, + 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, 
0x957373e6U, 0x957373e6U, 0x957373e6U, 0x957373e6U, + 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, 0xa06060c0U, + 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, 0x98818119U, + 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, 0xd14f4f9eU, + 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, 0x7fdcdca3U, + 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 
0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, 0x66222244U, + 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, 0x7e2a2a54U, + 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, 0xab90903bU, + 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, 0x8388880bU, + 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, 0xca46468cU, + 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 
0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, 0x29eeeec7U, + 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, 0xd3b8b86bU, + 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, 0x3c141428U, + 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, 0x79dedea7U, + 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, 0xe25e5ebcU, + 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 
0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, 0x1d0b0b16U, + 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, 0x76dbdbadU, + 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, 0x3be0e0dbU, + 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, 0x56323264U, + 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, 0x4e3a3a74U, + 
0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, 0x1e0a0a14U, + 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, 0xdb494992U, + 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, 0xa06060cU, + 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, 0x6c242448U, + 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 
0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, 0xe45c5cb8U, + 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, 0x5dc2c29fU, + 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, 0x6ed3d3bdU, + 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, 0xefacac43U, + 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, 0xa66262c4U, + 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 
0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, 0xa8919139U, + 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, 0xa4959531U, + 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, 0x37e4e4d3U, + 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, 0x8b7979f2U, + 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, 0x32e7e7d5U, + 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 
0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, 0x43c8c88bU, + 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, 0x5937376eU, + 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, 0xb76d6ddaU, + 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, 0x8c8d8d01U, + 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, 0x64d5d5b1U, + 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 
0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, 0xd24e4e9cU, + 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, 0xe0a9a949U, + 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, 0xb46c6cd8U, + 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, 0xfa5656acU, + 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, 0x7f4f4f3U, + 0x25eaeacfU, 
0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, 0x25eaeacfU, + 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, 0xaf6565caU, + 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, 0x8e7a7af4U, + 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, 0xe9aeae47U, + 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 
0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, 0x18080810U, + 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, 0xd5baba6fU, + 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, 0x887878f0U, + 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, 0x6f25254aU, + 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, 0x722e2e5cU, + 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 
0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, 0x241c1c38U, + 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, 0xf1a6a657U, + 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, 0xc7b4b473U, + 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, 0x51c6c697U, + 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, 0x23e8e8cbU, + 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 
0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, 0x7cdddda1U, + 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, 0x9c7474e8U, + 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, 0x211f1f3eU, + 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, 0xdd4b4b96U, + 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, 0xdcbdbd61U, + 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 
0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, 0x868b8b0dU, + 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, 0x858a8a0fU, + 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, 0x907070e0U, + 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, 0x423e3e7cU, + 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 0xc4b5b571U, 
0xc4b5b571U, 0xc4b5b571U, + 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, 0xaa6666ccU, + 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, 0xd8484890U, + 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, 0x5030306U, + 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, 0x1f6f6f7U, + 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 
0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, 0x120e0e1cU, + 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, 0xa36161c2U, + 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, 0x5f35356aU, + 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, 0xf95757aeU, + 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, 0xd0b9b969U, + 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 
0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, 0x91868617U, + 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, 0x58c1c199U, + 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, 0x271d1d3aU, + 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, 0xb99e9e27U, + 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, 0x38e1e1d9U, + 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 
0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, 0x13f8f8ebU, + 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, 0xb398982bU, + 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, 0x33111122U, + 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, 0xbb6969d2U, + 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, 0x70d9d9a9U, + 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 
0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, 0x898e8e07U, + 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, 0xa7949433U, + 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, 0xb69b9b2dU, + 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, 0x221e1e3cU, + 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, 0x92878715U, + 
0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, 0x20e9e9c9U, + 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, 0x49cece87U, + 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, 0xff5555aaU, + 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, 0x78282850U, + 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 
0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, 0x7adfdfa5U, + 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, 0x8f8c8c03U, + 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, 0xf8a1a159U, + 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, 0x80898909U, + 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, 0x170d0d1aU, + 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 
0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, 0xdabfbf65U, + 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, 0x31e6e6d7U, + 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, 0xc6424284U, + 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, 0xb86868d0U, + 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, 0xc3414182U, + 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 
0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, 0xb0999929U, + 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, 0x772d2d5aU, + 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, 0x110f0f1eU, + 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, 0xcbb0b07bU, + 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, 0xfc5454a8U, + 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 
0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, 0xd6bbbb6dU, +0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU, 0x3a16162cU +}; static __constant__ uint32_t d_t_fn[1024] = { @@ -263,27 +525,45 @@ static __constant__ uint32_t d_t_fn[1024] = 0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU, 0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U, 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU, - 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U -}; + 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U}; + +#define t_fn32(x) (sharedMemory[(x) * 32]) -#define t_fn0(x) (sharedMemory[ (x)]) +#define t_fn0(x) (sharedMemory[(x)]) #define t_fn1(x) (sharedMemory[256 + (x)]) #define t_fn2(x) (sharedMemory[512 + (x)]) #define t_fn3(x) (sharedMemory[768 + (x)]) +#define round(dummy, y, x, k) \ + y[0] = (k)[0] ^ t_fn0(BYTE_0(x[0])) ^ t_fn1(BYTE_1(x[1])) ^ t_fn2(BYTE_2(x[2])) ^ t_fn3(BYTE_3(x[3])); \ + y[1] = (k)[1] ^ t_fn0(BYTE_0(x[1])) ^ t_fn1(BYTE_1(x[2])) ^ t_fn2(BYTE_2(x[3])) ^ t_fn3(BYTE_3(x[0])); \ + y[2] = (k)[2] ^ t_fn0(BYTE_0(x[2])) ^ t_fn1(BYTE_1(x[3])) ^ t_fn2(BYTE_2(x[0])) ^ t_fn3(BYTE_3(x[1])); \ + y[3] = (k)[3] ^ t_fn0(BYTE_0(x[3])) ^ t_fn1(BYTE_1(x[0])) ^ t_fn2(BYTE_2(x[1])) ^ t_fn3(BYTE_3(x[2])); -#define round(dummy,y,x,k) \ - y[0] = (k)[0] ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) 
^ t_fn3((x[3] >> 24))); \ - y[1] = (k)[1] ^ (t_fn0(x[1] & 0xff) ^ t_fn1((x[2] >> 8) & 0xff) ^ t_fn2((x[3] >> 16) & 0xff) ^ t_fn3((x[0] >> 24))); \ - y[2] = (k)[2] ^ (t_fn0(x[2] & 0xff) ^ t_fn1((x[3] >> 8) & 0xff) ^ t_fn2((x[0] >> 16) & 0xff) ^ t_fn3((x[1] >> 24))); \ - y[3] = (k)[3] ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24) )); -__device__ __forceinline__ static void cn_aes_single_round(uint32_t * __restrict__ sharedMemory, const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t * __restrict__ expandedKey) +__device__ __forceinline__ static uint4 round32(const uint32_t* __restrict__ sharedMemory, const uint4& b, uint4 a) +{ \ + a.x ^= t_fn32(BYTE_0(b.x)); + a.y ^= t_fn32(BYTE_0(b.y)); + a.z ^= t_fn32(BYTE_0(b.z)); + a.w ^= t_fn32(BYTE_0(b.w)); + a.x ^= ROTL32_8(t_fn32(BYTE_1(b.y))); + a.y ^= ROTL32_8(t_fn32(BYTE_1(b.z))); + a.z ^= ROTL32_8(t_fn32(BYTE_1(b.w))); + a.w ^= ROTL32_8(t_fn32(BYTE_1(b.x))); + a.x ^= ROTL32_16(t_fn32(BYTE_2(b.z))) ^ ROTL32_24(t_fn32(BYTE_3(b.w))); + a.y ^= ROTL32_16(t_fn32(BYTE_2(b.w))) ^ ROTL32_24(t_fn32(BYTE_3(b.x))); + a.z ^= ROTL32_16(t_fn32(BYTE_2(b.x))) ^ ROTL32_24(t_fn32(BYTE_3(b.y))); + a.w ^= ROTL32_16(t_fn32(BYTE_2(b.y))) ^ ROTL32_24(t_fn32(BYTE_3(b.z))); + return a; +} + +__device__ __forceinline__ static void cn_aes_single_round(uint32_t* __restrict__ sharedMemory, const uint32_t* __restrict__ in, uint32_t* __restrict__ out, const uint32_t* __restrict__ expandedKey) { round(sharedMemory, out, in, expandedKey); } -__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t * __restrict__ sharedMemory, uint32_t * __restrict__ val, const uint32_t * __restrict__ expandedKey) +__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t* __restrict__ sharedMemory, uint32_t* __restrict__ val, const uint32_t* __restrict__ expandedKey) { uint32_t b1[4]; round(sharedMemory, b1, val, expandedKey); @@ -298,14 +578,35 @@ 
__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t * round(sharedMemory, val, b1, expandedKey + 9 * N_COLS); } -__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory) +__device__ __forceinline__ static uint4 cn_aes_pseudo_round_mut32(const uint32_t* __restrict__ sharedMemory, uint4 val, const uint4* __restrict__ expandedKey) +{ + uint4 b1 = round32(sharedMemory, val, *expandedKey); + val = round32(sharedMemory, b1, expandedKey[1]); + b1 = round32(sharedMemory, val, expandedKey[2]); + val = round32(sharedMemory, b1, expandedKey[3]); + b1 = round32(sharedMemory, val, expandedKey[4]); + val = round32(sharedMemory, b1, expandedKey[5]); + b1 = round32(sharedMemory, val, expandedKey[6]); + val = round32(sharedMemory, b1, expandedKey[7]); + b1 = round32(sharedMemory, val, expandedKey[8]); + val = round32(sharedMemory, b1, expandedKey[9]); + return val; +} + +__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t* sharedMemory) { for(int i = threadIdx.x; i < 1024; i += blockDim.x) sharedMemory[i] = d_t_fn[i]; } -__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t *sharedMemory) +__device__ __forceinline__ static void cn_aes_gpu_init32(uint32_t* sharedMemory) { - for(int i = threadIdx.x; i < 512; i += blockDim.x) - sharedMemory[i] = d_t_fn[i]; + for(int i = threadIdx.x; i < 256 * 32; i += blockDim.x) + sharedMemory[i] = d_t_fn256[i]; +} + +__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t* sharedMemory) +{ + for(int i = threadIdx.x; i < 512; i += blockDim.x) + sharedMemory[i] = d_t_fn[i]; } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp index 611fe1c8c..efd57c944 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp @@ -1,64 +1,68 @@ #pragma once -typedef struct { +#include "cuda_extra.hpp" + +typedef struct +{ uint32_t h[8], s[4], t[2]; uint32_t buflen; 
int nullt; uint8_t buf[64]; } blake_state; -#define U8TO32(p) \ +#define U8TO32(p) \ (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ - ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) - -#define U32TO8(p, v) \ - (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ - (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); - -#define BLAKE_ROT(x,n) ROTR32(x, n) -#define BLAKE_G(a,b,c,d,e) \ - v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \ - v[d] = BLAKE_ROT(v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = BLAKE_ROT(v[b] ^ v[c],12); \ - v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]])+v[b]; \ - v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \ - v[c] += v[d]; \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]))) + +#define U32TO8(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); \ + (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); \ + (p)[3] = (uint8_t)((v)); + +#define BLAKE_ROT(x, n) ROTR32(x, n) +#define BLAKE_G(a, b, c, d, e) \ + v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e + 1]]) + v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a], 16); \ + v[c] += v[d]; \ + v[b] = BLAKE_ROT(v[b] ^ v[c], 12); \ + v[a] += (m[d_blake_sigma[i][e + 1]] ^ d_blake_cst[d_blake_sigma[i][e]]) + v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ v[b] = BLAKE_ROT(v[b] ^ v[c], 7); __constant__ uint8_t d_blake_sigma[14][16] = -{ - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, - {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, - {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, - {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, - {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, - 
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, - {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8} -}; -__constant__ uint32_t d_blake_cst[16] -= { + { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}}; +__constant__ uint32_t d_blake_cst[16] = { 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, - 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 -}; + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917}; -__device__ void cn_blake_compress(blake_state * S, const uint8_t * block) +__device__ void cn_blake_compress(blake_state* S, const uint8_t* block) { uint32_t v[16], m[16], i; - for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); - for (i = 0; i < 8; ++i) v[i] = S->h[i]; - v[ 8] = S->s[0] ^ 0x243F6A88; - v[ 9] = S->s[1] ^ 0x85A308D3; + for(i = 0; i < 16; ++i) + m[i] = U8TO32(block + i * 4); + for(i = 0; i < 8; ++i) + v[i] = S->h[i]; + v[8] = S->s[0] ^ 0x243F6A88; + v[9] = S->s[1] ^ 0x85A308D3; v[10] = S->s[2] ^ 0x13198A2E; v[11] = S->s[3] ^ 0x03707344; v[12] = 0xA4093822; @@ -66,7 +70,7 @@ __device__ 
void cn_blake_compress(blake_state * S, const uint8_t * block) v[14] = 0x082EFA98; v[15] = 0xEC4E6C89; - if (S->nullt == 0) + if(S->nullt == 0) { v[12] ^= S->t[0]; v[13] ^= S->t[0]; @@ -74,50 +78,54 @@ __device__ void cn_blake_compress(blake_state * S, const uint8_t * block) v[15] ^= S->t[1]; } - for (i = 0; i < 14; ++i) + for(i = 0; i < 14; ++i) { - BLAKE_G(0, 4, 8, 12, 0); - BLAKE_G(1, 5, 9, 13, 2); - BLAKE_G(2, 6, 10, 14, 4); - BLAKE_G(3, 7, 11, 15, 6); - BLAKE_G(3, 4, 9, 14, 14); - BLAKE_G(2, 7, 8, 13, 12); - BLAKE_G(0, 5, 10, 15, 8); + BLAKE_G(0, 4, 8, 12, 0); + BLAKE_G(1, 5, 9, 13, 2); + BLAKE_G(2, 6, 10, 14, 4); + BLAKE_G(3, 7, 11, 15, 6); + BLAKE_G(3, 4, 9, 14, 14); + BLAKE_G(2, 7, 8, 13, 12); + BLAKE_G(0, 5, 10, 15, 8); BLAKE_G(1, 6, 11, 12, 10); } - for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; - for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; + for(i = 0; i < 16; ++i) + S->h[i % 8] ^= v[i]; + for(i = 0; i < 8; ++i) + S->h[i] ^= S->s[i % 4]; } -__device__ void cn_blake_update(blake_state * S, const uint8_t * data, uint64_t datalen) +__device__ void cn_blake_update(blake_state* S, const uint8_t* data, uint64_t datalen) { uint32_t left = S->buflen >> 3; uint32_t fill = 64 - left; - if (left && (((datalen >> 3) & 0x3F) >= fill)) + if(left && (((datalen >> 3) & 0x3F) >= fill)) { - memcpy((void *) (S->buf + left), (void *) data, fill); + memcpy((void*)(S->buf + left), (void*)data, fill); S->t[0] += 512; - if (S->t[0] == 0) S->t[1]++; + if(S->t[0] == 0) + S->t[1]++; cn_blake_compress(S, S->buf); data += fill; datalen -= (fill << 3); left = 0; } - while (datalen >= 512) + while(datalen >= 512) { S->t[0] += 512; - if (S->t[0] == 0) S->t[1]++; + if(S->t[0] == 0) + S->t[1]++; cn_blake_compress(S, data); data += 64; datalen -= 512; } - if (datalen > 0) + if(datalen > 0) { - memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); + memcpy((void*)(S->buf + left), (void*)data, datalen >> 3); S->buflen = (left << 3) + datalen; } else @@ -126,31 +134,32 @@ 
__device__ void cn_blake_update(blake_state * S, const uint8_t * data, uint64_ } } -__device__ void cn_blake_final(blake_state * S, uint8_t * digest) +__device__ void cn_blake_final(blake_state* S, uint8_t* digest) { const uint8_t padding[] = - { - 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 - }; + { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; uint8_t pa = 0x81, pb = 0x01; uint8_t msglen[8]; uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; - if (lo < (unsigned) S->buflen) hi++; + if(lo < (unsigned)S->buflen) + hi++; U32TO8(msglen + 0, hi); U32TO8(msglen + 4, lo); - if (S->buflen == 440) + if(S->buflen == 440) { S->t[0] -= 8; cn_blake_update(S, &pa, 8); } else { - if (S->buflen < 440) + if(S->buflen < 440) { - if (S->buflen == 0) S->nullt = 1; + if(S->buflen == 0) + S->nullt = 1; S->t[0] -= 440 - S->buflen; cn_blake_update(S, padding, 440 - S->buflen); } @@ -168,9 +177,9 @@ __device__ void cn_blake_final(blake_state * S, uint8_t * digest) S->t[0] -= 64; cn_blake_update(S, msglen, 64); - U32TO8(digest + 0, S->h[0]); - U32TO8(digest + 4, S->h[1]); - U32TO8(digest + 8, S->h[2]); + U32TO8(digest + 0, S->h[0]); + U32TO8(digest + 4, S->h[1]); + U32TO8(digest + 8, S->h[2]); U32TO8(digest + 12, S->h[3]); U32TO8(digest + 16, S->h[4]); U32TO8(digest + 20, S->h[5]); @@ -178,17 +187,22 @@ __device__ void cn_blake_final(blake_state * S, uint8_t * digest) U32TO8(digest + 28, S->h[7]); } -__device__ void cn_blake(const uint8_t * in, uint64_t inlen, uint8_t * out) +__device__ void cn_blake(const uint8_t* in, uint64_t inlen, uint8_t* out) { blake_state bs; - blake_state *S = (blake_state *)&bs; - - S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372; - S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C; - S->h[6] = 
0x1F83D9AB; S->h[7] = 0x5BE0CD19; + blake_state* S = (blake_state*)&bs; + + S->h[0] = 0x6A09E667; + S->h[1] = 0xBB67AE85; + S->h[2] = 0x3C6EF372; + S->h[3] = 0xA54FF53A; + S->h[4] = 0x510E527F; + S->h[5] = 0x9B05688C; + S->h[6] = 0x1F83D9AB; + S->h[7] = 0x5BE0CD19; S->t[0] = S->t[1] = S->buflen = S->nullt = 0; S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; - cn_blake_update(S, (uint8_t *)in, inlen * 8); - cn_blake_final(S, (uint8_t *)out); + cn_blake_update(S, (uint8_t*)in, inlen * 8); + cn_blake_final(S, (uint8_t*)out); } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 718cff0c7..6c769b3e8 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -1,55 +1,55 @@ #include "xmrstak/backend/cryptonight.hpp" -#include -#include -#include +#include #include #include -#include +#include +#include +#include -#include "xmrstak/jconf.hpp" -#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" -#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp" -#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp" #include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp" - +#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp" +#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp" +#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" +#include "xmrstak/jconf.hpp" #ifdef _WIN32 #include extern "C" void compat_usleep(uint64_t waitTime) { - if (waitTime > 0) - { - if (waitTime > 100) - { - // use a waitable timer for larger intervals > 0.1ms - - HANDLE timer; - LARGE_INTEGER ft; - - ft.QuadPart = -10ll * int64_t(waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time - - timer = CreateWaitableTimer(NULL, TRUE, NULL); - SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0); - WaitForSingleObject(timer, INFINITE); - CloseHandle(timer); - } - else - { - // use a 
polling loop for short intervals <= 100ms - - LARGE_INTEGER perfCnt, start, now; - __int64 elapsed; - - QueryPerformanceFrequency(&perfCnt); - QueryPerformanceCounter(&start); - do { - SwitchToThread(); - QueryPerformanceCounter((LARGE_INTEGER*) &now); - elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000); - } while ( elapsed < waitTime ); - } - } + if(waitTime > 0) + { + if(waitTime > 100) + { + // use a waitable timer for larger intervals > 0.1ms + + HANDLE timer; + LARGE_INTEGER ft; + + ft.QuadPart = -10ll * int64_t(waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time + + timer = CreateWaitableTimer(NULL, TRUE, NULL); + SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0); + WaitForSingleObject(timer, INFINITE); + CloseHandle(timer); + } + else + { + // use a polling loop for short intervals <= 100ms + + LARGE_INTEGER perfCnt, start, now; + __int64 elapsed; + + QueryPerformanceFrequency(&perfCnt); + QueryPerformanceCounter(&start); + do + { + SwitchToThread(); + QueryPerformanceCounter((LARGE_INTEGER*)&now); + elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000); + } while(elapsed < waitTime); + } + } } #else #include @@ -60,9 +60,9 @@ extern "C" void compat_usleep(uint64_t waitTime) #endif #include "cryptonight.hpp" -#include "cuda_extra.hpp" #include "cuda_aes.hpp" #include "cuda_device.hpp" +#include "cuda_extra.hpp" /* sm_2X is limited to 2GB due to the small TLB * therefore we never use 64bit indices @@ -73,106 +73,56 @@ typedef uint64_t IndexType; typedef int IndexType; #endif -__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi ) +__device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi) { - product_hi = __umul64hi( multiplier, multiplicand ); - return (multiplier * multiplicand ); -} - -template< typename T > -__device__ 
__forceinline__ T loadGlobal64( T * const addr ) -{ -#if (__CUDA_ARCH__ < 700) - T x; - asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) ); - return x; -#else - return *addr; -#endif -} - -template< typename T > -__device__ __forceinline__ T loadGlobal32( T * const addr ) -{ -#if (__CUDA_ARCH__ < 700) - T x; - asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) ); - return x; -#else - return *addr; -#endif -} - - -template< typename T > -__device__ __forceinline__ void storeGlobal32( T* addr, T const & val ) -{ -#if (__CUDA_ARCH__ < 700) - asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) ); -#else - *addr = val; -#endif -} - -template< typename T > -__device__ __forceinline__ void storeGlobal64( T* addr, T const & val ) -{ -#if (__CUDA_ARCH__ < 700) - asm volatile( "st.global.cg.u64 [%0], %1;" : : "l"( addr ), "l"( val ) ); -#else - *addr = val; -#endif -} - -__device__ __forceinline__ uint32_t rotate16( const uint32_t n ) -{ - return (n >> 16u) | (n << 16u); + product_hi = __umul64hi(multiplier, multiplicand); + return (multiplier * multiplicand); } __global__ void cryptonight_core_gpu_phase1( - const uint32_t ITERATIONS, const size_t MEMORY, - int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 ) + const uint32_t ITERATIONS, const size_t MEMORY, + int threads, int bfactor, int partidx, uint32_t* __restrict__ long_state, uint32_t* __restrict__ ctx_state2, uint32_t* __restrict__ ctx_key1) { __shared__ uint32_t sharedMemory[1024]; - cn_aes_gpu_init( sharedMemory ); - __syncthreads( ); + cn_aes_gpu_init(sharedMemory); + __syncthreads(); - const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3; - const int sub = ( threadIdx.x & 7 ) << 2; + const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + const int sub = (threadIdx.x & 7) << 2; const int batchsize = MEMORY >> bfactor; const int start = 
partidx * batchsize; const int end = start + batchsize; - if ( thread >= threads ) + if(thread >= threads) return; uint32_t key[40], text[4]; - MEMCPY8( key, ctx_key1 + thread * 40, 20 ); + MEMCPY8(key, ctx_key1 + thread * 40, 20); - if( partidx == 0 ) + if(partidx == 0) { // first round - MEMCPY8( text, ctx_state2 + thread * 50 + sub + 16, 2 ); + MEMCPY8(text, ctx_state2 + thread * 50 + sub + 16, 2); } else { // load previous text data - MEMCPY8( text, &long_state[( (uint64_t) thread * MEMORY ) + sub + start - 32], 2 ); + MEMCPY8(text, &long_state[((uint64_t)thread * MEMORY) + sub + start - 32], 2); } - __syncthreads( ); - for ( int i = start; i < end; i += 32 ) + __syncthreads(); + for(int i = start; i < end; i += 32) { - cn_aes_pseudo_round_mut( sharedMemory, text, key ); - MEMCPY8(&long_state[((uint64_t) thread * MEMORY) + (sub + i)], text, 2); + cn_aes_pseudo_round_mut(sharedMemory, text, key); + MEMCPY8(&long_state[((uint64_t)thread * MEMORY) + (sub + i)], text, 2); } } /** avoid warning `unused parameter` */ -template< typename T > -__forceinline__ __device__ void unusedVar( const T& ) +template +__forceinline__ __device__ void unusedVar(const T&) { } @@ -189,25 +139,25 @@ __forceinline__ __device__ void unusedVar( const T& ) * @param value value to share with other threads within the group * @param src thread number within the group from where the data is read, range [0:group_n] */ -template -__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src) +template +__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr, const uint32_t sub, const int val, const uint32_t src) { -#if( __CUDA_ARCH__ < 300 ) - ptr[sub] = val; - return ptr[src & (group_n-1)]; +#if(__CUDA_ARCH__ < 300) + ptr[sub] = val; + return ptr[src & (group_n - 1)]; #else - unusedVar( ptr ); - unusedVar( sub ); -# if(__CUDACC_VER_MAJOR__ >= 9) - return __shfl_sync(__activemask(), val, src, group_n ); -# else - return __shfl( 
val, src, group_n ); -# endif + unusedVar(ptr); + unusedVar(sub); +#if(__CUDACC_VER_MAJOR__ >= 9) + return __shfl_sync(__activemask(), val, src, group_n); +#else + return __shfl(val, src, group_n); +#endif #endif } -template -__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2) +template +__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr, const uint32_t sub, const int val, const uint32_t src, const uint32_t src2) { uint64_t tmp; ((uint32_t*)&tmp)[0] = shuffle(ptr, sub, val, src); @@ -218,9 +168,9 @@ __forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint3 struct u64 : public uint2 { - __forceinline__ __device__ u64(){} + __forceinline__ __device__ u64() {} - __forceinline__ __device__ u64( const uint32_t x0, const uint32_t x1) + __forceinline__ __device__ u64(const uint32_t x0, const uint32_t x1) { uint2::x = x0; uint2::y = x1; @@ -231,7 +181,7 @@ struct u64 : public uint2 return *((uint64_t*)this); } - __forceinline__ __device__ u64( const uint64_t x0) + __forceinline__ __device__ u64(const uint64_t x0) { ((uint64_t*)&this->x)[0] = x0; } @@ -259,7 +209,7 @@ struct u64 : public uint2 __forceinline__ __device__ void print(int i) const { - if(i<2) + if(i < 2) printf("gpu: %lu\n", ((uint64_t*)&this->x)[0]); } }; @@ -269,42 +219,42 @@ struct u64 : public uint2 * @tparam MEM_MODE if `0` than 64bit memory transfers per thread will be used to store/load data within shared memory * else if `1` 256bit operations will be used */ -template +template #ifdef XMR_STAK_THREADS -__launch_bounds__( XMR_STAK_THREADS * 2 ) +__launch_bounds__(XMR_STAK_THREADS * 2) #endif -__global__ void cryptonight_core_gpu_phase2_double( - const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, - int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, - uint32_t startNonce, uint32_t * 
__restrict__ d_input ) + __global__ void cryptonight_core_gpu_phase2_double( + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int threads, int bfactor, int partidx, uint32_t* d_long_state, uint32_t* d_ctx_a, uint32_t* d_ctx_b, uint32_t* d_ctx_state, + uint32_t startNonce, uint32_t* __restrict__ d_input) { __shared__ uint32_t sharedMemory[512]; - cn_aes_gpu_init_half( sharedMemory ); + cn_aes_gpu_init_half(sharedMemory); -#if( __CUDA_ARCH__ < 300 ) +#if(__CUDA_ARCH__ < 300) extern __shared__ uint64_t externShared[]; // 8 x 64bit values volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8); - volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE); + volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE); #else extern __shared__ uint64_t chunkMem[]; - volatile uint32_t* sPtr = NULL; + volatile uint32_t* sPtr = NULL; // 8 x 64bit values volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem + (threadIdx.x >> 1) * 8); #endif - __syncthreads( ); + __syncthreads(); const uint64_t tid = (blockDim.x * blockIdx.x + threadIdx.x); const uint32_t thread = tid >> 1; const uint32_t sub = tid & 1; - if ( thread >= threads ) + if(thread >= threads) return; - uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY]; + uint8_t* l0 = (uint8_t*)&d_long_state[(IndexType)thread * MEMORY]; uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub]; uint64_t bx0; @@ -324,22 +274,22 @@ __global__ void cryptonight_core_gpu_phase2_double( sqrt_result = (d_ctx_b + thread * 16 + 4 * 2 + 2)[0]; } else - bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub]; + bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub]; - const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + const int batchsize = (ITERATIONS * 2) >> (1 + bfactor); const int start = partidx * batchsize; const int end = start + batchsize; for(int i = start; 
i < end; ++i) { - ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; + ptr0 = (uint64_t*)&l0[idx0 & MASK & 0x1FFFC0]; if(MEM_MODE == 0) { - #pragma unroll 4 +#pragma unroll 4 for(int x = 0; x < 8; x += 2) { - myChunks[x + sub] = ptr0[ x + sub ]; + myChunks[x + sub] = ptr0[x + sub]; } } else @@ -347,52 +297,51 @@ __global__ void cryptonight_core_gpu_phase2_double( uint32_t idx1 = (idx0 & 0x30) >> 3; - const u64 cx = myChunks[ idx1 + sub ]; - const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ]; + const u64 cx = myChunks[idx1 + sub]; + const u64 cx2 = myChunks[idx1 + ((sub + 1) & 1)]; u64 cx_aes = ax0 ^ u64( - t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.x >> 16) & 0xff ) ^ t_fn1( (cx2.y >> 24 ) )), - t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) )) - ); + t_fn0(BYTE_0(cx.x)) ^ t_fn1(BYTE_1(cx.y)) ^ ROTL32_16(t_fn0(BYTE_2(cx2.x)) ^ t_fn1(BYTE_3(cx2.y))), + t_fn0(BYTE_0(cx.y)) ^ t_fn1(BYTE_1(cx2.x)) ^ ROTL32_16(t_fn0(BYTE_2(cx2.y)) ^ t_fn1(BYTE_3(cx.x)))); if(ALGO == cryptonight_monero_v8) { - const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ]; - const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; - const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ]; -#if (__CUDACC_VER_MAJOR__ >= 9) + const uint64_t chunk1 = myChunks[idx1 ^ 2 + sub]; + const uint64_t chunk2 = myChunks[idx1 ^ 4 + sub]; + const uint64_t chunk3 = myChunks[idx1 ^ 6 + sub]; +#if(__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif - myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; - myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; - myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0; } else if(ALGO == cryptonight_v8_reversewaltz) { - const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ]; - const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; - const uint64_t chunk1 = 
myChunks[ idx1 ^ 6 + sub ]; -#if (__CUDACC_VER_MAJOR__ >= 9) + const uint64_t chunk3 = myChunks[idx1 ^ 2 + sub]; + const uint64_t chunk2 = myChunks[idx1 ^ 4 + sub]; + const uint64_t chunk1 = myChunks[idx1 ^ 6 + sub]; +#if(__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif - myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; - myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; - myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0; } - myChunks[ idx1 + sub ] = cx_aes ^ bx0; + myChunks[idx1 + sub] = cx_aes ^ bx0; if(MEM_MODE == 0) { - #pragma unroll 4 +#pragma unroll 4 for(int x = 0; x < 8; x += 2) { - ptr0[ x + sub ] = myChunks[x + sub]; + ptr0[x + sub] = myChunks[x + sub]; } } else @@ -400,14 +349,14 @@ __global__ void cryptonight_core_gpu_phase2_double( idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); idx1 = (idx0 & 0x30) >> 3; - ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; + ptr0 = (uint64_t*)&l0[idx0 & MASK & 0x1FFFC0]; if(MEM_MODE == 0) { - #pragma unroll 4 +#pragma unroll 4 for(int x = 0; x < 8; x += 2) { - myChunks[x + sub] = ptr0[ x + sub ]; + myChunks[x + sub] = ptr0[x + sub]; } } else @@ -417,15 +366,15 @@ __global__ void cryptonight_core_gpu_phase2_double( bx0 = cx_aes; uint64_t cx_mul; - ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0); - ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0); + ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x, 0); + ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y, 0); if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && sub == 1) { // Use division and square root results from the _previous_ iteration to hide the latency ((uint32_t*)&division_result)[1] ^= sqrt_result; - ((uint64_t*)myChunks)[ idx1 ] ^= division_result; + ((uint64_t*)myChunks)[idx1] ^= division_result; const uint32_t dd = (static_cast(cx_mul) + 
(sqrt_result << 1)) | 0x80000001UL; division_result = fast_div_v2(cx_aes, dd); @@ -433,46 +382,46 @@ __global__ void cryptonight_core_gpu_phase2_double( // Use division_result as an input for the square root to prevent parallel implementation in hardware sqrt_result = fast_sqrt_v2(cx_mul + division_result); } -#if (__CUDACC_VER_MAJOR__ >= 9) - __syncwarp(); +#if(__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif - uint64_t c = ((uint64_t*)myChunks)[ idx1 + sub ]; + uint64_t c = ((uint64_t*)myChunks)[idx1 + sub]; { - uint64_t cl = ((uint64_t*)myChunks)[ idx1 ]; + uint64_t cl = ((uint64_t*)myChunks)[idx1]; // sub 0 -> hi, sub 1 -> lo - uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl; + uint64_t res = sub == 0 ? __umul64hi(cx_mul, cl) : cx_mul * cl; if(ALGO == cryptonight_monero_v8) { - const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res; - uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; + const uint64_t chunk1 = myChunks[idx1 ^ 2 + sub] ^ res; + uint64_t chunk2 = myChunks[idx1 ^ 4 + sub]; res ^= ((uint64_t*)&chunk2)[0]; - const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ]; -#if (__CUDACC_VER_MAJOR__ >= 9) + const uint64_t chunk3 = myChunks[idx1 ^ 6 + sub]; +#if(__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif - myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; - myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; - myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0; } if(ALGO == cryptonight_v8_reversewaltz) { - const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ] ^ res; - uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; + const uint64_t chunk3 = myChunks[idx1 ^ 2 + sub] ^ res; + uint64_t chunk2 = myChunks[idx1 ^ 4 + sub]; res ^= ((uint64_t*)&chunk2)[0]; - const uint64_t chunk1 = myChunks[ idx1 ^ 6 + sub ]; -#if (__CUDACC_VER_MAJOR__ >= 9) + const uint64_t chunk1 = 
myChunks[idx1 ^ 6 + sub]; +#if(__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif - myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; - myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; - myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0; } ax0 += res; } @@ -481,13 +430,13 @@ __global__ void cryptonight_core_gpu_phase2_double( bx1 = bx0; bx0 = cx_aes; } - myChunks[ idx1 + sub ] = ax0; + myChunks[idx1 + sub] = ax0; if(MEM_MODE == 0) { - #pragma unroll 4 +#pragma unroll 4 for(int x = 0; x < 8; x += 2) { - ptr0[ x + sub ] = myChunks[x + sub]; + ptr0[x + sub] = myChunks[x + sub]; } } else @@ -496,7 +445,7 @@ __global__ void cryptonight_core_gpu_phase2_double( idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); } - if ( bfactor > 0 ) + if(bfactor > 0) { ((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0; if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) @@ -516,41 +465,41 @@ __global__ void cryptonight_core_gpu_phase2_double( } } -template +template #ifdef XMR_STAK_THREADS -__launch_bounds__( XMR_STAK_THREADS * 4 ) +__launch_bounds__(XMR_STAK_THREADS * 4) #endif -__global__ void cryptonight_core_gpu_phase2_quad( - const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, - int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, - uint32_t startNonce, uint32_t * __restrict__ d_input ) + __global__ void cryptonight_core_gpu_phase2_quad( + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int threads, int bfactor, int partidx, uint32_t* d_long_state, uint32_t* d_ctx_a, uint32_t* d_ctx_b, uint32_t* d_ctx_state, + uint32_t startNonce, uint32_t* __restrict__ d_input) { __shared__ uint32_t sharedMemory[1024]; - cn_aes_gpu_init( sharedMemory ); + cn_aes_gpu_init(sharedMemory); - __syncthreads( ); + 
__syncthreads(); - const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2; + const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; const uint32_t nonce = startNonce + thread; const int sub = threadIdx.x & 3; const int sub2 = sub & 2; -#if( __CUDA_ARCH__ < 300 ) - extern __shared__ uint32_t shuffleMem[]; - volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC)); +#if(__CUDA_ARCH__ < 300) + extern __shared__ uint32_t shuffleMem[]; + volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x & 0xFFFFFFFC)); #else - volatile uint32_t* sPtr = NULL; + volatile uint32_t* sPtr = NULL; #endif - if ( thread >= threads ) + if(thread >= threads) return; int i, k; uint32_t j; - const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor ); + const int batchsize = (ITERATIONS * 2) >> (2 + bfactor); const int start = partidx * batchsize; const int end = start + batchsize; - uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY]; + uint32_t* long_state = &d_long_state[(IndexType)thread * MEMORY]; uint32_t a, d[2], idx0; uint32_t t1[2], t2[2], res; @@ -564,9 +513,9 @@ __global__ void cryptonight_core_gpu_phase2_quad( } uint32_t tweak1_2[2]; - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) + if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { - uint32_t * state = d_ctx_state + thread * 50; + uint32_t* state = d_ctx_state + thread * 50; tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8); tweak1_2[0] ^= state[48]; tweak1_2[1] = nonce; @@ -574,7 +523,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( } a = (d_ctx_a + thread * 4)[sub]; - idx0 = shuffle<4>(sPtr,sub, a, 0); + idx0 = shuffle<4>(sPtr, sub, a, 0); if(ALGO == cryptonight_heavy || ALGO 
== cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { if(partidx != 0) @@ -585,33 +534,33 @@ __global__ void cryptonight_core_gpu_phase2_quad( } d[1] = (d_ctx_b + thread * 4)[sub]; - #pragma unroll 2 - for ( i = start; i < end; ++i ) +#pragma unroll 2 + for(i = start; i < end; ++i) { - #pragma unroll 2 - for ( int x = 0; x < 2; ++x ) +#pragma unroll 2 + for(int x = 0; x < 2; ++x) { - j = ( ( idx0 & MASK ) >> 2 ) + sub; + j = ((idx0 & MASK) >> 2) + sub; if(ALGO == cryptonight_bittube2) { uint32_t k[4]; - k[0] = ~loadGlobal32( long_state + j ); - k[1] = shuffle<4>(sPtr,sub, k[0], sub + 1); - k[2] = shuffle<4>(sPtr,sub, k[0], sub + 2); - k[3] = shuffle<4>(sPtr,sub, k[0], sub + 3); + k[0] = ~loadGlobal32(long_state + j); + k[1] = shuffle<4>(sPtr, sub, k[0], sub + 1); + k[2] = shuffle<4>(sPtr, sub, k[0], sub + 2); + k[3] = shuffle<4>(sPtr, sub, k[0], sub + 3); - #pragma unroll 4 +#pragma unroll 4 for(int i = 0; i < 4; ++i) { // only calculate the key if all data are up to date if(i == sub) { d[x] = a ^ - t_fn0( k[0] & 0xff ) ^ - t_fn1( (k[1] >> 8) & 0xff ) ^ - t_fn2( (k[2] >> 16) & 0xff ) ^ - t_fn3( (k[3] >> 24 ) ); + t_fn0(BYTE_0(k[0])) ^ + t_fn1(BYTE_1(k[1])) ^ + t_fn2(BYTE_2(k[2])) ^ + t_fn3(BYTE_3(k[3])); } // the last shuffle is not needed if(i != 3) @@ -619,13 +568,13 @@ __global__ void cryptonight_core_gpu_phase2_quad( /* avoid negative number for modulo * load valid key (k) depending on the round */ - k[(4 - sub + i)%4] = shuffle<4>(sPtr,sub, k[0] ^ d[x], i); + k[(4 - sub + i) % 4] = shuffle<4>(sPtr, sub, k[0] ^ d[x], i); } } } else { - uint32_t x_0 = loadGlobal32( long_state + j ); + uint32_t x_0 = loadGlobal32(long_state + j); if(ALGO == cryptonight_conceal) { @@ -642,18 +591,18 @@ __global__ void cryptonight_core_gpu_phase2_quad( x_0 = (uint32_t)(((int32_t)x_0) ^ ((int32_t)c_old)); } - const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1); - const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2); - const uint32_t x_3 = 
shuffle<4>(sPtr,sub, x_0, sub + 3); + const uint32_t x_1 = shuffle<4>(sPtr, sub, x_0, sub + 1); + const uint32_t x_2 = shuffle<4>(sPtr, sub, x_0, sub + 2); + const uint32_t x_3 = shuffle<4>(sPtr, sub, x_0, sub + 3); d[x] = a ^ - t_fn0( x_0 & 0xff ) ^ - t_fn1( (x_1 >> 8) & 0xff ) ^ - t_fn2( (x_2 >> 16) & 0xff ) ^ - t_fn3( ( x_3 >> 24 ) ); + t_fn0(BYTE_0(x_0)) ^ + t_fn1(BYTE_1(x_1)) ^ + t_fn2(BYTE_2(x_2)) ^ + t_fn3(BYTE_3(x_3)); } //XOR_BLOCKS_DST(c, b, &long_state[j]); - t1[0] = shuffle<4>(sPtr,sub, d[x], 0); + t1[0] = shuffle<4>(sPtr, sub, d[x], 0); const uint32_t z = d[0] ^ d[1]; if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) @@ -663,157 +612,178 @@ __global__ void cryptonight_core_gpu_phase2_quad( { const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2); const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24; - storeGlobal32( long_state + j, sub == 2 ? fork_7 : z ); + storeGlobal32(long_state + j, sub == 2 ? fork_7 : z); } else if(ALGO == cryptonight_stellite) { const uint32_t index = ((z >> 27) & 12) | ((z >> 23) & 2); const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24; - storeGlobal32( long_state + j, sub == 2 ? fork_7 : z ); + storeGlobal32(long_state + j, sub == 2 ? 
fork_7 : z); } } else - storeGlobal32( long_state + j, z ); + storeGlobal32(long_state + j, z); //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]); - j = ( ( *t1 & MASK ) >> 2 ) + sub; + j = ((*t1 & MASK) >> 2) + sub; uint32_t yy[2]; - *( (uint64_t*) yy ) = loadGlobal64( ( (uint64_t *) long_state )+( j >> 1 ) ); + *((uint64_t*)yy) = loadGlobal64(((uint64_t*)long_state) + (j >> 1)); uint32_t zz[2]; - zz[0] = shuffle<4>(sPtr,sub, yy[0], 0); - zz[1] = shuffle<4>(sPtr,sub, yy[1], 0); + zz[0] = shuffle<4>(sPtr, sub, yy[0], 0); + zz[1] = shuffle<4>(sPtr, sub, yy[1], 0); - t1[1] = shuffle<4>(sPtr,sub, d[x], 1); - #pragma unroll - for ( k = 0; k < 2; k++ ) - t2[k] = shuffle<4>(sPtr,sub, a, k + sub2); + t1[1] = shuffle<4>(sPtr, sub, d[x], 1); +#pragma unroll + for(k = 0; k < 2; k++) + t2[k] = shuffle<4>(sPtr, sub, a, k + sub2); - *( (uint64_t *) t2 ) += sub2 ? ( *( (uint64_t *) t1 ) * *( (uint64_t*) zz ) ) : __umul64hi( *( (uint64_t *) t1 ), *( (uint64_t*) zz ) ); + *((uint64_t*)t2) += sub2 ? (*((uint64_t*)t1) * *((uint64_t*)zz)) : __umul64hi(*((uint64_t*)t1), *((uint64_t*)zz)); - res = *( (uint64_t *) t2 ) >> ( sub & 1 ? 32 : 0 ); + res = *((uint64_t*)t2) >> (sub & 1 ? 32 : 0); if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res; uint32_t long_state_update = sub2 ? tweaked_res : res; - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) + if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) { - uint32_t value = shuffle<4>(sPtr,sub, long_state_update, sub & 1) ^ long_state_update; + uint32_t value = shuffle<4>(sPtr, sub, long_state_update, sub & 1) ^ long_state_update; long_state_update = sub >= 2 ? 
value : long_state_update; } - storeGlobal32( long_state + j, long_state_update ); + storeGlobal32(long_state + j, long_state_update); } else - storeGlobal32( long_state + j, res ); + storeGlobal32(long_state + j, res); - a = ( sub & 1 ? yy[1] : yy[0] ) ^ res; - idx0 = shuffle<4>(sPtr,sub, a, 0); + a = (sub & 1 ? yy[1] : yy[0]) ^ res; + idx0 = shuffle<4>(sPtr, sub, a, 0); if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) { - int64_t n = loadGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3)); - int32_t d = loadGlobal32( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u )); + int64_t n = loadGlobal64(((uint64_t*)long_state) + ((idx0 & MASK) >> 3)); + int32_t d = loadGlobal32((uint32_t*)(((uint64_t*)long_state) + ((idx0 & MASK) >> 3) + 1u)); int64_t q = fast_div_heavy(n, (d | 0x5)); - if(sub&1) - storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q ); + if(sub & 1) + storeGlobal64(((uint64_t*)long_state) + ((idx0 & MASK) >> 3), n ^ q); idx0 = d ^ q; } else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast) { - int64_t n = loadGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3)); - int32_t d = loadGlobal32( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u )); + int64_t n = loadGlobal64(((uint64_t*)long_state) + ((idx0 & MASK) >> 3)); + int32_t d = loadGlobal32((uint32_t*)(((uint64_t*)long_state) + ((idx0 & MASK) >> 3) + 1u)); int64_t q = fast_div_heavy(n, (d | 0x5)); - if(sub&1) - storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q ); + if(sub & 1) + storeGlobal64(((uint64_t*)long_state) + ((idx0 & MASK) >> 3), n ^ q); idx0 = (~d) ^ q; } } } - if ( bfactor > 0 ) + if(bfactor > 0) { (d_ctx_a + thread * 4)[sub] = a; (d_ctx_b + thread * 4)[sub] = d[1]; if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) - if(sub&1) + if(sub & 1) *(d_ctx_b + threads * 4 + thread) = idx0; 
if(ALGO == cryptonight_conceal) *(d_ctx_b + threads * 4 + thread * 4 + sub) = float_as_int(conc_var); } } -template +template __global__ void cryptonight_core_gpu_phase3( - const uint32_t ITERATIONS, const size_t MEMORY, - int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 ) + const uint32_t ITERATIONS, const size_t MEMORY, + int threads, int bfactor, int partidx, uint32_t* long_stateIn, const uint32_t* const __restrict__ d_ctx_stateIn, uint32_t* __restrict__ d_ctx_key2) { - __shared__ uint32_t sharedMemory[1024]; + __shared__ uint32_t sharedMemoryX[256 * 32]; - cn_aes_gpu_init( sharedMemory ); - __syncthreads( ); + /* avoid that the compiler is later in the aes round optimizing `sharedMemory[ x * 32 ]` to `sharedMemoryX + x * 32 + twidx`*/ + const int twidx = (threadIdx.x * 4) % 128; + // this is equivalent to `(uint32_t*)sharedMemoryX + twidx;` where `twidx` is [0;32) + char* sharedMemory = (char*)sharedMemoryX + twidx; - int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3; - int subv = ( threadIdx.x & 7 ); + cn_aes_gpu_init32(sharedMemoryX); + __syncthreads(); + + int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + int subv = (threadIdx.x & 7); int sub = subv << 2; const int batchsize = MEMORY >> bfactor; const int start = (partidx % (1 << bfactor)) * batchsize; const int end = start + batchsize; - if ( thread >= threads ) + if(thread >= threads) return; + const uint32_t* const long_state = long_stateIn + ((IndexType)thread * MEMORY) + sub; + uint32_t key[40], text[4]; - MEMCPY8( key, d_ctx_key2 + thread * 40, 20 ); - MEMCPY8( text, d_ctx_state + thread * 50 + sub + 16, 2 ); + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + ((ulonglong4*)key)[j] = ((ulonglong4*)(d_ctx_key2 + thread * 40))[j]; - __syncthreads( ); + uint64_t* d_ctx_state = (uint64_t*)(d_ctx_stateIn + thread * 50 + sub + 16); + #pragma unroll 2 + for(int j = 0; j < 2; ++j) + 
((uint64_t*)text)[j] = loadGlobal64(d_ctx_state + j); + + __syncthreads(); -#if( __CUDA_ARCH__ < 300 ) +#if(__CUDA_ARCH__ < 300) extern __shared__ uint32_t shuffleMem[]; - volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFF8)); + volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x & 0xFFFFFFF8)); #else volatile uint32_t* sPtr = NULL; #endif - for ( int i = start; i < end; i += 32 ) + for(int i = start; i < end; i += 32) { - #pragma unroll - for ( int j = 0; j < 4; ++j ) - text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)]; + uint32_t tmp[4]; + ((ulonglong2*)(tmp))[0] = ((ulonglong2*)(long_state + i))[0]; + #pragma unroll 4 + for(int j = 0; j < 4; ++j) + text[j] ^= tmp[j]; - cn_aes_pseudo_round_mut( sharedMemory, text, key ); + ((uint4*)text)[0] = cn_aes_pseudo_round_mut32((uint32_t*)sharedMemory, ((uint4*)text)[0], (uint4*)key); if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { - #pragma unroll - for ( int j = 0; j < 4; ++j ) - text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7); + uint32_t tmp[4]; + #pragma unroll 4 + for(int j = 0; j < 4; ++j) + tmp[j] = shuffle<8>(sPtr, subv, text[j], (subv + 1) & 7); + #pragma unroll 4 + for(int j = 0; j < 4; ++j) + text[j] ^= tmp[j]; } } - MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 ); + #pragma unroll 2 + for(int j = 0; j < 2; ++j) + storeGlobal64(d_ctx_state + j, ((uint64_t*)text)[j]); } -template +template void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo) { uint32_t MASK = algo.Mask(); uint32_t ITERATIONS = algo.Iter(); - size_t MEM = algo.Mem()/4; + size_t MEM = algo.Mem() / 4; - dim3 grid( ctx->device_blocks ); - dim3 block( ctx->device_threads ); - dim3 block2( ctx->device_threads << 1 ); - dim3 block4( ctx->device_threads << 2 ); - dim3 block8( ctx->device_threads << 3 ); + dim3 
grid(ctx->device_blocks); + dim3 block(ctx->device_threads); + dim3 block2(ctx->device_threads << 1); + dim3 block4(ctx->device_threads << 2); + dim3 block8(ctx->device_threads << 3); int partcount = 1 << ctx->device_bfactor; @@ -823,27 +793,29 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo * kernel splitting if the user defined a `bfactor >= 5` */ int bfactorOneThree = ctx->device_bfactor - 4; - if( bfactorOneThree < 0 ) + if(bfactorOneThree < 0) bfactorOneThree = 0; int partcountOneThree = 1 << bfactorOneThree; - for ( int i = 0; i < partcountOneThree; i++ ) + for(int i = 0; i < partcountOneThree; i++) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>( - ITERATIONS, - MEM, - ctx->device_blocks*ctx->device_threads, - bfactorOneThree, i, - ctx->d_long_state, - (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state), - ctx->d_ctx_key1 )); - - if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<>>( + ITERATIONS, + MEM, + ctx->device_blocks * ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? 
ctx->d_ctx_state2 : ctx->d_ctx_state), + ctx->d_ctx_key1)); + + if(partcount > 1 && ctx->device_bsleep > 0) + compat_usleep(ctx->device_bsleep); } - if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + if(partcount > 1 && ctx->device_bsleep > 0) + compat_usleep(ctx->device_bsleep); - for ( int i = 0; i < partcount; i++ ) + for(int i = 0; i < partcount; i++) { if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) { @@ -856,12 +828,11 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo block2, sizeof(uint64_t) * block.x * 8 + // shuffle memory for fermi gpus - block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( + block2.x * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( ITERATIONS, MEM, MASK, - ctx->device_blocks*ctx->device_threads, + ctx->device_blocks * ctx->device_threads, ctx->device_bfactor, i, ctx->d_long_state, @@ -869,28 +840,24 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo ctx->d_ctx_b, ctx->d_ctx_state, nonce, - ctx->d_input - ) - ); + ctx->d_input)); } else if(ALGO == cryptonight_r_wow || ALGO == cryptonight_r) { - int numThreads = ctx->device_blocks*ctx->device_threads; + int numThreads = ctx->device_blocks * ctx->device_threads; void* args[] = { &ITERATIONS, &MEM, &MASK, &numThreads, &ctx->device_bfactor, &i, - &ctx->d_long_state, &ctx->d_ctx_a, &ctx->d_ctx_b, &ctx->d_ctx_state, &nonce, &ctx->d_input - }; + &ctx->d_long_state, &ctx->d_ctx_a, &ctx->d_ctx_b, &ctx->d_ctx_state, &nonce, &ctx->d_input}; CU_CHECK(ctx->device_id, cuLaunchKernel( - ctx->kernel, - grid.x, grid.y, grid.z, - block2.x, block2.y, block2.z, - sizeof(uint64_t) * block.x * 8 + - // shuffle memory for fermi gpus - block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ), - nullptr, - args, 0 - )); + ctx->kernel, + grid.x, grid.y, grid.z, + block2.x, block2.y, block2.z, + sizeof(uint64_t) * block.x * 8 + + 
// shuffle memory for fermi gpus + block2.x * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3), + nullptr, + args, 0)); CU_CHECK(ctx->device_id, cuCtxSynchronize()); } else @@ -901,12 +868,11 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo cryptonight_core_gpu_phase2_quad<<< grid, block4, - block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( + block4.x * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( ITERATIONS, MEM, MASK, - ctx->device_blocks*ctx->device_threads, + ctx->device_blocks * ctx->device_threads, ctx->device_bfactor, i, ctx->d_long_state, @@ -914,57 +880,61 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo ctx->d_ctx_b, ctx->d_ctx_state, nonce, - ctx->d_input - ) - ); + ctx->d_input)); } - if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + if(partcount > 1 && ctx->device_bsleep > 0) + compat_usleep(ctx->device_bsleep); } int roundsPhase3 = partcountOneThree; - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven|| ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ) + if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { // cryptonight_heavy used two full rounds over the scratchpad memory roundsPhase3 *= 2; } - for ( int i = 0; i < roundsPhase3; i++ ) + int blockSizePhase3 = block8.x; + int gridSizePhase3 = grid.x; + if(blockSizePhase3 * 2 <= ctx->device_maxThreadsPerBlock) + { + blockSizePhase3 *= 2; + gridSizePhase3 = (gridSizePhase3 + 1) / 2; + } + for(int i = 0; i < roundsPhase3; i++) { CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< - grid, - block8, - block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( - ITERATIONS, - MEM, - ctx->device_blocks*ctx->device_threads, - bfactorOneThree, i, - ctx->d_long_state, - ctx->d_ctx_state, ctx->d_ctx_key2 )); + gridSizePhase3, 
+ blockSizePhase3, + blockSizePhase3 * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( + ITERATIONS, + MEM, + ctx->device_blocks * ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + ctx->d_ctx_state, ctx->d_ctx_key2)); } } -template +template void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); const uint32_t ITERATIONS = algo.Iter(); const size_t MEM = algo.Mem(); - dim3 grid( ctx->device_blocks ); - dim3 block( ctx->device_threads ); - dim3 block2( ctx->device_threads << 1 ); - dim3 block4( ctx->device_threads << 2 ); - dim3 block8( ctx->device_threads << 3 ); + dim3 grid(ctx->device_blocks); + dim3 block(ctx->device_threads); + dim3 block2(ctx->device_threads << 1); + dim3 block4(ctx->device_threads << 2); + dim3 block8(ctx->device_threads << 3); size_t intensity = ctx->device_blocks * ctx->device_threads; CUDA_CHECK_KERNEL( ctx->device_id, - xmrstak::nvidia::cn_explode_gpu<<>>(MEM, (int*)ctx->d_ctx_state, (int*)ctx->d_long_state) - ); + xmrstak::nvidia::cn_explode_gpu<<>>(MEM, (int*)ctx->d_ctx_state, (int*)ctx->d_long_state)); int partcount = 1 << ctx->device_bfactor; for(int i = 0; i < partcount; i++) @@ -972,54 +942,57 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_ CUDA_CHECK_KERNEL( ctx->device_id, // 36 x 16byte x numThreads - xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu - <<device_blocks, ctx->device_threads * 16, 32 * 16 * ctx->device_threads>>> - ( - ITERATIONS, - MEM, - MASK, - (int*)ctx->d_ctx_state, - (int*)ctx->d_long_state, - ctx->device_bfactor, - i, - ctx->d_ctx_a, - ctx->d_ctx_b - ) - ); + xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu<<device_blocks, ctx->device_threads * 16, 33 * 16 * ctx->device_threads>>>( + ITERATIONS, + MEM, + MASK, + (int*)ctx->d_ctx_state, + (int*)ctx->d_long_state, + ctx->device_bfactor, + i, + ctx->d_ctx_a, + ctx->d_ctx_b)); } /* bfactor for phase 3 * * 3 consume less time 
than phase 2, therefore we begin with the - * kernel splitting if the user defined a `bfactor >= 5` + * kernel splitting if the user defined a `bfactor >= 8` */ - int bfactorOneThree = ctx->device_bfactor - 4; - if( bfactorOneThree < 0 ) + int bfactorOneThree = ctx->device_bfactor - 8; + if(bfactorOneThree < 0) bfactorOneThree = 0; int partcountOneThree = 1 << bfactorOneThree; int roundsPhase3 = partcountOneThree; if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || - ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ) + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { // cryptonight_heavy used two full rounds over the scratchpad memory roundsPhase3 *= 2; } - for ( int i = 0; i < roundsPhase3; i++ ) + int blockSizePhase3 = block8.x; + int gridSizePhase3 = grid.x; + if(blockSizePhase3 * 2 <= ctx->device_maxThreadsPerBlock) + { + blockSizePhase3 *= 2; + gridSizePhase3 = (gridSizePhase3 + 1) / 2; + } + + for(int i = 0; i < roundsPhase3; i++) { CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< - grid, - block8, - block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( - ITERATIONS, - MEM/4, - ctx->device_blocks*ctx->device_threads, - bfactorOneThree, i, - ctx->d_long_state, - ctx->d_ctx_state, ctx->d_ctx_key2 )); + gridSizePhase3, + blockSizePhase3, + blockSizePhase3 * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( + ITERATIONS, + MEM / 4, + ctx->device_blocks * ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + ctx->d_ctx_state, ctx->d_ctx_key2)); } } @@ -1030,7 +1003,7 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui { if(ctx->kernel_height != chain_height || ctx->cached_algo != miner_algo) { - if(ctx->module) + if(ctx->module) cuModuleUnload(ctx->module); uint32_t PRECOMPILATION_DEPTH = 4; @@ -1045,15 +1018,16 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui 
ctx->kernel_height = chain_height; ctx->cached_algo = miner_algo; - for (int i = 1; i <= PRECOMPILATION_DEPTH; ++i) + for(int i = 1; i <= PRECOMPILATION_DEPTH; ++i) xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo, chain_height + i, PRECOMPILATION_DEPTH, ctx->device_arch[0], ctx->device_arch[1], true); } } - typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo); + typedef void (*cuda_hash_fn)(nvid_ctx * ctx, uint32_t nonce, const xmrstak_algo& algo); - if(miner_algo == invalid_algo) return; + if(miner_algo == invalid_algo) + return; static const cuda_hash_fn func_table[] = { cryptonight_core_gpu_hash, @@ -1105,13 +1079,11 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui cryptonight_core_gpu_hash, cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash - }; + cryptonight_core_gpu_hash}; std::bitset<1> digit; digit.set(0, ctx->memMode == 1); - cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ]; + cuda_hash_fn selected_function = func_table[((miner_algo - 1u) << 1) | digit.to_ulong()]; selected_function(ctx, startNonce, miner_algo); - } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp index fee7e13d1..516d4ca00 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp @@ -1,11 +1,63 @@ #pragma once +#include #include #include -#include -#include "cuda_keccak.hpp" #include "cuda_extra.hpp" +#include "cuda_keccak.hpp" + +template +__device__ __forceinline__ T loadGlobal64(T* const addr) +{ +#if(__CUDA_ARCH__ < 700) + T x; + asm volatile("ld.global.cg.u64 %0, [%1];" + : "=l"(x) + : "l"(addr)); + return x; +#else + return *addr; +#endif +} + +template +__device__ __forceinline__ T loadGlobal32(T* const addr) +{ +#if(__CUDA_ARCH__ < 700) + T x; + asm volatile("ld.global.cg.u32 %0, [%1];" + 
: "=r"(x) + : "l"(addr)); + return x; +#else + return *addr; +#endif +} + +template +__device__ __forceinline__ void storeGlobal32(T* addr, T const& val) +{ +#if(__CUDA_ARCH__ < 700) + asm volatile("st.global.cg.u32 [%0], %1;" + : + : "l"(addr), "r"(val)); +#else + *addr = val; +#endif +} + +template +__device__ __forceinline__ void storeGlobal64(T* addr, T const& val) +{ +#if(__CUDA_ARCH__ < 700) + asm volatile("st.global.cg.u64 [%0], %1;" + : + : "l"(addr), "l"(val)); +#else + *addr = val; +#endif +} namespace xmrstak { @@ -15,7 +67,7 @@ namespace nvidia struct __m128i : public int4 { - __forceinline__ __device__ __m128i(){} + __forceinline__ __device__ __m128i() {} __forceinline__ __device__ __m128i( const uint32_t x0, const uint32_t x1, @@ -27,7 +79,7 @@ struct __m128i : public int4 w = x3; } - __forceinline__ __device__ __m128i( const int x0) + __forceinline__ __device__ __m128i(const int x0) { x = x0; y = x0; @@ -41,8 +93,7 @@ struct __m128i : public int4 x | other.x, y | other.y, z | other.z, - w | other.w - ); + w | other.w); } __forceinline__ __device__ __m128i operator^(const __m128i& other) @@ -51,15 +102,14 @@ struct __m128i : public int4 x ^ other.x, y ^ other.y, z ^ other.z, - w ^ other.w - ); + w ^ other.w); } }; struct __m128 : public float4 { - __forceinline__ __device__ __m128(){} + __forceinline__ __device__ __m128() {} __forceinline__ __device__ __m128( const float x0, const float x1, @@ -71,7 +121,7 @@ struct __m128 : public float4 float4::w = x3; } - __forceinline__ __device__ __m128( const float x0) + __forceinline__ __device__ __m128(const float x0) { float4::x = x0; float4::y = x0; @@ -79,7 +129,7 @@ struct __m128 : public float4 float4::w = x0; } - __forceinline__ __device__ __m128( const __m128i& x0) + __forceinline__ __device__ __m128(const __m128i& x0) { float4::x = int2float(x0.x); float4::y = int2float(x0.y); @@ -87,14 +137,13 @@ struct __m128 : public float4 float4::w = int2float(x0.w); } - __forceinline__ __device__ __m128i get_int( 
) + __forceinline__ __device__ __m128i get_int() { return __m128i( (int)x, (int)y, (int)z, - (int)w - ); + (int)w); } __forceinline__ __device__ __m128 operator+(const __m128& other) @@ -103,8 +152,7 @@ struct __m128 : public float4 x + other.x, y + other.y, z + other.z, - w + other.w - ); + w + other.w); } __forceinline__ __device__ __m128 operator-(const __m128& other) @@ -113,8 +161,7 @@ struct __m128 : public float4 x - other.x, y - other.y, z - other.z, - w - other.w - ); + w - other.w); } __forceinline__ __device__ __m128 operator*(const __m128& other) @@ -123,8 +170,7 @@ struct __m128 : public float4 x * other.x, y * other.y, z * other.z, - w * other.w - ); + w * other.w); } __forceinline__ __device__ __m128 operator/(const __m128& other) @@ -133,67 +179,64 @@ struct __m128 : public float4 x / other.x, y / other.y, z / other.z, - w / other.w - ); + w / other.w); } __forceinline__ __device__ __m128& trunc() { - x=::truncf(x); - y=::truncf(y); - z=::truncf(z); - w=::truncf(w); + x = ::truncf(x); + y = ::truncf(y); + z = ::truncf(z); + w = ::truncf(w); return *this; } __forceinline__ __device__ __m128& abs() { - x=::fabsf(x); - y=::fabsf(y); - z=::fabsf(z); - w=::fabsf(w); + x = ::fabsf(x); + y = ::fabsf(y); + z = ::fabsf(z); + w = ::fabsf(w); return *this; } __forceinline__ __device__ __m128& floor() { - x=::floorf(x); - y=::floorf(y); - z=::floorf(z); - w=::floorf(w); + x = ::floorf(x); + y = ::floorf(y); + z = ::floorf(z); + w = ::floorf(w); return *this; } }; - -template +template __device__ void print(const char* name, T value) { printf("g %s: ", name); for(int i = 0; i < 4; ++i) { - printf("%08X ",((uint32_t*)&value)[i]); + printf("%08X ", ((uint32_t*)&value)[i]); } printf("\n"); } -template<> +template <> __device__ void print<__m128>(const char* name, __m128 value) { printf("g %s: ", name); for(int i = 0; i < 4; ++i) { - printf("%f ",((float*)&value)[i]); + printf("%f ", ((float*)&value)[i]); } printf("\n"); } #define SHOW(name) print(#name, name) - 
__forceinline__ __device__ __m128 _mm_add_ps(__m128 a, __m128 b) { return a + b; @@ -220,8 +263,7 @@ __forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b) int_as_float(float_as_int(a.x) & b), int_as_float(float_as_int(a.y) & b), int_as_float(float_as_int(a.z) & b), - int_as_float(float_as_int(a.w) & b) - ); + int_as_float(float_as_int(a.w) & b)); } __forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b) @@ -230,8 +272,7 @@ __forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b) int_as_float(float_as_int(a.x) | b), int_as_float(float_as_int(a.y) | b), int_as_float(float_as_int(a.z) | b), - int_as_float(float_as_int(a.w) | b) - ); + int_as_float(float_as_int(a.w) | b)); } __forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b) @@ -240,20 +281,18 @@ __forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b) int_as_float(float_as_int(a.x) ^ b), int_as_float(float_as_int(a.y) ^ b), int_as_float(float_as_int(a.z) ^ b), - int_as_float(float_as_int(a.w) ^ b) - ); + int_as_float(float_as_int(a.w) ^ b)); } __forceinline__ __device__ __m128 _mm_fmod_ps(__m128 v, float dc) { __m128 d(dc); __m128 c = _mm_div_ps(v, d); - c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); + c.trunc(); //_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); // c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2 c = _mm_mul_ps(c, d); return _mm_sub_ps(v, c); - //return a.fmodf(b); } @@ -262,23 +301,20 @@ __forceinline__ __device__ __m128i _mm_xor_si128(__m128i a, __m128i b) return a ^ b; } - __forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot) { const uint32_t right = 8 * rot; const uint32_t left = (32 - 8 * rot); return __m128i( - ((uint32_t)a.x >> right) | ( a.y << left ), - ((uint32_t)a.y >> right) | ( a.z << left ), - ((uint32_t)a.z >> right) | ( a.w << left ), - ((uint32_t)a.w >> right) | ( a.x << left ) - ); + ((uint32_t)a.x >> right) | (a.y << left), + ((uint32_t)a.y >> right) | (a.z << left), + ((uint32_t)a.z >> right) 
| (a.w << left), + ((uint32_t)a.w >> right) | (a.x << left)); } -__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad, const uint32_t MASK) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); } +__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int* lpad, const uint32_t MASK) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); } - -__forceinline__ __device__ __m128 fma_break(__m128 x) +__forceinline__ __device__ __m128 fma_break(__m128 x) { // Break the dependency chain by setitng the exp to ?????01 x = _mm_and_ps(x, 0xFEFFFFFF); @@ -290,13 +326,13 @@ __forceinline__ __device__ void sub_round(__m128 n0, __m128 n1, __m128 n2, __m12 { n1 = _mm_add_ps(n1, c); __m128 nn = _mm_mul_ps(n0, c); - nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn, nn)); nn = fma_break(nn); n = _mm_add_ps(n, nn); n3 = _mm_sub_ps(n3, c); __m128 dd = _mm_mul_ps(n2, c); - dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd, dd)); dd = fma_break(dd); d = _mm_add_ps(d, dd); @@ -326,7 +362,7 @@ __forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, _ // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 d = _mm_and_ps(d, 0xFF7FFFFF); d = _mm_or_ps(d, 0x40000000); - r =_mm_add_ps(r, _mm_div_ps(n,d)); + r = _mm_add_ps(r, _mm_div_ps(n, d)); } // 74*8 = 595 @@ -335,15 +371,14 @@ __forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n __m128 c(cnt); // 35 maths calls follow (140 FLOPS) __m128 r = __m128(0.0f); - for(int i=0; i< 4; ++i) + for(int i = 0; i < 4; ++i) round_compute(n0, n1, n2, n3, rnd_c, c, r); // do a quick fmod by setting exp to 2 r = _mm_and_ps(r, 0x807FFFFF); r = _mm_or_ps(r, 0x40000000); - sum = r; // 34 + sum = r; // 34 r = _mm_mul_ps(r, __m128(536870880.0f)); // 35 return r.get_int(); - } __forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const 
__m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) @@ -376,8 +411,7 @@ __constant__ uint32_t look[16][4] = { {3, 1, 2, 0}, {3, 2, 0, 1}, {3, 0, 1, 2}, - {3, 0, 2, 1} -}; + {3, 0, 2, 1}}; __constant__ float ccnt[16] = { 1.34375f, @@ -398,31 +432,30 @@ __constant__ float ccnt[16] = { 1.3203125f, 1.3515625f, 1.3359375f, - 1.4609375f -}; - + 1.4609375f}; __forceinline__ __device__ void sync() { -#if (__CUDACC_VER_MAJOR__ >= 9) +#if(__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif } struct SharedMemChunk { __m128i out[16]; - __m128 va[16]; + __m128 va[17]; }; +__launch_bounds__(128, 8) __global__ void cryptonight_core_gpu_phase2_gpu( - const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, - int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS) + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int32_t* spad, int* lpad_in, int bfactor, int partidx, uint32_t* roundVs, uint32_t* roundS) { - const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + const int batchsize = (ITERATIONS * 2) >> (1 + bfactor); extern __shared__ SharedMemChunk smemExtern_in[]; @@ -435,7 +468,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu( uint32_t tid = threadIdx.x % 16; - const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16; + const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x / 16; uint32_t s = 0; __m128 vs(0); @@ -455,10 +488,10 @@ __global__ void cryptonight_core_gpu_phase2_gpu( const uint32_t tidm = tid % 4; const uint32_t block = tidd * 16 + tidm; - for(size_t i = 0; i < batchsize; i++) + for(int i = 0; i < batchsize; i++) { sync(); - int tmp = ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm]; + int tmp = loadGlobal32( ((int*)scratchpad_ptr(s, tidd, lpad, MASK)) + tidm ); ((int*)smem->out)[tid] = tmp; sync(); @@ -470,8 +503,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu( 
*(smem->out + look[tid][2]), *(smem->out + look[tid][3]), ccnt[tid], rc, smem->va[tid], - smem->out[tid] - ); + smem->out[tid]); sync(); @@ -479,11 +511,11 @@ __global__ void cryptonight_core_gpu_phase2_gpu( for(uint32_t dd = block + 4; dd < (tidd + 1) * 16; dd += 4) outXor ^= ((int*)smem->out)[dd]; - ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm] = outXor ^ tmp; + storeGlobal32( ((int*)scratchpad_ptr(s, tidd, lpad, MASK)) + tidm, outXor ^ tmp ); ((int*)smem->out)[tid] = outXor; float va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4]; - float va_tmp2 = ((float*)smem->va)[block+ 8] + ((float*)smem->va)[block + 12]; + float va_tmp2 = ((float*)smem->va)[block + 8] + ((float*)smem->va)[block + 12]; ((float*)smem->va)[tid] = va_tmp1 + va_tmp2; sync(); @@ -505,10 +537,10 @@ __global__ void cryptonight_core_gpu_phase2_gpu( vs = _mm_div_ps(vs, __m128(64.0f)); s = out2.x ^ out2.y ^ out2.z ^ out2.w; } - if(partidx != ((1<(spad + i); - sync(); + if(blockDim.x > 32) + __syncthreads(); + else + sync(); - for(uint64_t i = threadIdx.x; i < MEMORY / 512; i+=blockDim.x) + for(uint64_t i = threadIdx.x; i < MEMORY / 512; i += blockDim.x) { - generate_512(i, state, (uint8_t*)lpad + i*512); + generate_512(i, state, (uint8_t*)lpad + i * 512); } } -} // namespace xmrstak } // namespace nvidia +} // namespace xmrstak diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt index bcf495080..214114c7e 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt @@ -462,10 +462,10 @@ __global__ void CryptonightR_phase2( uint64_t bx0 = ((uint64_t*)(d_ctx_b + thread * 16))[sub]; uint64_t bx1 = ((uint64_t*)(d_ctx_b + thread * 16 + 4))[sub]; - uint32_t r0 = d_ctx_b[thread * 16 + 4 * 2]; - uint32_t r1 = d_ctx_b[thread * 16 + 4 * 2 + 1]; - uint32_t r2 = d_ctx_b[thread * 16 + 4 * 2 + 2]; - uint32_t r3 = d_ctx_b[thread * 16 + 4 * 2 + 3]; 
+ volatile uint32_t r0 = d_ctx_b[thread * 16 + 4 * 2]; + volatile uint32_t r1 = d_ctx_b[thread * 16 + 4 * 2 + 1]; + volatile uint32_t r2 = d_ctx_b[thread * 16 + 4 * 2 + 2]; + volatile uint32_t r3 = d_ctx_b[thread * 16 + 4 * 2 + 3]; const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); const int start = partidx * batchsize; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp index 96cb679f5..48ebe4bd7 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp @@ -2,8 +2,8 @@ #pragma once #include -#include #include +#include #include /** execute and check a CUDA api command @@ -12,27 +12,30 @@ * @param msg message string which should be added to the error message * @param ... CUDA api command */ -#define CUDA_CHECK_MSG(id, msg, ...) { \ - cudaError_t error = __VA_ARGS__; \ - if(error!=cudaSuccess){ \ - std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__; \ - std::cerr << msg << std::endl; \ - throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \ - } \ -} \ -( (void) 0 ) - -#define CU_CHECK(id, ...) { \ - CUresult result = __VA_ARGS__; \ - if(result != CUDA_SUCCESS){ \ - const char* s; \ - cuGetErrorString(result, &s); \ - std::cerr << "[CUDA] Error gpu " << id << ": <" << __FUNCTION__ << ">:" << __LINE__ << " \"" << (s ? s : "unknown error") << "\"" << std::endl; \ - throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(s ? s : "unknown error")); \ - } \ -} \ -( (void) 0 ) +#define CUDA_CHECK_MSG(id, msg, ...) \ + { \ + cudaError_t error = __VA_ARGS__; \ + if(error != cudaSuccess) \ + { \ + std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__; \ + std::cerr << msg << std::endl; \ + throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \ + } \ + } \ + ((void)0) +#define CU_CHECK(id, ...) 
\ + { \ + CUresult result = __VA_ARGS__; \ + if(result != CUDA_SUCCESS) \ + { \ + const char* s; \ + cuGetErrorString(result, &s); \ + std::cerr << "[CUDA] Error gpu " << id << ": <" << __FUNCTION__ << ">:" << __LINE__ << " \"" << (s ? s : "unknown error") << "\"" << std::endl; \ + throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(s ? s : "unknown error")); \ + } \ + } \ + ((void)0) /** execute and check a CUDA api command * @@ -47,7 +50,7 @@ * @param ... CUDA kernel call */ #define CUDA_CHECK_KERNEL(id, ...) \ - __VA_ARGS__; \ + __VA_ARGS__; \ CUDA_CHECK(id, cudaGetLastError()) /** execute and check a CUDA kernel @@ -57,5 +60,5 @@ * @param ... CUDA kernel call */ #define CUDA_CHECK_MSG_KERNEL(id, msg, ...) \ - __VA_ARGS__; \ + __VA_ARGS__; \ CUDA_CHECK_MSG(id, msg, cudaGetLastError()) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index b6e41c619..d5b292cb4 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -1,83 +1,80 @@ -#include -#include -#include -#include +#include "xmrstak/jconf.hpp" #include -#include #include #include -#include -#include "xmrstak/jconf.hpp" - +#include +#include +#include +#include +#include typedef unsigned char BitSequence; typedef unsigned long long DataLength; -#include "xmrstak/backend/cryptonight.hpp" #include "cryptonight.hpp" -#include "cuda_extra.hpp" -#include "cuda_keccak.hpp" +#include "cuda_aes.hpp" #include "cuda_blake.hpp" +#include "cuda_device.hpp" +#include "cuda_extra.hpp" #include "cuda_groestl.hpp" #include "cuda_jh.hpp" +#include "cuda_keccak.hpp" #include "cuda_skein.hpp" -#include "cuda_device.hpp" -#include "cuda_aes.hpp" +#include "xmrstak/backend/cryptonight.hpp" -__constant__ uint8_t d_sub_byte[16][16] ={ - {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 }, - {0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 
0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 }, - {0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 }, - {0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 }, - {0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 }, - {0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf }, - {0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 }, - {0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 }, - {0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 }, - {0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb }, - {0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 }, - {0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 }, - {0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a }, - {0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e }, - {0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf }, - {0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 } -}; - -__device__ __forceinline__ void cryptonight_aes_set_key( uint32_t * __restrict__ key, const uint32_t * __restrict__ data ) +__constant__ uint8_t d_sub_byte[16][16] = { + {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76}, + {0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0}, + {0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15}, + {0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 
0xe2, 0xeb, 0x27, 0xb2, 0x75}, + {0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84}, + {0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf}, + {0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8}, + {0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2}, + {0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73}, + {0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb}, + {0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79}, + {0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08}, + {0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a}, + {0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e}, + {0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf}, + {0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}}; + +__device__ __forceinline__ void cryptonight_aes_set_key(uint32_t* __restrict__ key, const uint32_t* __restrict__ data) { int i, j; uint8_t temp[4]; - const uint32_t aes_gf[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 }; + const uint32_t aes_gf[] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36}; - MEMSET4( key, 0, 40 ); - MEMCPY4( key, data, 8 ); + MEMSET4(key, 0, 40); + MEMCPY4(key, data, 8); #pragma unroll - for ( i = 8; i < 40; i++ ) + for(i = 8; i < 40; i++) { - *(uint32_t *) temp = key[i - 1]; - if ( i % 8 == 0 ) + *(uint32_t*)temp = key[i - 1]; + if(i % 8 == 0) { - *(uint32_t *) temp = ROTR32( *(uint32_t *) temp, 8 ); - for ( j = 0; j < 4; j++ ) - temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f]; - 
*(uint32_t *) temp ^= aes_gf[i / 8 - 1]; + *(uint32_t*)temp = ROTR32(*(uint32_t*)temp, 8); + for(j = 0; j < 4; j++) + temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f]; + *(uint32_t*)temp ^= aes_gf[i / 8 - 1]; } else { - if ( i % 8 == 4 ) + if(i % 8 == 4) { #pragma unroll - for ( j = 0; j < 4; j++ ) - temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f]; + for(j = 0; j < 4; j++) + temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f]; } } - key[i] = key[( i - 8 )] ^ *(uint32_t *) temp; + key[i] = key[(i - 8)] ^ *(uint32_t*)temp; } } -__device__ __forceinline__ void mix_and_propagate( uint32_t* state ) +__device__ __forceinline__ void mix_and_propagate(uint32_t* state) { uint32_t tmp0[4]; for(size_t x = 0; x < 4; ++x) @@ -93,18 +90,18 @@ __device__ __forceinline__ void mix_and_propagate( uint32_t* state ) (state + 4 * 7)[x] = (state + 4 * 7)[x] ^ tmp0[x]; } -template -__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_state2, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 ) +template +__global__ void cryptonight_extra_gpu_prepare(int threads, uint32_t* __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t* __restrict__ d_ctx_state, uint32_t* __restrict__ d_ctx_state2, uint32_t* __restrict__ d_ctx_a, uint32_t* __restrict__ d_ctx_b, uint32_t* __restrict__ d_ctx_key1, uint32_t* __restrict__ d_ctx_key2) { - int thread = ( blockDim.x * blockIdx.x + threadIdx.x ); + int thread = (blockDim.x * blockIdx.x + threadIdx.x); __shared__ uint32_t sharedMemory[1024]; if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { - cn_aes_gpu_init( sharedMemory ); - __syncthreads( ); + cn_aes_gpu_init(sharedMemory); + __syncthreads(); } - if ( thread >= 
threads ) + if(thread >= threads) return; uint32_t ctx_state[50]; @@ -114,29 +111,29 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric uint32_t ctx_key2[40]; uint32_t input[32]; - memcpy( input, d_input, len ); + memcpy(input, d_input, len); //*((uint32_t *)(((char *)input) + 39)) = startNonce + thread; uint32_t nonce = startNonce + thread; - for ( int i = 0; i < sizeof (uint32_t ); ++i ) - ( ( (char *) input ) + 39 )[i] = ( (char*) ( &nonce ) )[i]; //take care of pointer alignment + for(int i = 0; i < sizeof(uint32_t); ++i) + (((char*)input) + 39)[i] = ((char*)(&nonce))[i]; //take care of pointer alignment - cn_keccak( (uint8_t *) input, len, (uint8_t *) ctx_state ); - cryptonight_aes_set_key( ctx_key1, ctx_state ); - cryptonight_aes_set_key( ctx_key2, ctx_state + 8 ); + cn_keccak((uint8_t*)input, len, (uint8_t*)ctx_state); + cryptonight_aes_set_key(ctx_key1, ctx_state); + cryptonight_aes_set_key(ctx_key2, ctx_state + 8); - XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a ); - XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b ); - memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); + XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a); + XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b); + memcpy(d_ctx_a + thread * 4, ctx_a, 4 * 4); if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) { - memcpy( d_ctx_b + thread * 16, ctx_b, 4 * 4 ); + memcpy(d_ctx_b + thread * 16, ctx_b, 4 * 4); // bx1 - XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b ); - memcpy( d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4 ); + XOR_BLOCKS_DST(ctx_state + 16, ctx_state + 20, ctx_b); + memcpy(d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4); // division_result - memcpy( d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 2 ); + memcpy(d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 2); // sqrt_result - memcpy( d_ctx_b + thread * 16 + 2 * 4 + 2, ctx_state + 26, 4 * 2 ); + memcpy(d_ctx_b + thread * 16 + 2 * 4 + 2, ctx_state + 26, 4 * 2); } else if(ALGO 
== cryptonight_r_wow || ALGO == cryptonight_r) { @@ -148,31 +145,31 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric memcpy(d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 8); } else - memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); + memcpy(d_ctx_b + thread * 4, ctx_b, 4 * 4); - memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 ); - memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 ); - memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 ); + memcpy(d_ctx_key1 + thread * 40, ctx_key1, 40 * 4); + memcpy(d_ctx_key2 + thread * 40, ctx_key2, 40 * 4); + memcpy(d_ctx_state + thread * 50, ctx_state, 50 * 4); if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { - for(int i=0; i < 16; i++) + for(int i = 0; i < 16; i++) { for(size_t t = 4; t < 12; ++t) { - cn_aes_pseudo_round_mut( sharedMemory, ctx_state + 4u * t, ctx_key1 ); + cn_aes_pseudo_round_mut(sharedMemory, ctx_state + 4u * t, ctx_key1); } // scipt first 4 * 128bit blocks = 4 * 4 uint32_t values mix_and_propagate(ctx_state + 4 * 4); } // double buffer to move manipulated state into phase1 - memcpy( d_ctx_state2 + thread * 50, ctx_state, 50 * 4 ); + memcpy(d_ctx_state2 + thread * 50, ctx_state, 50 * 4); } } -template -__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 ) +template +__global__ void cryptonight_extra_gpu_final(int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t* __restrict__ d_res_nonce, uint32_t* __restrict__ d_ctx_state, uint32_t* __restrict__ d_ctx_key2) { const int thread = blockDim.x * blockIdx.x + threadIdx.x; @@ -181,19 +178,19 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || 
ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { - cn_aes_gpu_init( sharedMemory ); - __syncthreads( ); + cn_aes_gpu_init(sharedMemory); + __syncthreads(); } - if ( thread >= threads ) + if(thread >= threads) return; int i; - uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50; + uint32_t* __restrict__ ctx_state = d_ctx_state + thread * 50; uint64_t hash[4]; uint32_t state[50]; - #pragma unroll - for ( i = 0; i < 50; i++ ) +#pragma unroll + for(i = 0; i < 50; i++) state[i] = ctx_state[i]; if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || @@ -202,25 +199,25 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 uint32_t key[40]; // load keys - MEMCPY8( key, d_ctx_key2 + thread * 40, 20 ); + MEMCPY8(key, d_ctx_key2 + thread * 40, 20); - for(int i=0; i < 16; i++) + for(int i = 0; i < 16; i++) { for(size_t t = 4; t < 12; ++t) { - cn_aes_pseudo_round_mut( sharedMemory, state + 4u * t, key ); + cn_aes_pseudo_round_mut(sharedMemory, state + 4u * t, key); } // scipt first 4 * 128bit blocks = 4 * 4 uint32_t values mix_and_propagate(state + 4 * 4); } } - cn_keccakf2( (uint64_t *) state ); + cn_keccakf2((uint64_t*)state); if(ALGO == cryptonight_gpu) { - if ( ((uint64_t*)state)[3] < target ) + if(((uint64_t*)state)[3] < target) { - uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + uint32_t idx = atomicInc(d_res_count, 0xFFFFFFFF); if(idx < 10) d_res_nonce[idx] = thread; @@ -228,19 +225,19 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 } else { - switch ( ( (uint8_t *) state )[0] & 0x03 ) + switch(((uint8_t*)state)[0] & 0x03) { case 0: - cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); + cn_blake((const uint8_t*)state, 200, (uint8_t*)hash); break; case 1: - cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); + cn_groestl((const BitSequence*)state, 200, (BitSequence*)hash); break; case 2: - cn_jh( (const BitSequence 
*) state, 200, (BitSequence *) hash ); + cn_jh((const BitSequence*)state, 200, (BitSequence*)hash); break; case 3: - cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); + cn_skein((const BitSequence*)state, 200, (BitSequence*)hash); break; default: break; @@ -249,9 +246,9 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values // and expect an accurate result for target > 32-bit without implementing carries - if ( hash[3] < target ) + if(hash[3] < target) { - uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + uint32_t idx = atomicInc(d_res_count, 0xFFFFFFFF); if(idx < 10) d_res_nonce[idx] = thread; @@ -259,10 +256,10 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 } } -extern "C" void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len ) +extern "C" void cryptonight_extra_cpu_set_data(nvid_ctx* ctx, const void* data, uint32_t len) { ctx->inputlen = len; - CUDA_CHECK(ctx->device_id, cudaMemcpy( ctx->d_input, data, len, cudaMemcpyHostToDevice )); + CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, data, len, cudaMemcpyHostToDevice)); } extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) @@ -290,7 +287,6 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) case 3: CUDA_CHECK(ctx->device_id, cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); break; - }; // prefer shared memory over L1 cache @@ -314,8 +310,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() || std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() || std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() || - std::find(neededAlgorithms.begin(), neededAlgorithms.end(), 
cryptonight_superfast) != neededAlgorithms.end() - ) + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end()) { // extent ctx_b to hold the state of idx0 ctx_b_size += sizeof(uint32_t) * wsize; @@ -326,16 +321,14 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) { ctx_b_size += sizeof(uint32_t) * 4 * wsize; } - else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) - || (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end())) + else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) || (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end())) { // bx0 (16byte), bx1 (16byte), division_result (8byte) and sqrt_result (8byte), padding (16byte) ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize; } else if( std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r) != neededAlgorithms.end() || - std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end() - ) + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end()) { // bx0 (16byte), bx1 (16byte), and [r0, r1, r2, r3] (a 8byte) ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize; @@ -349,9 +342,9 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize)); CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, ctx_b_size)); // POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format - CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 32 * sizeof (uint32_t ) )); - CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) )); - CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof 
(uint32_t ) )); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 32 * sizeof(uint32_t))); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof(uint32_t))); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof(uint32_t))); CUDA_CHECK_MSG( ctx->device_id, "\n**suggestion: Try to reduce the value of the attribute 'threads' in the NVIDIA config file.**", @@ -364,106 +357,102 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce int threadsperblock = 128; uint32_t wsize = ctx->device_blocks * ctx->device_threads; - dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock ); - dim3 block( threadsperblock ); + dim3 grid((wsize + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); if(miner_algo == cryptonight_heavy) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_haven) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_superfast) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, 
ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_bittube2) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_monero_v8) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_gpu) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_r) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + 
CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_r_wow) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_v8_reversewaltz) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else { /* pass two times d_ctx_state because the second state is used later in phase1, * the first is used than in phase3 */ - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state, ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } } -extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo) +extern "C" void 
cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t* resnonce, const xmrstak_algo& miner_algo) { int threadsperblock = 128; uint32_t wsize = ctx->device_blocks * ctx->device_threads; - dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock ); - dim3 block( threadsperblock ); + dim3 grid((wsize + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); - CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) )); - CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) )); + CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_nonce, 0xFF, 10 * sizeof(uint32_t))); + CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_count, 0, sizeof(uint32_t))); if(miner_algo == cryptonight_heavy) { CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_haven) { CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_superfast) { CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + 
cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_bittube2) { CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_gpu) { @@ -471,8 +460,7 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } else { @@ -480,16 +468,14 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } - CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost )); + CUDA_CHECK(ctx->device_id, cudaMemcpy(rescount, ctx->d_result_count, sizeof(uint32_t), cudaMemcpyDeviceToHost)); CUDA_CHECK_MSG( ctx->device_id, "\n**suggestion: Try to increase the attribute 'bfactor' in the NVIDIA config file.**", - cudaMemcpy( 
resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ) - ); + cudaMemcpy(resnonce, ctx->d_result_nonce, 10 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); /* There is only a 32bit limit for the counter on the device side * therefore this value can be greater than 10, in that case limit rescount @@ -497,11 +483,11 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, */ if(*rescount > 10) *rescount = 10; - for(int i=0; i < *rescount; i++) + for(int i = 0; i < *rescount; i++) resnonce[i] += startNonce; } -extern "C" int cuda_get_devicecount( int* deviceCount) +extern "C" int cuda_get_devicecount(int* deviceCount) { cudaError_t err; *deviceCount = 0; @@ -574,12 +560,13 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) ctx->device_mpcount = props.multiProcessorCount; ctx->device_arch[0] = props.major; ctx->device_arch[1] = props.minor; + ctx->device_maxThreadsPerBlock = props.maxThreadsPerBlock; const int gpuArch = ctx->device_arch[0] * 10 + ctx->device_arch[1]; ctx->name = std::string(props.name); - printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"... 
", + printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"...\n", version / 1000, (version % 1000 / 10), CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10, ctx->device_id, gpuArch, ctx->device_name); @@ -587,17 +574,17 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) std::vector arch; #define XMRSTAK_PP_TOSTRING1(str) #str #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str) - char const * archStringList = XMRSTAK_PP_TOSTRING(XMRSTAK_CUDA_ARCH_LIST); + char const* archStringList = XMRSTAK_PP_TOSTRING(XMRSTAK_CUDA_ARCH_LIST); #undef XMRSTAK_PP_TOSTRING #undef XMRSTAK_PP_TOSTRING1 std::stringstream ss(archStringList); //transform string list separated with `+` into a vector of integers int tmpArch; - while ( ss >> tmpArch ) - arch.push_back( tmpArch ); + while(ss >> tmpArch) + arch.push_back(tmpArch); - #define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n" +#define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n" if(gpuArch >= 20 && gpuArch < 30) { // compiled binary must support sm_20 for fermi @@ -618,7 +605,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) * with a sm_20 only compiled binary */ for(int i = 0; i < arch.size(); ++i) - if(arch[i] >= 30 && (minSupportedArch == 0 || arch[i] < minSupportedArch)) + if(arch[i] >= 30 && (minSupportedArch == 0 || arch[i] < minSupportedArch)) minSupportedArch = arch[i]; if(minSupportedArch < 30 || gpuArch < minSupportedArch) { @@ -630,7 +617,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end(); - // set all device option those marked as auto (-1) to a valid value if(ctx->device_blocks == -1) { @@ -648,6 +634,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) 
if(props.multiProcessorCount <= 6) ctx->device_bfactor += 2; } + + // for the most algorithms we are using 8 threads per hash + uint32_t threadsPerHash = 8; + if(ctx->device_threads == -1) { /* sm_20 devices can only run 512 threads per cuda block @@ -656,9 +646,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) */ const uint32_t maxThreadsPerBlock = props.major < 3 ? 512 : 1024; - // for the most algorithms we are using 8 threads per hash - uint32_t threadsPerHash = 8; - // phase2_gpu uses 16 threads per hash if(useCryptonight_gpu) threadsPerHash = 16; @@ -700,7 +687,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) int* tmp; cudaError_t err; - #define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n" +#define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n" // a device must be selected to get the right memory usage later on err = cudaSetDevice(ctx->device_id); if(err != cudaSuccess) @@ -716,7 +703,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) return 3; } - size_t freeMemory = 0; size_t totalMemory = 0; CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory)); @@ -746,7 +732,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) size_t usedMem = totalMemory - freeMemory; if(usedMem >= maxMemUsage) { - printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem/byteToMiB).c_str()); + printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem / byteToMiB).c_str()); return 4; } else @@ -764,8 +750,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() || std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() || std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() || - std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != 
neededAlgorithms.end() - ) + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end()) perThread += 50 * 4; // state double buffer size_t max_intensity = limitedMemory / perThread; @@ -805,22 +790,30 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) // 8 is chosen by checking the occupancy calculator size_t blockOptimal = 8 * ctx->device_mpcount; + if(gpuArch == 30) + blockOptimal = 8 * ctx->device_mpcount; // the following values are calculated with CUDA10 and the occupancy calculator - if(gpuArch == 35 || gpuArch/10 == 5 || gpuArch/10 == 6) - blockOptimal = 7 * ctx->device_mpcount; + if(gpuArch == 35 || gpuArch / 10 == 5 || gpuArch / 10 == 6) + blockOptimal = 7 * ctx->device_mpcount; if(gpuArch == 37) - blockOptimal = 14 * ctx->device_mpcount; + blockOptimal = 14 * ctx->device_mpcount; if(gpuArch >= 70) - blockOptimal = 6 * ctx->device_mpcount; + blockOptimal = 6 * ctx->device_mpcount; if(blockOptimal * threads * hashMemSize < limitedMemory) - { - ctx->device_threads = threads; ctx->device_blocks = blockOptimal; - } - + else + ctx->device_blocks = limitedMemory / hashMemSize / threads; // round to a memory fitting value + ctx->device_threads = threads; } } + + if(ctx->device_threads * threadsPerHash > ctx->device_maxThreadsPerBlock) + { + // by default cryptonight CUDA implementations uses 8 threads per thread for some kernel + ctx->device_threads = ctx->device_maxThreadsPerBlock / threadsPerHash; + printf("WARNING: 'threads' configuration to large, value adjusted to %i\n", ctx->device_threads); + } printf("device init succeeded\n"); return 0; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp index 4d369f843..ec7e3e0a4 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp @@ -2,31 +2,13 @@ #include "xmrstak/backend/cryptonight.hpp" -#ifdef __INTELLISENSE__ -#define __CUDA_ARCH__ 520 -/* 
avoid red underlining */ - -struct uint3 -{ - unsigned int x, y, z; -}; - -struct uint3 threadIdx; -struct uint3 blockIdx; -struct uint3 blockDim; -#define __funnelshift_r(a,b,c) 1 -#define __syncthreads() -#define asm(x) -#define __shfl(a,b,c) 1 -#endif - -#define AES_BLOCK_SIZE 16 -#define AES_KEY_SIZE 32 -#define INIT_SIZE_BLK 8 +#define AES_BLOCK_SIZE 16 +#define AES_KEY_SIZE 32 +#define INIT_SIZE_BLK 8 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B -#define C32(x) ((uint32_t)(x ## U)) -#define T32(x) ((x) & C32(0xFFFFFFFF)) +#define C32(x) ((uint32_t)(x##U)) +#define T32(x) ((x)&C32(0xFFFFFFFF)) #if __CUDA_ARCH__ >= 350 __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) @@ -34,71 +16,112 @@ __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int uint2 result; if(offset >= 32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.x) + : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.y) + : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.x) + : 
"r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.y) + : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); } - return __double_as_longlong(__hiloint2double(result.y, result.x)); + return __double_as_longlong(__hiloint2double(result.y, result.x)); } -#define ROTL64(x, n) (cuda_ROTL64(x, n)) + +# define ROTL64(x, n) (cuda_ROTL64(x, n)) #else -#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +# define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) #endif #if __CUDA_ARCH__ < 350 #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #else -#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) -#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) +#define ROTL32(x, n) __funnelshift_l((x), (x), (n)) +#define ROTR32(x, n) __funnelshift_r((x), (x), (n)) #endif -#define MEMSET8(dst,what,cnt) { \ - int i_memset8; \ - uint64_t *out_memset8 = (uint64_t *)(dst); \ - for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \ - out_memset8[i_memset8] = (what); } - -#define MEMSET4(dst,what,cnt) { \ - int i_memset4; \ - uint32_t *out_memset4 = (uint32_t *)(dst); \ - for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \ - out_memset4[i_memset4] = (what); } - -#define MEMCPY8(dst,src,cnt) { \ - int i_memcpy8; \ - uint64_t *in_memcpy8 = (uint64_t *)(src); \ - uint64_t *out_memcpy8 = (uint64_t *)(dst); \ - for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \ - out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; } - -#define MEMCPY4(dst,src,cnt) { \ - int i_memcpy4; \ - uint32_t *in_memcpy4 = (uint32_t *)(src); \ - uint32_t *out_memcpy4 = (uint32_t *)(dst); \ - for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \ - out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; } - -#define XOR_BLOCKS(a,b) { \ - ((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \ - 
((uint64_t *)a)[1] ^= ((uint64_t *)b)[1]; } - -#define XOR_BLOCKS_DST(x,y,z) { \ - ((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \ - ((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; } - -#define MUL_SUM_XOR_DST(a,c,dst) { \ - const uint64_t dst0 = ((uint64_t *)dst)[0]; \ - uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], dst0, &hi) + ((uint64_t *)c)[1]; \ - hi += ((uint64_t *)c)[0]; \ - ((uint64_t *)c)[0] = dst0 ^ hi; \ - ((uint64_t *)dst)[0] = hi; \ - ((uint64_t *)c)[1] = atomicExch(((unsigned long long int *)dst) + 1, (unsigned long long int)lo) ^ lo; \ +#if __CUDA_ARCH__ >= 500 +# define BYTE_0(x) __byte_perm(x, 0u, 0x4440) +# define BYTE_1(x) __byte_perm(x, 0u, 0x4441) +# define BYTE_2(x) __byte_perm(x, 0u, 0x4442) +# define BYTE_3(x) __byte_perm(x, 0u, 0x4443) + +# define ROTL32_8(x) __byte_perm(x, x, 0x2103) +# define ROTL32_16(x) __byte_perm(x, x, 0x1032) +# define ROTL32_24(x) __byte_perm(x, x, 0x0321) +#else +# define BYTE_0(x) (((x) ) & 0xff) +# define BYTE_1(x) (((x) >> 8) & 0xff) +# define BYTE_2(x) (((x) >> 16) & 0xff) +# define BYTE_3(x) (((x) >> 24)) + +# define ROTL32_8(x) ROTL32(x, 8) +# define ROTL32_16(x) ROTL32(x, 16) +# define ROTL32_24(x) ROTL32(x, 24) +#endif + +#define MEMSET8(dst, what, cnt) \ + { \ + int i_memset8; \ + uint64_t* out_memset8 = (uint64_t*)(dst); \ + for(i_memset8 = 0; i_memset8 < cnt; i_memset8++) \ + out_memset8[i_memset8] = (what); \ } -#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff))) +#define MEMSET4(dst, what, cnt) \ + { \ + int i_memset4; \ + uint32_t* out_memset4 = (uint32_t*)(dst); \ + for(i_memset4 = 0; i_memset4 < cnt; i_memset4++) \ + out_memset4[i_memset4] = (what); \ + } +#define MEMCPY8(dst, src, cnt) \ + { \ + int i_memcpy8; \ + uint64_t* in_memcpy8 = (uint64_t*)(src); \ + uint64_t* out_memcpy8 = (uint64_t*)(dst); \ + for(i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++) \ + out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; \ + } + +#define MEMCPY4(dst, src, cnt) \ 
+ { \ + int i_memcpy4; \ + uint32_t* in_memcpy4 = (uint32_t*)(src); \ + uint32_t* out_memcpy4 = (uint32_t*)(dst); \ + for(i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++) \ + out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; \ + } + +#define XOR_BLOCKS(a, b) \ + { \ + ((uint64_t*)a)[0] ^= ((uint64_t*)b)[0]; \ + ((uint64_t*)a)[1] ^= ((uint64_t*)b)[1]; \ + } + +#define XOR_BLOCKS_DST(x, y, z) \ + { \ + ((uint64_t*)z)[0] = ((uint64_t*)(x))[0] ^ ((uint64_t*)(y))[0]; \ + ((uint64_t*)z)[1] = ((uint64_t*)(x))[1] ^ ((uint64_t*)(y))[1]; \ + } + +#define MUL_SUM_XOR_DST(a, c, dst) \ + { \ + const uint64_t dst0 = ((uint64_t*)dst)[0]; \ + uint64_t hi, lo = cuda_mul128(((uint64_t*)a)[0], dst0, &hi) + ((uint64_t*)c)[1]; \ + hi += ((uint64_t*)c)[0]; \ + ((uint64_t*)c)[0] = dst0 ^ hi; \ + ((uint64_t*)dst)[0] = hi; \ + ((uint64_t*)c)[1] = atomicExch(((unsigned long long int*)dst) + 1, (unsigned long long int)lo) ^ lo; \ + } + +#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff))) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp index 555ccbef2..a8dd1fcb2 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp @@ -2,7 +2,6 @@ #include - __device__ __forceinline__ int64_t fast_div_heavy(int64_t _a, int _b) { diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp index 0d54f1436..1fc85b2d0 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp @@ -18,19 +18,19 @@ __device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b) { const uint32_t r = get_reciprocal(b); const uint32_t a1 = ((uint32_t*)&a)[1]; - const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * a1) + a; + const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r)*a1) + a; 
const uint32_t q = ((uint32_t*)&k)[1]; - int64_t tmp = a - ((uint64_t)(q) * b); + int64_t tmp = a - ((uint64_t)(q)*b); ((int32_t*)(&tmp))[1] -= q < a1 ? b : 0; - + const int overshoot = ((int*)(&tmp))[1] >> 31; const int64_t tmp_u = (uint32_t)(b - 1) - tmp; const int undershoot = ((int*)&tmp_u)[1] >> 31; uint64_t result; ((uint32_t*)&result)[0] = q + overshoot - undershoot; - ((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot) & b) - ((uint32_t)(undershoot) & b); + ((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot)&b) - ((uint32_t)(undershoot)&b); return result; } @@ -39,14 +39,18 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1) { float x = __uint_as_float((((uint32_t*)&n1)[1] >> 9) + ((64U + 127U) << 23)); float x1; - asm("rsqrt.approx.f32 %0, %1;" : "=f"(x1) : "f"(x)); - asm("sqrt.approx.f32 %0, %1;" : "=f"(x) : "f"(x)); + asm("rsqrt.approx.f32 %0, %1;" + : "=f"(x1) + : "f"(x)); + asm("sqrt.approx.f32 %0, %1;" + : "=f"(x) + : "f"(x)); // The following line does x1 *= 4294967296.0f; x1 = __uint_as_float(__float_as_uint(x1) + (32U << 23)); const uint32_t x0 = __float_as_uint(x) - (158U << 23); - const int64_t delta0 = n1 - (((int64_t)(x0) * x0) << 18); + const int64_t delta0 = n1 - (((int64_t)(x0)*x0) << 18); const float delta = __int2float_rn(((int32_t*)&delta0)[1]) * x1; uint32_t result = (x0 << 10) + __float2int_rn(delta); @@ -56,6 +60,6 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1) const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1; const int32_t overshoot = ((int64_t)(x2 + b) > 0) ? -1 : 0; const int32_t undershoot = ((int64_t)(x2 + 0x100000000UL + s) < 0) ? 
1 : 0; - result += (overshoot+undershoot); + result += (overshoot + undershoot); return result; } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp index d5a98b7da..3bec5b1a2 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp @@ -4,173 +4,142 @@ #define GROESTL_LENGTHFIELDLEN GROESTL_ROWS #define GROESTL_COLS512 8 -#define GROESTL_SIZE512 (GROESTL_ROWS*GROESTL_COLS512) +#define GROESTL_SIZE512 (GROESTL_ROWS * GROESTL_COLS512) #define GROESTL_ROUNDS512 10 #define GROESTL_HASH_BIT_LEN 256 #define GROESTL_ROTL32(v, n) ROTL32(v, n) - #define li_32(h) 0x##h##u -#define GROESTL_EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n))) +#define GROESTL_EXT_BYTE(var, n) ((uint8_t)((uint32_t)(var) >> (8 * n))) -#define u32BIG(a) \ - ((GROESTL_ROTL32(a,8) & li_32(00FF00FF)) | (GROESTL_ROTL32(a,24) & li_32(FF00FF00))) +#define u32BIG(a) \ + ((GROESTL_ROTL32(a, 8) & li_32(00FF00FF)) | (GROESTL_ROTL32(a, 24) & li_32(FF00FF00))) -typedef struct { - uint32_t chaining[GROESTL_SIZE512/sizeof(uint32_t)]; /* actual state */ +typedef struct +{ + uint32_t chaining[GROESTL_SIZE512 / sizeof(uint32_t)]; /* actual state */ uint32_t block_counter1, - block_counter2; /* message block counter(s) */ - BitSequence buffer[GROESTL_SIZE512]; /* data buffer */ - int buf_ptr; /* data buffer pointer */ - int bits_in_last_byte; /* no. of message bits in last byte of data buffer */ + block_counter2; /* message block counter(s) */ + BitSequence buffer[GROESTL_SIZE512]; /* data buffer */ + int buf_ptr; /* data buffer pointer */ + int bits_in_last_byte; /* no. 
of message bits in last byte of data buffer */ } groestlHashState; - __constant__ uint32_t d_groestl_T[512] = -{ - 0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc -, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5 -, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d -, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded -, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1 -, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441 -, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4 -, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba -, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616 -, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 
0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2 -, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c -, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de -, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7 -, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e -, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c -, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7 -, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b -, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4 -, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e -, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a -, 0x3b4d96db, 0xdb3bad4d, 
0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37 -, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86 -, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b -, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028 -, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3 -, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94 -, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836 -, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0 -, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2 -, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e -, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 
0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3 -, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e -}; - -#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) \ - { temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \ - v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \ - v1 = temp_var; } - -#define GROESTL_COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \ - tu = d_groestl_T[2*(uint32_t)x[4*c0+0]]; \ - tl = d_groestl_T[2*(uint32_t)x[4*c0+0]+1]; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c1+1]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c1+1]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c2+2]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c2+2]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c3+3]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c3+3]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tl ^= d_groestl_T[2*(uint32_t)x[4*c4+0]]; \ - tu ^= d_groestl_T[2*(uint32_t)x[4*c4+0]+1]; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c5+1]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c5+1]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c6+2]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c6+2]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c7+3]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c7+3]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - y[i] = tu; \ - y[i+1] = tl; - -__device__ void cn_groestl_RND512P(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r) + { + 0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 
0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 
0xed2c9ab6, 0xb6ed772c, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 
0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 
0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e}; + +#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) \ + { \ + temp_var = (v1 << (8 * amount_bytes)) | (v2 >> (8 * (4 - amount_bytes))); \ + v2 = (v2 << (8 * amount_bytes)) | (v1 >> (8 * (4 - amount_bytes))); \ + v1 = temp_var; \ + } + +#define GROESTL_COLUMN(x, y, i, c0, c1, c2, c3, c4, c5, c6, c7, tv1, tv2, tu, tl, t) \ + tu = d_groestl_T[2 * (uint32_t)x[4 * c0 + 0]]; \ + tl = d_groestl_T[2 * (uint32_t)x[4 * c0 + 0] + 1]; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c1 + 1]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c1 + 1] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 1, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c2 + 2]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c2 + 2] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 2, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c3 + 3]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c3 + 3] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 3, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tl ^= d_groestl_T[2 * (uint32_t)x[4 * c4 + 0]]; \ + tu ^= d_groestl_T[2 * (uint32_t)x[4 * c4 + 0] + 1]; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c5 + 1]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c5 + 1] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 1, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c6 + 2]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c6 + 2] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 2, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c7 + 3]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c7 + 3] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 3, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + y[i] = tu; \ + y[i + 1] = tl; + +__device__ void cn_groestl_RND512P(uint8_t* __restrict__ x, uint32_t* __restrict__ y, uint32_t r) { uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; uint32_t* x32 = (uint32_t*)x; - x32[ 0] ^= 0x00000000^r; - x32[ 2] ^= 
0x00000010^r; - x32[ 4] ^= 0x00000020^r; - x32[ 6] ^= 0x00000030^r; - x32[ 8] ^= 0x00000040^r; - x32[10] ^= 0x00000050^r; - x32[12] ^= 0x00000060^r; - x32[14] ^= 0x00000070^r; - GROESTL_COLUMN(x,y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y,10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y,12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y,14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + x32[0] ^= 0x00000000 ^ r; + x32[2] ^= 0x00000010 ^ r; + x32[4] ^= 0x00000020 ^ r; + x32[6] ^= 0x00000030 ^ r; + x32[8] ^= 0x00000040 ^ r; + x32[10] ^= 0x00000050 ^ r; + x32[12] ^= 0x00000060 ^ r; + x32[14] ^= 0x00000070 ^ r; + GROESTL_COLUMN(x, y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + 
GROESTL_COLUMN(x, y, 12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); } -__device__ void cn_groestl_RND512Q(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r) +__device__ void cn_groestl_RND512Q(uint8_t* __restrict__ x, uint32_t* __restrict__ y, uint32_t r) { uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; uint32_t* x32 = (uint32_t*)x; - x32[ 0] = ~x32[ 0]; - x32[ 1] ^= 0xffffffff^r; - x32[ 2] = ~x32[ 2]; - x32[ 3] ^= 0xefffffff^r; - x32[ 4] = ~x32[ 4]; - x32[ 5] ^= 0xdfffffff^r; - x32[ 6] = ~x32[ 6]; - x32[ 7] ^= 0xcfffffff^r; - x32[ 8] = ~x32[ 8]; - x32[ 9] ^= 0xbfffffff^r; + x32[0] = ~x32[0]; + x32[1] ^= 0xffffffff ^ r; + x32[2] = ~x32[2]; + x32[3] ^= 0xefffffff ^ r; + x32[4] = ~x32[4]; + x32[5] ^= 0xdfffffff ^ r; + x32[6] = ~x32[6]; + x32[7] ^= 0xcfffffff ^ r; + x32[8] = ~x32[8]; + x32[9] ^= 0xbfffffff ^ r; x32[10] = ~x32[10]; - x32[11] ^= 0xafffffff^r; + x32[11] ^= 0xafffffff ^ r; x32[12] = ~x32[12]; - x32[13] ^= 0x9fffffff^r; + x32[13] ^= 0x9fffffff ^ r; x32[14] = ~x32[14]; - x32[15] ^= 0x8fffffff^r; - GROESTL_COLUMN(x,y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y,10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y,12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, 
temp); - GROESTL_COLUMN(x,y,14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + x32[15] ^= 0x8fffffff ^ r; + GROESTL_COLUMN(x, y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); } -__device__ void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __restrict__ m) +__device__ void cn_groestl_F512(uint32_t* __restrict__ h, const uint32_t* __restrict__ m) { int i; - uint32_t Ptmp[2*GROESTL_COLS512]; - uint32_t Qtmp[2*GROESTL_COLS512]; - uint32_t y[2*GROESTL_COLS512]; - uint32_t z[2*GROESTL_COLS512]; + uint32_t Ptmp[2 * GROESTL_COLS512]; + uint32_t Qtmp[2 * GROESTL_COLS512]; + uint32_t y[2 * GROESTL_COLS512]; + uint32_t z[2 * GROESTL_COLS512]; - for (i = 0; i < 2*GROESTL_COLS512; i++) + for(i = 0; i < 2 * GROESTL_COLS512; i++) { z[i] = m[i]; - Ptmp[i] = h[i]^m[i]; + Ptmp[i] = h[i] ^ m[i]; } cn_groestl_RND512Q((uint8_t*)z, y, 0x00000000); @@ -195,18 +164,18 @@ __device__ void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __re cn_groestl_RND512P((uint8_t*)z, y, 0x00000008); cn_groestl_RND512P((uint8_t*)y, Ptmp, 0x00000009); - for (i = 0; i < 2*GROESTL_COLS512; i++) - h[i] ^= 
Ptmp[i]^Qtmp[i]; + for(i = 0; i < 2 * GROESTL_COLS512; i++) + h[i] ^= Ptmp[i] ^ Qtmp[i]; } -__device__ void cn_groestl_outputtransformation(groestlHashState *ctx) +__device__ void cn_groestl_outputtransformation(groestlHashState* ctx) { int j; - uint32_t temp[2*GROESTL_COLS512]; - uint32_t y[2*GROESTL_COLS512]; - uint32_t z[2*GROESTL_COLS512]; + uint32_t temp[2 * GROESTL_COLS512]; + uint32_t y[2 * GROESTL_COLS512]; + uint32_t z[2 * GROESTL_COLS512]; - for (j = 0; j < 2*GROESTL_COLS512; j++) + for(j = 0; j < 2 * GROESTL_COLS512; j++) temp[j] = ctx->chaining[j]; cn_groestl_RND512P((uint8_t*)temp, y, 0x00000000); @@ -220,33 +189,33 @@ __device__ void cn_groestl_outputtransformation(groestlHashState *ctx) cn_groestl_RND512P((uint8_t*)z, y, 0x00000008); cn_groestl_RND512P((uint8_t*)y, temp, 0x00000009); - for (j = 0; j < 2*GROESTL_COLS512; j++) + for(j = 0; j < 2 * GROESTL_COLS512; j++) ctx->chaining[j] ^= temp[j]; } -__device__ void cn_groestl_transform(groestlHashState * __restrict__ ctx, - const uint8_t * __restrict__ input, int msglen) +__device__ void cn_groestl_transform(groestlHashState* __restrict__ ctx, + const uint8_t* __restrict__ input, int msglen) { - for (; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512) + for(; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512) { - cn_groestl_F512(ctx->chaining,(uint32_t*)input); + cn_groestl_F512(ctx->chaining, (uint32_t*)input); ctx->block_counter1++; - if (ctx->block_counter1 == 0) + if(ctx->block_counter1 == 0) ctx->block_counter2++; } } -__device__ void cn_groestl_final(groestlHashState* __restrict__ ctx, - BitSequence* __restrict__ output) +__device__ void cn_groestl_final(groestlHashState* __restrict__ ctx, + BitSequence* __restrict__ output) { - int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN/8; - uint8_t *s = (BitSequence*)ctx->chaining; + int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN / 8; + uint8_t* s = (BitSequence*)ctx->chaining; - if 
(ctx->bits_in_last_byte) + if(ctx->bits_in_last_byte) { - ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<bits_in_last_byte)-1)<<(8-ctx->bits_in_last_byte); - ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-ctx->bits_in_last_byte); + ctx->buffer[(int)ctx->buf_ptr - 1] &= ((1 << ctx->bits_in_last_byte) - 1) << (8 - ctx->bits_in_last_byte); + ctx->buffer[(int)ctx->buf_ptr - 1] ^= 0x1 << (7 - ctx->bits_in_last_byte); ctx->bits_in_last_byte = 0; } else @@ -254,29 +223,29 @@ __device__ void cn_groestl_final(groestlHashState* __restrict__ ctx, ctx->buffer[(int)ctx->buf_ptr++] = 0x80; } - if (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + if(ctx->buf_ptr > GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN) { - while (ctx->buf_ptr < GROESTL_SIZE512) + while(ctx->buf_ptr < GROESTL_SIZE512) ctx->buffer[(int)ctx->buf_ptr++] = 0; cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); ctx->buf_ptr = 0; } - while (ctx->buf_ptr < GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + while(ctx->buf_ptr < GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN) ctx->buffer[(int)ctx->buf_ptr++] = 0; ctx->block_counter1++; - if (ctx->block_counter1 == 0) + if(ctx->block_counter1 == 0) ctx->block_counter2++; ctx->buf_ptr = GROESTL_SIZE512; - while (ctx->buf_ptr > GROESTL_SIZE512-(int)sizeof(uint32_t)) + while(ctx->buf_ptr > GROESTL_SIZE512 - (int)sizeof(uint32_t)) { ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1; ctx->block_counter1 >>= 8; } - while (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + while(ctx->buf_ptr > GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN) { ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2; ctx->block_counter2 >>= 8; @@ -284,12 +253,12 @@ __device__ void cn_groestl_final(groestlHashState* __restrict__ ctx, cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); cn_groestl_outputtransformation(ctx); - for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) + for(i = GROESTL_SIZE512 - hashbytelen; i < GROESTL_SIZE512; i++, j++) output[j] = s[i]; - 
for (i = 0; i < GROESTL_COLS512; i++) + for(i = 0; i < GROESTL_COLS512; i++) ctx->chaining[i] = 0; - for (i = 0; i < GROESTL_SIZE512; i++) + for(i = 0; i < GROESTL_SIZE512; i++) ctx->buffer[i] = 0; } @@ -297,17 +266,17 @@ __device__ void cn_groestl_update(groestlHashState* __restrict__ ctx, const BitSequence* __restrict__ input, DataLength databitlen) { int index = 0; - int msglen = (int)(databitlen/8); - int rem = (int)(databitlen%8); + int msglen = (int)(databitlen / 8); + int rem = (int)(databitlen % 8); - if (ctx->buf_ptr) + if(ctx->buf_ptr) { - while (ctx->buf_ptr < GROESTL_SIZE512 && index < msglen) + while(ctx->buf_ptr < GROESTL_SIZE512 && index < msglen) ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - if (ctx->buf_ptr < GROESTL_SIZE512) + if(ctx->buf_ptr < GROESTL_SIZE512) { - if (rem) + if(rem) { ctx->bits_in_last_byte = rem; ctx->buffer[(int)ctx->buf_ptr++] = input[index]; @@ -319,13 +288,13 @@ __device__ void cn_groestl_update(groestlHashState* __restrict__ ctx, cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); } - cn_groestl_transform(ctx, input+index, msglen-index); - index += ((msglen-index)/GROESTL_SIZE512)*GROESTL_SIZE512; + cn_groestl_transform(ctx, input + index, msglen - index); + index += ((msglen - index) / GROESTL_SIZE512) * GROESTL_SIZE512; - while (index < msglen) + while(index < msglen) ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - if (rem) + if(rem) { ctx->bits_in_last_byte = rem; ctx->buffer[(int)ctx->buf_ptr++] = input[index]; @@ -336,17 +305,17 @@ __device__ void cn_groestl_init(groestlHashState* ctx) { int i = 0; - for(;i<(GROESTL_SIZE512/sizeof(uint32_t));i++) + for(; i < (GROESTL_SIZE512 / sizeof(uint32_t)); i++) ctx->chaining[i] = 0; - ctx->chaining[2*GROESTL_COLS512-1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN); + ctx->chaining[2 * GROESTL_COLS512 - 1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN); ctx->buf_ptr = 0; ctx->block_counter1 = 0; ctx->block_counter2 = 0; ctx->bits_in_last_byte = 0; } -__device__ void 
cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +__device__ void cn_groestl(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval) { DataLength databitlen = len << 3; groestlHashState context; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp index 284039ff4..1019a9b9c 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp @@ -1,6 +1,7 @@ #include -typedef struct { +typedef struct +{ int hashbitlen; unsigned long long databitlen; unsigned long long datasize_in_buffer; @@ -9,159 +10,175 @@ typedef struct { } jhHashState; __constant__ unsigned char d_JH256_H0[512] = -{ - 0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1, - 0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3, - 0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77, - 0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8, - 0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62, - 0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c, - 0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf, - 0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69 -}; + { + 0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1, + 0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3, + 0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77, + 0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8, + 0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 
0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62, + 0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c, + 0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf, + 0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69}; __constant__ unsigned char d_E8_rc[42][32] = -{ - {0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40}, - {0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31}, - {0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc}, - {0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3}, - {0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23}, - {0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97}, - {0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14}, - {0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4}, - {0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 
0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36}, - {0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f}, - {0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b}, - {0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62}, - {0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5}, - {0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f}, - {0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a}, - {0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf}, - {0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0}, - {0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a}, - {0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 
0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6}, - {0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67}, - {0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18}, - {0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e}, - {0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1}, - {0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83}, - {0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef}, - {0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65}, - {0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c}, - {0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71}, - {0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 
0x3c, 0x2, 0xe8, 0x43, 0xb0}, - {0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f}, - {0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad}, - {0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6}, - {0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63}, - {0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f}, - {0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a}, - {0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5}, - {0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48}, - {0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e}, - {0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7}, - {0x8b, 0xa0, 0xdf, 0x15, 
0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde}, - {0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a}, - {0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2} -}; - -#define JH_SWAP1(x) (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1)); -#define JH_SWAP2(x) (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2)); -#define JH_SWAP4(x) (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4)); -#define JH_SWAP8(x) (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8)); -#define JH_SWAP16(x) (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16)); -#define JH_SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); - -#define JH_L(m0,m1,m2,m3,m4,m5,m6,m7) \ - (m4) ^= (m1); \ - (m5) ^= (m2); \ - (m6) ^= (m0) ^ (m3); \ - (m7) ^= (m0); \ - (m0) ^= (m5); \ - (m1) ^= (m6); \ - (m2) ^= (m4) ^ (m7); \ + { + {0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40}, + {0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31}, + {0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc}, + {0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 
0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3}, + {0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23}, + {0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97}, + {0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14}, + {0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4}, + {0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36}, + {0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f}, + {0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b}, + {0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62}, + {0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5}, + {0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 
0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f}, + {0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a}, + {0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf}, + {0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0}, + {0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a}, + {0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6}, + {0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67}, + {0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18}, + {0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e}, + {0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1}, + {0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 
0x83}, + {0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef}, + {0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65}, + {0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c}, + {0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71}, + {0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0}, + {0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f}, + {0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad}, + {0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6}, + {0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63}, + {0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f}, + {0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 
0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a}, + {0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5}, + {0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48}, + {0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e}, + {0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7}, + {0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde}, + {0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a}, + {0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}}; + +#define JH_SWAP1(x) (x) = ((((x)&0x5555555555555555ULL) << 1) | (((x)&0xaaaaaaaaaaaaaaaaULL) >> 1)); +#define JH_SWAP2(x) (x) = ((((x)&0x3333333333333333ULL) << 2) | (((x)&0xccccccccccccccccULL) >> 2)); +#define JH_SWAP4(x) (x) = ((((x)&0x0f0f0f0f0f0f0f0fULL) << 4) | (((x)&0xf0f0f0f0f0f0f0f0ULL) >> 4)); +#define JH_SWAP8(x) (x) = ((((x)&0x00ff00ff00ff00ffULL) << 8) | (((x)&0xff00ff00ff00ff00ULL) >> 8)); +#define JH_SWAP16(x) (x) = ((((x)&0x0000ffff0000ffffULL) << 16) | 
(((x)&0xffff0000ffff0000ULL) >> 16)); +#define JH_SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); + +#define JH_L(m0, m1, m2, m3, m4, m5, m6, m7) \ + (m4) ^= (m1); \ + (m5) ^= (m2); \ + (m6) ^= (m0) ^ (m3); \ + (m7) ^= (m0); \ + (m0) ^= (m5); \ + (m1) ^= (m6); \ + (m2) ^= (m4) ^ (m7); \ (m3) ^= (m4); -#define JH_SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1) \ - m3 = ~(m3); \ - m7 = ~(m7); \ - m0 ^= ((~(m2)) & (cc0)); \ - m4 ^= ((~(m6)) & (cc1)); \ - temp0 = (cc0) ^ ((m0) & (m1));\ - temp1 = (cc1) ^ ((m4) & (m5));\ - m0 ^= ((m2) & (m3)); \ - m4 ^= ((m6) & (m7)); \ - m3 ^= ((~(m1)) & (m2)); \ - m7 ^= ((~(m5)) & (m6)); \ - m1 ^= ((m0) & (m2)); \ - m5 ^= ((m4) & (m6)); \ - m2 ^= ((m0) & (~(m3))); \ - m6 ^= ((m4) & (~(m7))); \ - m0 ^= ((m1) | (m3)); \ - m4 ^= ((m5) | (m7)); \ - m3 ^= ((m1) & (m2)); \ - m7 ^= ((m5) & (m6)); \ - m1 ^= (temp0 & (m0)); \ - m5 ^= (temp1 & (m4)); \ - m2 ^= temp0; \ +#define JH_SS(m0, m1, m2, m3, m4, m5, m6, m7, cc0, cc1) \ + m3 = ~(m3); \ + m7 = ~(m7); \ + m0 ^= ((~(m2)) & (cc0)); \ + m4 ^= ((~(m6)) & (cc1)); \ + temp0 = (cc0) ^ ((m0) & (m1)); \ + temp1 = (cc1) ^ ((m4) & (m5)); \ + m0 ^= ((m2) & (m3)); \ + m4 ^= ((m6) & (m7)); \ + m3 ^= ((~(m1)) & (m2)); \ + m7 ^= ((~(m5)) & (m6)); \ + m1 ^= ((m0) & (m2)); \ + m5 ^= ((m4) & (m6)); \ + m2 ^= ((m0) & (~(m3))); \ + m6 ^= ((m4) & (~(m7))); \ + m0 ^= ((m1) | (m3)); \ + m4 ^= ((m5) | (m7)); \ + m3 ^= ((m1) & (m2)); \ + m7 ^= ((m5) & (m6)); \ + m1 ^= (temp0 & (m0)); \ + m5 ^= (temp1 & (m4)); \ + m2 ^= temp0; \ m6 ^= temp1; -__device__ void cn_jh_E8(jhHashState *state) +__device__ void cn_jh_E8(jhHashState* state) { - uint64_t i,roundnumber,temp0,temp1; + uint64_t i, roundnumber, temp0, temp1; - for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) + for(roundnumber = 0; roundnumber < 42; roundnumber = roundnumber + 7) { - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - 
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+0])[i],((uint64_t *)d_E8_rc[roundnumber+0])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP1(state->x[1][i]); JH_SWAP1(state->x[3][i]); JH_SWAP1(state->x[5][i]); JH_SWAP1(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 0])[i], ((uint64_t*)d_E8_rc[roundnumber + 0])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP1(state->x[1][i]); + JH_SWAP1(state->x[3][i]); + JH_SWAP1(state->x[5][i]); + JH_SWAP1(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+1])[i],((uint64_t *)d_E8_rc[roundnumber+1])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP2(state->x[1][i]); JH_SWAP2(state->x[3][i]); JH_SWAP2(state->x[5][i]); JH_SWAP2(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 1])[i], ((uint64_t*)d_E8_rc[roundnumber + 1])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP2(state->x[1][i]); + JH_SWAP2(state->x[3][i]); + JH_SWAP2(state->x[5][i]); + JH_SWAP2(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - 
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+2])[i],((uint64_t *)d_E8_rc[roundnumber+2])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP4(state->x[1][i]); JH_SWAP4(state->x[3][i]); JH_SWAP4(state->x[5][i]); JH_SWAP4(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 2])[i], ((uint64_t*)d_E8_rc[roundnumber + 2])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP4(state->x[1][i]); + JH_SWAP4(state->x[3][i]); + JH_SWAP4(state->x[5][i]); + JH_SWAP4(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+3])[i],((uint64_t *)d_E8_rc[roundnumber+3])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP8(state->x[1][i]); JH_SWAP8(state->x[3][i]); JH_SWAP8(state->x[5][i]); JH_SWAP8(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 3])[i], ((uint64_t*)d_E8_rc[roundnumber + 3])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP8(state->x[1][i]); + JH_SWAP8(state->x[3][i]); + JH_SWAP8(state->x[5][i]); + JH_SWAP8(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - 
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+4])[i],((uint64_t *)d_E8_rc[roundnumber+4])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP16(state->x[1][i]); JH_SWAP16(state->x[3][i]); JH_SWAP16(state->x[5][i]); JH_SWAP16(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 4])[i], ((uint64_t*)d_E8_rc[roundnumber + 4])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP16(state->x[1][i]); + JH_SWAP16(state->x[3][i]); + JH_SWAP16(state->x[5][i]); + JH_SWAP16(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+5])[i],((uint64_t *)d_E8_rc[roundnumber+5])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP32(state->x[1][i]); JH_SWAP32(state->x[3][i]); JH_SWAP32(state->x[5][i]); JH_SWAP32(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 5])[i], ((uint64_t*)d_E8_rc[roundnumber + 5])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP32(state->x[1][i]); + JH_SWAP32(state->x[3][i]); + JH_SWAP32(state->x[5][i]); + JH_SWAP32(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - 
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+6])[i],((uint64_t *)d_E8_rc[roundnumber+6])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 6])[i], ((uint64_t*)d_E8_rc[roundnumber + 6])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); } - for (i = 1; i < 8; i = i+2) + for(i = 1; i < 8; i = i + 2) { temp0 = state->x[i][0]; state->x[i][0] = state->x[i][1]; @@ -170,75 +187,75 @@ __device__ void cn_jh_E8(jhHashState *state) } } -__device__ void cn_jh_F8(jhHashState *state) +__device__ void cn_jh_F8(jhHashState* state) { uint64_t i; - for (i = 0; i < 8; i++) - state->x[i >> 1][i & 1] ^= ((uint64_t *)state->buffer)[i]; + for(i = 0; i < 8; i++) + state->x[i >> 1][i & 1] ^= ((uint64_t*)state->buffer)[i]; cn_jh_E8(state); - for (i = 0; i < 8; i++) - state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64_t *)state->buffer)[i]; + for(i = 0; i < 8; i++) + state->x[(8 + i) >> 1][(8 + i) & 1] ^= ((uint64_t*)state->buffer)[i]; } -__device__ void cn_jh_update(jhHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen) +__device__ void cn_jh_update(jhHashState* __restrict__ state, const BitSequence* __restrict__ data, DataLength databitlen) { DataLength index; state->databitlen += databitlen; index = 0; - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) + if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) < 512)) { - if ( (databitlen & 7) == 0 ) - memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 
64-(state->datasize_in_buffer >> 3)); + if((databitlen & 7) == 0) + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3)); else - memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1); + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3) + 1); state->datasize_in_buffer += databitlen; databitlen = 0; } - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) + if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) >= 512)) { - memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ); - index = 64-(state->datasize_in_buffer >> 3); + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3)); + index = 64 - (state->datasize_in_buffer >> 3); databitlen = databitlen - (512 - state->datasize_in_buffer); cn_jh_F8(state); state->datasize_in_buffer = 0; } - for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) + for(; databitlen >= 512; index = index + 64, databitlen = databitlen - 512) { - memcpy(state->buffer, data+index, 64); + memcpy(state->buffer, data + index, 64); cn_jh_F8(state); } - if ( databitlen > 0) + if(databitlen > 0) { - if ((databitlen & 7) == 0) - memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3); + if((databitlen & 7) == 0) + memcpy(state->buffer, data + index, (databitlen & 0x1ff) >> 3); else - memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1); + memcpy(state->buffer, data + index, ((databitlen & 0x1ff) >> 3) + 1); state->datasize_in_buffer = databitlen; } } /*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/ -__device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __restrict__ hashval) +__device__ void cn_jh_final(jhHashState* __restrict__ 
state, BitSequence* __restrict__ hashval) { unsigned int i; //uint32_t *bufptr = (uint32_t *)state->buffer; - if ( (state->databitlen & 0x1ff) == 0 ) + if((state->databitlen & 0x1ff) == 0) { /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ memset(state->buffer, 0, 64); //for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000; - state->buffer[0] = 0x80; + state->buffer[0] = 0x80; state->buffer[63] = state->databitlen & 0xff; - state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; state->buffer[61] = (state->databitlen >> 16) & 0xff; state->buffer[60] = (state->databitlen >> 24) & 0xff; state->buffer[59] = (state->databitlen >> 32) & 0xff; @@ -250,19 +267,19 @@ __device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __re else { /*set the rest of the bytes in the buffer to 0*/ - if ( (state->datasize_in_buffer & 7) == 0) + if((state->datasize_in_buffer & 7) == 0) { - for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) + for(i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0; } else { - for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) + for(i = ((state->databitlen & 0x1ff) >> 3) + 1; i < 64; i++) state->buffer[i] = 0; } /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ - state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7)); + state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7 - (state->databitlen & 7)); cn_jh_F8(state); memset(state->buffer, 0, 64); @@ -278,10 +295,10 @@ __device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __re cn_jh_F8(state); } - memcpy(hashval,(unsigned char*)state->x+64+32,32); + memcpy(hashval, (unsigned char*)state->x + 64 + 32, 32); } -__device__ void cn_jh_init(jhHashState *state, int hashbitlen) +__device__ void cn_jh_init(jhHashState* state, int hashbitlen) { 
state->databitlen = 0; state->datasize_in_buffer = 0; @@ -289,7 +306,7 @@ __device__ void cn_jh_init(jhHashState *state, int hashbitlen) memcpy(state->x, d_JH256_H0, 128); } -__device__ void cn_jh(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +__device__ void cn_jh(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval) { int hashbitlen = 256; DataLength databitlen = len << 3; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp index 3f535631d..5bbc787e3 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp @@ -7,46 +7,61 @@ __constant__ #else const #endif -uint64_t keccakf_rndc[24] ={ - 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, - 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, - 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, - 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, - 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, - 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, - 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, - 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 -}; + uint64_t keccakf_rndc[24] = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008}; #if __CUDA_ARCH__ >= 350 - __forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset) - { - uint2 result; - if(offset >= 32) - { - 
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } - else - { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); - } - #define rotl64_1(x, y) (cuda_rotl64((x), (y))) +/** @param offset must be < 32 + */ +__forceinline__ __device__ uint64_t cuda_rotl64(const uint32_t v0, const uint32_t v1, const int offset) +{ + uint2 result; + + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.y) + : "r"(v0), "r"(v1), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.x) + : "r"(v1), "r"(v0), "r"(offset)); + + return *((uint64_t*)&result); +} +__device__ __forceinline__ uint64_t rotl64_1(const uint64_t x, const int y) +{ + return cuda_rotl64(((uint32_t*)&x)[0], ((uint32_t*)&x)[1], (y)); +} + +__device__ __forceinline__ uint64_t rotl64_2(const uint64_t x, const int y) +{ + return cuda_rotl64(((uint32_t*)&x)[1], ((uint32_t*)&x)[0], (y)); +} + #else - #define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y)))) + +#define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y)))) +__device__ __forceinline__ uint64_t rotl64_2(const uint64_t x, const int y) +{ + uint64_t tmp; + ((uint32_t*)&tmp)[0] = ((uint32_t*)&x)[1]; + ((uint32_t*)&tmp)[1] = ((uint32_t*)&x)[0]; + + return rotl64_1(tmp, (y)); +} #endif -#define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y)) + #define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) 
-__device__ __forceinline__ void cn_keccakf2(uint64_t *s) +__device__ __forceinline__ void cn_keccakf2(uint64_t* s) { - uint8_t i; - - for(i = 0; i < 24; ++i) + for(int16_t i = 0; i < 24; ++i) { uint64_t bc[5], tmpxor[5], tmp1, tmp2; @@ -90,16 +105,46 @@ __device__ __forceinline__ void cn_keccakf2(uint64_t *s) s[7] = rotl64_1(s[10] ^ bc[4], 3); s[10] = rotl64_1(tmp1, 1); - tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); - tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); - tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); - tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); - tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + tmp1 = s[0]; + tmp2 = s[1]; + s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); + s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); + s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); + s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); + s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; + tmp2 = s[6]; + s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); + s[6] = 
bitselect(s[6] ^ s[8], s[6], s[7]); + s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); + s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); + s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; + tmp2 = s[11]; + s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); + s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); + s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); + s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); + s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; + tmp2 = s[16]; + s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); + s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); + s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); + s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); + s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; + tmp2 = s[21]; + s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); + s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); + s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); + s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); + s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); s[0] ^= keccakf_rndc[i]; } } -__device__ __forceinline__ void cn_keccakf(uint64_t *s) +__device__ __forceinline__ void cn_keccakf(uint64_t* s) { uint64_t bc[5], tmpxor[5], tmp1, tmp2; @@ -145,16 +190,46 @@ __device__ __forceinline__ void cn_keccakf(uint64_t *s) s[7] = rotl64_1(s[10] ^ bc[4], 3); s[10] = rotl64_1(tmp1, 1); - tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); - tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); - tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = 
bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); - tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); - tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + tmp1 = s[0]; + tmp2 = s[1]; + s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); + s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); + s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); + s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); + s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; + tmp2 = s[6]; + s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); + s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); + s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); + s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); + s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; + tmp2 = s[11]; + s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); + s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); + s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); + s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); + s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; + tmp2 = s[16]; + s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); + s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); + s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); + s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); + s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; + tmp2 = s[21]; + s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); + s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); + s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); + s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); + s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); s[0] ^= keccakf_rndc[i]; } } 
-__device__ __forceinline__ void cn_keccak(const uint8_t * __restrict__ in, uint32_t len, uint8_t * __restrict__ md) +__device__ __forceinline__ void cn_keccak(const uint8_t* __restrict__ in, uint32_t len, uint8_t* __restrict__ md) { uint64_t st[25]; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp index fc45db1ae..b8073f03b 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp @@ -1,124 +1,146 @@ #pragma once -typedef unsigned int uint_t; /* native unsigned integer */ +typedef unsigned int uint_t; /* native unsigned integer */ -#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ +#define SKEIN_MODIFIER_WORDS (2) /* number of modifier (tweak) words */ -#define SKEIN_256_STATE_WORDS ( 4) -#define SKEIN_512_STATE_WORDS ( 8) +#define SKEIN_256_STATE_WORDS (4) +#define SKEIN_512_STATE_WORDS (8) #define SKEIN1024_STATE_WORDS (16) -#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) -#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) +#define SKEIN_256_STATE_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES (8 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES (8 * SKEIN1024_STATE_WORDS) -#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) -#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS) +#define SKEIN_256_STATE_BITS (64 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS (64 * SKEIN1024_STATE_WORDS) -#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) -#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) +#define SKEIN_256_BLOCK_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES (8 * SKEIN_512_STATE_WORDS) +#define 
SKEIN1024_BLOCK_BYTES (8 * SKEIN1024_STATE_WORDS) -#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) -#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) +#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((uint64_t)(hi32)) << 32)) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22) -#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ +#define SKEIN_T1_BIT(BIT) ((BIT)-64) /* offset 64 because it's the second word */ -#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ -#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ -#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ -#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ -#define SKEIN_T1_FLAG_FIRST (((uint64_t) 1 ) << SKEIN_T1_POS_FIRST) -#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t) 1 ) << SKEIN_T1_POS_BIT_PAD) -#define SKEIN_T1_FLAG_FINAL (((uint64_t) 1 ) << SKEIN_T1_POS_FINAL) +#define SKEIN_T1_FLAG_FIRST (((uint64_t)1) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t)1) << SKEIN_T1_POS_BIT_PAD) +#define SKEIN_T1_FLAG_FINAL (((uint64_t)1) << SKEIN_T1_POS_FINAL) -#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ -#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ -#define SKEIN_T1_BLK_TYPE(T) (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) +#define SKEIN_T1_BLK_TYPE(T) (((uint64_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) -#define SKEIN_T1_BLK_TYPE_MSG 
SKEIN_T1_BLK_TYPE(MSG) /* message processing */ -#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ -#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) -#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} - -#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) -#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) - -#define Skein_Set_T0_T1(ctxPtr,T0,T1) { \ - Skein_Set_T0(ctxPtr,(T0)); \ - Skein_Set_T1(ctxPtr,(T1)); } - -#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ -{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } - -#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } +#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \ + { \ + (ctxPtr)->h.T[TWK_NUM] = (tVal); \ + } -#define KW_TWK_BASE (0) -#define KW_KEY_BASE (3) -#define ks (kw + KW_KEY_BASE) -#define ts (kw + KW_TWK_BASE) +#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0) +#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1) -#define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \ - X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \ - X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \ - X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \ - X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6; +#define Skein_Set_T0_T1(ctxPtr, T0, T1) \ + { \ + Skein_Set_T0(ctxPtr, (T0)); \ + Skein_Set_T1(ctxPtr, (T1)); \ + } -#define I512(R) \ - X0 += ks[((R)+1) % 9]; \ - X1 += ks[((R)+2) % 9]; \ - X2 += ks[((R)+3) % 9]; \ - X3 += ks[((R)+4) % 9]; \ - X4 += ks[((R)+5) % 9]; \ - X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ - X6 += ks[((R)+7) % 
9] + ts[((R)+2) % 3]; \ - X7 += ks[((R)+8) % 9] + (R)+1; +#define Skein_Start_New_Type(ctxPtr, BLK_TYPE) \ + { \ + Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); \ + (ctxPtr)->h.bCnt = 0; \ + } +#define Skein_Set_Bit_Pad_Flag(hdr) \ + { \ + (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \ + } -#define R512_8_rounds(R) \ - R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ - R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ - R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ - R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ - I512(2*(R)); \ - R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ - R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ - R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ - R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ - I512(2*(R)+1); +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +#define R512(p0, p1, p2, p3, p4, p5, p6, p7, R512ROT, rNum) \ + X##p0 += X##p1; \ + X##p1 = ROTL64(X##p1, R512ROT##_0); \ + X##p1 ^= X##p0; \ + X##p2 += X##p3; \ + X##p3 = ROTL64(X##p3, R512ROT##_1); \ + X##p3 ^= X##p2; \ + X##p4 += X##p5; \ + X##p5 = ROTL64(X##p5, R512ROT##_2); \ + X##p5 ^= X##p4; \ + X##p6 += X##p7; \ + X##p7 = ROTL64(X##p7, R512ROT##_3); \ + X##p7 ^= X##p6; + +#define I512(R) \ + X0 += ks[((R) + 1) % 9]; \ + X1 += ks[((R) + 2) % 9]; \ + X2 += ks[((R) + 3) % 9]; \ + X3 += ks[((R) + 4) % 9]; \ + X4 += ks[((R) + 5) % 9]; \ + X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \ + X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \ + X7 += ks[((R) + 8) % 9] + (R) + 1; + +#define R512_8_rounds(R) \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \ + I512(2 * (R)); \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, 
R_512_7, 8 * (R) + 8); \ + I512(2 * (R) + 1); typedef struct { - size_t hashBitLen; - size_t bCnt; - uint64_t T[SKEIN_MODIFIER_WORDS]; + size_t hashBitLen; + size_t bCnt; + uint64_t T[SKEIN_MODIFIER_WORDS]; } Skein_Ctxt_Hdr_t; -typedef struct { +typedef struct +{ Skein_Ctxt_Hdr_t h; - uint64_t X[SKEIN_256_STATE_WORDS]; - uint8_t b[SKEIN_256_BLOCK_BYTES]; + uint64_t X[SKEIN_256_STATE_WORDS]; + uint8_t b[SKEIN_256_BLOCK_BYTES]; } Skein_256_Ctxt_t; -typedef struct { +typedef struct +{ Skein_Ctxt_Hdr_t h; - uint64_t X[SKEIN_512_STATE_WORDS]; - uint8_t b[SKEIN_512_BLOCK_BYTES]; + uint64_t X[SKEIN_512_STATE_WORDS]; + uint8_t b[SKEIN_512_BLOCK_BYTES]; } Skein_512_Ctxt_t; -typedef struct { +typedef struct +{ Skein_Ctxt_Hdr_t h; - uint64_t X[SKEIN1024_STATE_WORDS]; - uint8_t b[SKEIN1024_BLOCK_BYTES]; + uint64_t X[SKEIN1024_STATE_WORDS]; + uint8_t b[SKEIN1024_BLOCK_BYTES]; } Skein1024_Ctxt_t; -typedef struct { - uint_t statebits; +typedef struct +{ + uint_t statebits; union { Skein_Ctxt_Hdr_t h; Skein_256_Ctxt_t ctx_256; @@ -127,21 +149,20 @@ typedef struct { } u; } skeinHashState; -__device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen) +__device__ void cn_skein_init(skeinHashState* state, size_t hashBitLen) { const uint64_t SKEIN_512_IV_256[] = - { - SKEIN_MK_64(0xCCD044A1,0x2FDB3E13), - SKEIN_MK_64(0xE8359030,0x1A79A9EB), - SKEIN_MK_64(0x55AEA061,0x4F816E6F), - SKEIN_MK_64(0x2A2767A4,0xAE9B94DB), - SKEIN_MK_64(0xEC06025E,0x74DD7683), - SKEIN_MK_64(0xE7A436CD,0xC4746251), - SKEIN_MK_64(0xC36FBAF9,0x393AD185), - SKEIN_MK_64(0x3EEDBA18,0x33EDFC13) - }; + { + SKEIN_MK_64(0xCCD044A1, 0x2FDB3E13), + SKEIN_MK_64(0xE8359030, 0x1A79A9EB), + SKEIN_MK_64(0x55AEA061, 0x4F816E6F), + SKEIN_MK_64(0x2A2767A4, 0xAE9B94DB), + SKEIN_MK_64(0xEC06025E, 0x74DD7683), + SKEIN_MK_64(0xE7A436CD, 0xC4746251), + SKEIN_MK_64(0xC36FBAF9, 0x393AD185), + SKEIN_MK_64(0x3EEDBA18, 0x33EDFC13)}; - Skein_512_Ctxt_t *ctx = &state->u.ctx_512; + Skein_512_Ctxt_t* ctx = &state->u.ctx_512; 
ctx->h.hashBitLen = hashBitLen; @@ -150,22 +171,47 @@ __device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen) Skein_Start_New_Type(ctx, MSG); } -__device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd) +__device__ void cn_skein512_processblock(Skein_512_Ctxt_t* __restrict__ ctx, const uint8_t* __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd) { - enum { - R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, - R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, - R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, - R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, - R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, - R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, - R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, - R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22 + enum + { + R_512_0_0 = 46, + R_512_0_1 = 36, + R_512_0_2 = 19, + R_512_0_3 = 37, + R_512_1_0 = 33, + R_512_1_1 = 27, + R_512_1_2 = 14, + R_512_1_3 = 42, + R_512_2_0 = 17, + R_512_2_1 = 49, + R_512_2_2 = 36, + R_512_2_3 = 39, + R_512_3_0 = 44, + R_512_3_1 = 9, + R_512_3_2 = 54, + R_512_3_3 = 56, + R_512_4_0 = 39, + R_512_4_1 = 30, + R_512_4_2 = 34, + R_512_4_3 = 24, + R_512_5_0 = 13, + R_512_5_1 = 50, + R_512_5_2 = 10, + R_512_5_3 = 17, + R_512_6_0 = 25, + R_512_6_1 = 29, + R_512_6_2 = 39, + R_512_6_3 = 43, + R_512_7_0 = 8, + R_512_7_1 = 35, + R_512_7_2 = 56, + R_512_7_3 = 22 }; - uint64_t X0,X1,X2,X3,X4,X5,X6,X7; + uint64_t X0, X1, X2, X3, X4, X5, X6, X7; uint64_t w[SKEIN_512_STATE_WORDS]; - uint64_t kw[SKEIN_512_STATE_WORDS+4]; + uint64_t kw[SKEIN_512_STATE_WORDS + 4]; ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; @@ -184,7 +230,7 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co ks[6] = ctx->X[6]; ks[7] = ctx->X[7]; ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ - ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + ks[4] ^ 
ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; ts[2] = ts[0] ^ ts[1]; @@ -201,15 +247,15 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co blkPtr += SKEIN_512_BLOCK_BYTES; - R512_8_rounds( 0); - R512_8_rounds( 1); - R512_8_rounds( 2); - R512_8_rounds( 3); - R512_8_rounds( 4); - R512_8_rounds( 5); - R512_8_rounds( 6); - R512_8_rounds( 7); - R512_8_rounds( 8); + R512_8_rounds(0); + R512_8_rounds(1); + R512_8_rounds(2); + R512_8_rounds(3); + R512_8_rounds(4); + R512_8_rounds(5); + R512_8_rounds(6); + R512_8_rounds(7); + R512_8_rounds(8); ctx->X[0] = X0 ^ w[0]; ctx->X[1] = X1 ^ w[1]; @@ -221,125 +267,124 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co ctx->X[7] = X7 ^ w[7]; ts[1] &= ~SKEIN_T1_FLAG_FIRST; - } - while (--blkCnt); + } while(--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } -__device__ void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal) +__device__ void cn_skein_final(skeinHashState* __restrict__ state, uint8_t* __restrict__ hashVal) { - size_t i,n,byteCnt; + size_t i, n, byteCnt; uint64_t X[SKEIN_512_STATE_WORDS]; - Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512; + Skein_512_Ctxt_t* ctx = (Skein_512_Ctxt_t*)&state->u.ctx_512; //size_t tmp; //uint8_t *p8; //uint64_t *p64; ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; - if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) + if(ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) { - memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); //p8 = &ctx->b[ctx->h.bCnt]; //tmp = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; //for( i = 0; i < tmp; i++ ) *(p8+i) = 0; } - cn_skein512_processblock(ctx,ctx->b,1,ctx->h.bCnt); + cn_skein512_processblock(ctx, ctx->b, 1, ctx->h.bCnt); byteCnt = (ctx->h.hashBitLen + 7) >> 3; //uint8_t b[SKEIN_512_BLOCK_BYTES] == 64 - memset(ctx->b,0,sizeof(ctx->b)); + memset(ctx->b, 0, sizeof(ctx->b)); //p64 = (uint64_t *)ctx->b; 
//for( i = 0; i < 8; i++ ) *(p64+i) = 0; - memcpy(X,ctx->X,sizeof(X)); + memcpy(X, ctx->X, sizeof(X)); - for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) + for(i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) { - ((uint64_t *)ctx->b)[0]= (uint64_t)i; - Skein_Start_New_Type(ctx,OUT_FINAL); - cn_skein512_processblock(ctx,ctx->b,1,sizeof(uint64_t)); - n = byteCnt - i*SKEIN_512_BLOCK_BYTES; - if (n >= SKEIN_512_BLOCK_BYTES) - n = SKEIN_512_BLOCK_BYTES; - memcpy(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); - memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + ((uint64_t*)ctx->b)[0] = (uint64_t)i; + Skein_Start_New_Type(ctx, OUT_FINAL); + cn_skein512_processblock(ctx, ctx->b, 1, sizeof(uint64_t)); + n = byteCnt - i * SKEIN_512_BLOCK_BYTES; + if(n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + memcpy(hashVal + i * SKEIN_512_BLOCK_BYTES, ctx->X, n); + memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */ } } -__device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt) +__device__ void cn_skein512_update(Skein_512_Ctxt_t* __restrict__ ctx, const uint8_t* __restrict__ msg, size_t msgByteCnt) { size_t n; - if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + if(msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) { - if (ctx->h.bCnt) + if(ctx->h.bCnt) { n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; - if (n) + if(n) { - memcpy(&ctx->b[ctx->h.bCnt],msg,n); - msgByteCnt -= n; - msg += n; + memcpy(&ctx->b[ctx->h.bCnt], msg, n); + msgByteCnt -= n; + msg += n; ctx->h.bCnt += n; } - cn_skein512_processblock(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + cn_skein512_processblock(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES); ctx->h.bCnt = 0; } - if (msgByteCnt > SKEIN_512_BLOCK_BYTES) + if(msgByteCnt > SKEIN_512_BLOCK_BYTES) { - n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; - cn_skein512_processblock(ctx,msg,n,SKEIN_512_BLOCK_BYTES); + n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES; + 
cn_skein512_processblock(ctx, msg, n, SKEIN_512_BLOCK_BYTES); msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; - msg += n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; } } - if (msgByteCnt) + if(msgByteCnt) { - memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt); ctx->h.bCnt += msgByteCnt; } } -__device__ void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen) +__device__ void cn_skein_update(skeinHashState* __restrict__ state, const BitSequence* __restrict__ data, DataLength databitlen) { - if ((databitlen & 7) == 0) + if((databitlen & 7) == 0) { - cn_skein512_update(&state->u.ctx_512,data,databitlen >> 3); + cn_skein512_update(&state->u.ctx_512, data, databitlen >> 3); } else { size_t bCnt = (databitlen >> 3) + 1; - uint8_t b,mask; + uint8_t b, mask; - mask = (uint8_t) (1u << (7 - (databitlen & 7))); - b = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask); + mask = (uint8_t)(1u << (7 - (databitlen & 7))); + b = (uint8_t)((data[bCnt - 1] & (0 - mask)) | mask); - cn_skein512_update(&state->u.ctx_512,data,bCnt-1); - cn_skein512_update(&state->u.ctx_512,&b , 1 ); + cn_skein512_update(&state->u.ctx_512, data, bCnt - 1); + cn_skein512_update(&state->u.ctx_512, &b, 1); Skein_Set_Bit_Pad_Flag(state->u.h); } } -__device__ void cn_skein(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +__device__ void cn_skein(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval) { int hashbitlen = 256; DataLength databitlen = len << 3; skeinHashState state; - state.statebits = 64*SKEIN_512_STATE_WORDS; + state.statebits = 64 * SKEIN_512_STATE_WORDS; cn_skein_init(&state, hashbitlen); cn_skein_update(&state, data, databitlen); diff --git a/xmrstak/backend/plugin.hpp b/xmrstak/backend/plugin.hpp index 5c7dfe16a..560507691 100644 --- a/xmrstak/backend/plugin.hpp +++ b/xmrstak/backend/plugin.hpp @@ -3,22 +3,22 
@@ #include "xmrstak/misc/environment.hpp" #include "xmrstak/params.hpp" -#include -#include -#include -#include #include "iBackend.hpp" +#include #include +#include +#include +#include #ifndef USE_PRECOMPILED_HEADERS -# ifdef WIN32 -# include -# include -# else -# include -# include -# endif -# include +#ifdef WIN32 +#include +#include +#else +#include +#include +#endif +#include #endif namespace xmrstak @@ -36,41 +36,41 @@ struct plugin libBackend = LoadLibrary(TEXT((libName + ".dll").c_str())); if(!libBackend) { - std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << (libName + ".dll") << std::endl; + std::cerr << "WARNING: " << m_backendName << " cannot load backend library: " << (libName + ".dll") << std::endl; return; } #else // `.so` linux file extention for dynamic libraries std::string fileExtension = ".so"; -# if defined(__APPLE__) +#if defined(__APPLE__) // `.dylib` Mac OS X file extention for dynamic libraries fileExtension = ".dylib"; -# endif +#endif // search library in working directory - libBackend = dlopen(("./lib" + libName + fileExtension).c_str(), RTLD_LAZY); + libBackend = dlopen(("./lib" + libName + fileExtension).c_str(), RTLD_NOW | RTLD_LAZY | RTLD_GLOBAL); // fallback to binary directory if(!libBackend) - libBackend = dlopen((params::inst().executablePrefix + "lib" + libName + fileExtension).c_str(), RTLD_LAZY); + libBackend = dlopen((params::inst().executablePrefix + "lib" + libName + fileExtension).c_str(), RTLD_NOW | RTLD_LAZY | RTLD_GLOBAL); // try use LD_LIBRARY_PATH if(!libBackend) - libBackend = dlopen(("lib" + libName + fileExtension).c_str(), RTLD_LAZY); + libBackend = dlopen(("lib" + libName + fileExtension).c_str(), RTLD_NOW | RTLD_LAZY | RTLD_GLOBAL); if(!libBackend) { - std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << dlerror() << std::endl; + std::cerr << "WARNING: " << m_backendName << " cannot load backend library: " << dlerror() << std::endl; return; } #endif #ifdef 
WIN32 - fn_startBackend = (startBackend_t) GetProcAddress(libBackend, "xmrstak_start_backend"); - if (!fn_startBackend) + fn_startBackend = (startBackend_t)GetProcAddress(libBackend, "xmrstak_start_backend"); + if(!fn_startBackend) { - std::cerr << "WARNING: backend plugin " << libName << " contains no entry 'xmrstak_start_backend': " < +#include #include +#include #include -#include #include -#include #ifndef CONF_NO_TLS -#include #include +#include #endif #ifdef _WIN32 -# define strcasecmp _stricmp -# include -# include "xmrstak/misc/uac.hpp" +#define strcasecmp _stricmp +#include "xmrstak/misc/uac.hpp" +#include #endif // _WIN32 int do_benchmark(int block_version, int wait_sec, int work_sec); @@ -62,72 +61,79 @@ void help() using namespace std; using namespace xmrstak; - cout<<"Usage: "<> tmp; + getline(std::cin, tmp); + if(tmp.empty()) + tmp = default_value; std::transform(tmp.begin(), tmp.end(), tmp.begin(), ::tolower); - } - while(tmp != "y" && tmp != "n" && tmp != "yes" && tmp != "no"); + } while(tmp != "y" && tmp != "n" && tmp != "yes" && tmp != "no"); return tmp == "y" || tmp == "yes"; } @@ -139,34 +145,37 @@ inline const char* bool_to_str(bool v) std::string get_multipool_entry(bool& final) { - std::cout<> pool; std::string userName; - std::cout<<"- Username (wallet address or pool login):"<> userName; std::string passwd; - std::cin.clear(); std::cin.ignore(INT_MAX,'\n'); - std::cout<<"- Password (mostly empty or x):"<> pool_weight) || pool_weight <= 0) { std::cin.clear(); @@ -174,36 +183,43 @@ std::string get_multipool_entry(bool& final) std::cout << "Invalid weight. Try 1, 10, 100, etc:" << std::endl; } - final = !read_yes_no("- Do you want to add another pool? (y/n)"); + final = !read_yes_no("- Do you want to add another pool? 
(y/N)", "N"); - return "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid + - "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " + - bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n"; + return "\t{\"pool_address\" : \"" + pool + "\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid + + "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " + + bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n"; } inline void prompt_once(bool& prompted) { if(!prompted) { - std::cout<<"Please enter:"<> tmp; } currency = tmp; } - auto& pool = params::inst().poolURL; + auto pool = params::inst().poolURL; bool userSetPool = true; if(pool.empty()) { prompt_once(prompted); userSetPool = false; - std::cout<<"- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl; + std::cout << "- Pool address: e.g. 
" << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl; std::cin >> pool; } - auto& userName = params::inst().poolUsername; + auto userName = params::inst().poolUsername; if(userName.empty()) { prompt_once(prompted); - std::cout<<"- Username (wallet address or pool login):"<> userName; } bool stdin_flushed = false; - auto& passwd = params::inst().poolPasswd; + auto passwd = params::inst().poolPasswd; if(passwd.empty() && !params::inst().userSetPwd) { prompt_once(prompted); // clear everything from stdin to allow an empty password - std::cin.clear(); std::cin.ignore(INT_MAX,'\n'); + std::cin.clear(); + std::cin.ignore(INT_MAX, '\n'); stdin_flushed = true; - std::cout<<"- Password (mostly empty or x):"<> pool_weight) || pool_weight <= 0) { @@ -312,13 +332,11 @@ void do_guided_pool_config() std::cout << "Invalid weight. Try 1, 10, 100, etc:" << std::endl; } } - else - pool_weight = 1; std::string pool_table; - pool_table += "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid + - "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " + - bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n"; + pool_table += "\t{\"pool_address\" : \"" + pool + "\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid + + "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " + + bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n"; if(multipool) { @@ -326,14 +344,13 @@ void do_guided_pool_config() do { pool_table += get_multipool_entry(final); - } - while(!final); + } while(!final); } configTpl.replace("CURRENCY", currency); configTpl.replace("POOLCONF", pool_table); configTpl.write(params::inst().configFilePools); - std::cout<<"Pool configuration stored in file '"<> 
port) || port < 0 || port > 65535) - { - std::cin.clear(); - std::cin.ignore(INT_MAX, '\n'); - std::cout << "Invalid port number. Please enter a number between 0 and 65535." << std::endl; + int32_t port; + while(!(std::cin >> port) || port < 0 || port > 65535) + { + std::cin.clear(); + std::cin.ignore(INT_MAX, '\n'); + std::cout << "Invalid port number. Please enter a number between 0 and 65535." << std::endl; + } + http_port = port; } - - http_port = port; #endif } configTpl.replace("HTTP_PORT", std::to_string(http_port)); + configTpl.replace("OUTPUT_FILE", params::inst().outputFile); + configTpl.replace("H_PRINT_TIME", std::to_string(params::inst().h_print_time > 0 ? params::inst().h_print_time : 300)); configTpl.write(params::inst().configFile); - std::cout<<"Configuration stored in file '"<= argc) + { + printer::inst()->print_msg(L0, "No argument for parameter '--amdGpus' given"); + win_exit(); + return 1; + } + params::inst().amdGpus = argv[i]; + } else if(opName.compare("--openCLVendor") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--openCLVendor' given"); win_exit(); @@ -481,10 +512,21 @@ int main(int argc, char *argv[]) { params::inst().useNVIDIA = false; } + else if (opName.compare("--nvidiaGpus") == 0) + { + ++i; + if (i >= argc) + { + printer::inst()->print_msg(L0, "No argument for parameter '--nvidiaGpus' given"); + win_exit(); + return 1; + } + params::inst().nvidiaGpus = argv[i]; + } else if(opName.compare("--cpu") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--cpu' given"); win_exit(); @@ -495,7 +537,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--amd") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--amd' given"); win_exit(); @@ -506,7 +548,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--amdCacheDir") == 0) { ++i; - if( i >=argc ) + if(i 
>= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--amdCacheDir' given"); win_exit(); @@ -517,7 +559,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--nvidia") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--nvidia' given"); win_exit(); @@ -528,7 +570,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--currency") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--currency' given"); win_exit(); @@ -539,7 +581,7 @@ int main(int argc, char *argv[]) else if(opName.compare("-o") == 0 || opName.compare("--url") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-o/--url' given"); win_exit(); @@ -551,7 +593,7 @@ int main(int argc, char *argv[]) else if(opName.compare("-O") == 0 || opName.compare("--tls-url") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-O/--tls-url' given"); win_exit(); @@ -570,7 +612,7 @@ int main(int argc, char *argv[]) } ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-u/--user' given"); win_exit(); @@ -588,7 +630,7 @@ int main(int argc, char *argv[]) } ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-p/--pass' given"); win_exit(); @@ -607,7 +649,7 @@ int main(int argc, char *argv[]) } ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-r/--rigid' given"); win_exit(); @@ -624,7 +666,7 @@ int main(int argc, char *argv[]) else if(opName.compare("-c") == 0 || opName.compare("--config") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-c/--config' given"); win_exit(); @@ -635,7 +677,7 @@ int main(int argc, char *argv[]) else if(opName.compare("-C") == 0 || 
opName.compare("--poolconf") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-C/--poolconf' given"); win_exit(); @@ -643,10 +685,40 @@ int main(int argc, char *argv[]) } params::inst().configFilePools = argv[i]; } + else if(opName.compare("--log") == 0) + { + ++i; + if(i >= argc) + { + printer::inst()->print_msg(L0, "No argument for parameter '--log' given"); + win_exit(); + return 1; + } + params::inst().outputFile = argv[i]; + } + else if (opName.compare("--h-print-time") == 0) + { + ++i; + if (i >= argc) + { + printer::inst()->print_msg(L0, "No argument for parameter '--h-print-time' given"); + win_exit(); + return 1; + } + char* h_print_time = nullptr; + long int time = strtol(argv[i], &h_print_time, 10); + + if (time <= 0) + { + printer::inst()->print_msg(L0, "Hashrate print time must be > 0"); + return 1; + } + params::inst().h_print_time = time; + } else if(opName.compare("-i") == 0 || opName.compare("--httpd") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-i/--httpd' given"); win_exit(); @@ -672,7 +744,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--benchmark") == 0) { ++i; - if( i >= argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--benchmark' given"); win_exit(); @@ -691,7 +763,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--benchwait") == 0) { ++i; - if( i >= argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--benchwait' given"); win_exit(); @@ -710,7 +782,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--benchwork") == 0) { ++i; - if( i >= argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--benchwork' given"); win_exit(); @@ -728,17 +800,20 @@ int main(int argc, char *argv[]) } else { - printer::inst()->print_msg(L0, "Parameter unknown '%s'",argv[i]); + printer::inst()->print_msg(L0, 
"Parameter unknown '%s'", argv[i]); win_exit(); return 1; } } + bool hasConfigFile = configEditor::file_exist(params::inst().configFile); + bool hasPoolConfig = configEditor::file_exist(params::inst().configFilePools); + // check if we need a guided start - if(!configEditor::file_exist(params::inst().configFile)) + if(!hasConfigFile) do_guided_config(); - if(!configEditor::file_exist(params::inst().configFilePools)) + if(!hasPoolConfig) do_guided_pool_config(); if(!jconf::inst()->parse_config(params::inst().configFile.c_str(), params::inst().configFilePools.c_str())) @@ -759,7 +834,7 @@ int main(int argc, char *argv[]) if(strlen(jconf::inst()->GetOutputFile()) != 0) printer::inst()->open_logfile(jconf::inst()->GetOutputFile()); - if (!BackendConnector::self_test()) + if(!BackendConnector::self_test()) { printer::inst()->print_msg(L0, "Self test not passed!"); win_exit(); @@ -773,7 +848,7 @@ int main(int argc, char *argv[]) win_exit(); return 1; #else - if (!httpd::inst()->start_daemon()) + if(!httpd::inst()->start_daemon()) { win_exit(); return 1; @@ -847,7 +922,7 @@ int main(int argc, char *argv[]) uint64_t currentTime = get_timestamp_ms(); /* Hard guard to make sure we never get called more than twice per second */ - if( currentTime - lastTime < 500) + if(currentTime - lastTime < 500) std::this_thread::sleep_for(std::chrono::milliseconds(500 - (currentTime - lastTime))); lastTime = currentTime; } @@ -862,8 +937,14 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) printer::inst()->print_msg(L0, "Prepare benchmark for block version %d", block_version); + if(block_version <= 0) + { + printer::inst()->print_msg(L0, "Block version must be >0, current value is %u.", block_version); + return 1; + } + uint8_t work[128]; - memset(work,0,128); + memset(work, 0, 128); work[0] = static_cast(block_version); xmrstak::pool_data dat; @@ -871,20 +952,20 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) xmrstak::miner_work oWork = 
xmrstak::miner_work(); pvThreads = xmrstak::BackendConnector::thread_starter(oWork); - printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized",wait_sec); + printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized", wait_sec); std::this_thread::sleep_for(std::chrono::seconds(wait_sec)); /* AMD and NVIDIA is currently only supporting work sizes up to 128byte */ - printer::inst()->print_msg(L0, "Start a %d second benchmark...",work_sec); - xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat); + printer::inst()->print_msg(L0, "Start a %d second benchmark...", work_sec); + xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 1, 0), dat); uint64_t iStartStamp = get_timestamp_ms(); std::this_thread::sleep_for(std::chrono::seconds(work_sec)); xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat); double fTotalHps = 0.0; - for (uint32_t i = 0; i < pvThreads->size(); i++) + for(uint32_t i = 0; i < pvThreads->size(); i++) { double fHps = pvThreads->at(i)->iHashCount; fHps /= (pvThreads->at(i)->iTimestamp - iStartStamp) / 1000.0; @@ -892,7 +973,7 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) auto bType = static_cast(pvThreads->at(i)->backendType); std::string name(xmrstak::iBackend::getName(bType)); - printer::inst()->print_msg(L0, "Benchmark Thread %u %s: %.1f H/S", i,name.c_str(), fHps); + printer::inst()->print_msg(L0, "Benchmark Thread %u %s: %.1f H/S", i, name.c_str(), fHps); fTotalHps += fHps; } diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl index d8fd861a7..27b12c52f 100644 --- a/xmrstak/config.tpl +++ b/xmrstak/config.tpl @@ -43,7 +43,7 @@ R"===(// generated by XMRSTAK_VERSION * h_print_time - How often, in seconds, should we print a hashrate report if verbose_level is set to 4. * This option has no effect if verbose_level is not 4. 
*/ -"h_print_time" : 300, +"h_print_time" : H_PRINT_TIME, /* * Manual hardware AES override @@ -129,7 +129,7 @@ R"===(// generated by XMRSTAK_VERSION * output_file - This option will log all output to a file. * */ -"output_file" : "", +"output_file" : "OUTPUT_FILE", /* * Built-in web server diff --git a/xmrstak/http/httpd.cpp b/xmrstak/http/httpd.cpp index ed9abc2bc..b4f0f547e 100644 --- a/xmrstak/http/httpd.cpp +++ b/xmrstak/http/httpd.cpp @@ -23,16 +23,15 @@ #ifndef CONF_NO_HTTPD - #include "httpd.hpp" #include "webdesign.hpp" -#include "xmrstak/net/msgstruct.hpp" +#include "xmrstak/jconf.hpp" #include "xmrstak/misc/console.hpp" #include "xmrstak/misc/executor.hpp" -#include "xmrstak/jconf.hpp" +#include "xmrstak/net/msgstruct.hpp" -#include #include +#include #include #include @@ -45,21 +44,20 @@ httpd* httpd::oInst = nullptr; httpd::httpd() { - } -int httpd::req_handler(void * cls, - MHD_Connection* connection, - const char* url, - const char* method, - const char* version, - const char* upload_data, - size_t* upload_data_size, - void ** ptr) +int httpd::req_handler(void* cls, + MHD_Connection* connection, + const char* url, + const char* method, + const char* version, + const char* upload_data, + size_t* upload_data_size, + void** ptr) { - struct MHD_Response * rsp; + struct MHD_Response* rsp; - if (strcmp(method, "GET") != 0) + if(strcmp(method, "GET") != 0) return MHD_NO; if(strlen(jconf::inst()->GetHttpUsername()) != 0) @@ -68,7 +66,7 @@ int httpd::req_handler(void * cls, int ret; username = MHD_digest_auth_get_username(connection); - if (username == NULL) + if(username == NULL) { rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT); ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, MHD_NO); @@ -78,7 +76,7 @@ int httpd::req_handler(void * cls, free(username); ret = MHD_digest_auth_check(connection, sHttpAuthRealm, jconf::inst()->GetHttpUsername(), 
jconf::inst()->GetHttpPassword(), 300); - if (ret == MHD_INVALID_NONCE || ret == MHD_NO) + if(ret == MHD_INVALID_NONCE || ret == MHD_NO) { rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT); ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO); @@ -174,4 +172,3 @@ bool httpd::start_daemon() } #endif - diff --git a/xmrstak/http/httpd.hpp b/xmrstak/http/httpd.hpp index fe534f038..dfad082ca 100644 --- a/xmrstak/http/httpd.hpp +++ b/xmrstak/http/httpd.hpp @@ -7,27 +7,28 @@ struct MHD_Connection; class httpd { -public: + public: static httpd* inst() { - if (oInst == nullptr) oInst = new httpd; + if(oInst == nullptr) + oInst = new httpd; return oInst; }; bool start_daemon(); -private: + private: httpd(); static httpd* oInst; - static int req_handler(void * cls, - MHD_Connection* connection, - const char* url, - const char* method, - const char* version, - const char* upload_data, - size_t* upload_data_size, - void ** ptr); + static int req_handler(void* cls, + MHD_Connection* connection, + const char* url, + const char* method, + const char* version, + const char* upload_data, + size_t* upload_data_size, + void** ptr); - MHD_Daemon *d; + MHD_Daemon* d; }; diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp index 8f20078aa..fbd565269 100644 --- a/xmrstak/http/webdesign.cpp +++ b/xmrstak/http/webdesign.cpp @@ -1,114 +1,114 @@ #include -extern const char sHtmlCssEtag [] = "00000009"; -extern const char sHtmlCssFile [] = +extern const char sHtmlCssEtag[] = "00000009"; +extern const char sHtmlCssFile[] = "body {" - "font-family: Tahoma, Arial, sans-serif;" - "font-size: 80%;" - "background-color: rgb(240, 240, 240);" + "font-family: Tahoma, Arial, sans-serif;" + "font-size: 80%;" + "background-color: rgb(240, 240, 240);" "}" "a {" - "color: rgb(44, 55, 66);" + "color: rgb(44, 55, 66);" "}" "a:link {" - "text-decoration: 
none;" + "text-decoration: none;" "}" "a:visited {" - "color: rgb(44, 55, 66);" + "color: rgb(44, 55, 66);" "}" "a:hover {" - "color: rgb(255, 153, 0);" + "color: rgb(255, 153, 0);" "}" "a:active {" - "color: rgb(204, 122, 0);" + "color: rgb(204, 122, 0);" "}" ".all {" - "max-width:600px;" - "margin: auto;" + "max-width:600px;" + "margin: auto;" "}" ".header {" - "background-color: rgb(30, 30, 30);" - "color: white;" - "padding: 10px;" - "font-weight: bold;" - "margin: 0px;" - "margin-bottom: 10px;" + "background-color: rgb(30, 30, 30);" + "color: white;" + "padding: 10px;" + "font-weight: bold;" + "margin: 0px;" + "margin-bottom: 10px;" "}" ".version {" - "font-size: 75%;" - "text-align: right;" + "font-size: 75%;" + "text-align: right;" "}" ".links {" - "padding: 7px;" - "text-align: center;" - "background-color: rgb(215, 215, 215);" - "box-shadow: 0px 1px 3px 0px rgba(0, 0, 0, 0.2), 0px 1px 1px 0px rgba(0, 0, 0, 0.14), 0px 2px 1px -1px rgba(0, 0, 0, 0.12);" + "padding: 7px;" + "text-align: center;" + "background-color: rgb(215, 215, 215);" + "box-shadow: 0px 1px 3px 0px rgba(0, 0, 0, 0.2), 0px 1px 1px 0px rgba(0, 0, 0, 0.14), 0px 2px 1px -1px rgba(0, 0, 0, 0.12);" "}" ".data th, td {" - "padding: 5px 12px;" - "text-align: right;" - "border-bottom: 1px solid #ccc;" + "padding: 5px 12px;" + "text-align: right;" + "border-bottom: 1px solid #ccc;" "}" ".data tr:nth-child(even) {" - "background-color: #ddd;" + "background-color: #ddd;" "}" ".data th {" - "background-color: #ccc;" + "background-color: #ccc;" "}" ".data table {" - "width: 100%;" - "max-width: 600px;" + "width: 100%;" + "max-width: 600px;" "}" ".letter {" - "font-weight: bold;" + "font-weight: bold;" "}" "h4 {" - "background-color: rgb(0, 130, 130);" - "color: white;" - "padding: 10px;" - "margin: 10px 0px;" + "background-color: rgb(0, 130, 130);" + "color: white;" + "padding: 10px;" + "margin: 10px 0px;" "}" ".flex-container {" - "display: -webkit-flex;" - "display: flex;" + "display: -webkit-flex;" + 
"display: flex;" "}" ".flex-item {" - "width: 33%;" - "margin: 3px;" + "width: 33%;" + "margin: 3px;" "}" ".motd-box {" - "background-color: #ccc;" - "padding: 0px 10px 5px 10px;" - "margin-bottom: 10px;" + "background-color: #ccc;" + "padding: 0px 10px 5px 10px;" + "margin-bottom: 10px;" "}" ".motd-head {" - "border-bottom: 1px solid #000;" - "margin-bottom: 0.5em;" - "padding: 0.5em 0em;" - "font-weight: bold;" + "border-bottom: 1px solid #000;" + "margin-bottom: 0.5em;" + "padding: 0.5em 0em;" + "font-weight: bold;" "}" ".motd-body {" - "overflow: hidden;" + "overflow: hidden;" "}"; size_t sHtmlCssSize = sizeof(sHtmlCssFile) - 1; @@ -124,7 +124,7 @@ extern const char sHtmlAccessDenied[] = size_t sHtmlAccessDeniedSize = sizeof(sHtmlAccessDenied) - 1; -extern const char sHtmlCommonHeader [] = +extern const char sHtmlCommonHeader[] = "" "" "" @@ -135,15 +135,15 @@ extern const char sHtmlCommonHeader [] = "
XMR-Stak Monero Miner
" "
" - "" - "" - "" + "" + "" + "" "
" "

%s

"; @@ -151,61 +151,61 @@ extern const char sHtmlMotdBoxStart[] = "
"; extern const char sHtmlMotdEntry[] = "
Message from %s
%s
"; extern const char sHtmlMotdBoxEnd[] = "
"; -extern const char sHtmlHashrateBodyHigh [] = +extern const char sHtmlHashrateBodyHigh[] = "
" "" - ""; + ""; -extern const char sHtmlHashrateTableRow [] = +extern const char sHtmlHashrateTableRow[] = ""; -extern const char sHtmlHashrateBodyLow [] = - "" - "" +extern const char sHtmlHashrateBodyLow[] = + "" + "" "
Thread ID10s60s15mH/s
Thread ID10s60s15mH/s
%s%s%s%s
Totals:%s%s%s
Highest:%s
Totals:%s%s%s
Highest:%s
" "
"; -extern const char sHtmlConnectionBodyHigh [] = +extern const char sHtmlConnectionBodyHigh[] = "
" "" - "" - "" - "" - "" + "" + "" + "" + "" "
Rig ID%s
Pool address%s
Connected since%s
Pool ping time%u ms
Rig ID%s
Pool address%s
Connected since%s
Pool ping time%u ms
" "

Network error log

" "" - ""; + ""; -extern const char sHtmlConnectionTableRow [] = +extern const char sHtmlConnectionTableRow[] = ""; -extern const char sHtmlConnectionBodyLow [] = +extern const char sHtmlConnectionBodyLow[] = "
DateError
DateError
%s%s
"; -extern const char sHtmlResultBodyHigh [] = +extern const char sHtmlResultBodyHigh[] = "
" "" - "" - "" - "" - "" - "" + "" + "" + "" + "" + "" "
Currency%s
Difficulty%u
Good results%u / %u (%.1f %%)
Avg result time%.1f sec
Pool-side hashes%u
Currency%s
Difficulty%u
Good results%u / %u (%.1f %%)
Avg result time%.1f sec
Pool-side hashes%u
" "

Top 10 best results found

" "" - "" - "" - "" - "" - "" + "" + "" + "" + "" + "" "
1%llu2%llu
3%llu4%llu
5%llu6%llu
7%llu8%llu
9%llu10%llu
1%llu2%llu
3%llu4%llu
5%llu6%llu
7%llu8%llu
9%llu10%llu
" "

Error details

" "" - "" - ""; + "" + ""; -extern const char sHtmlResultTableRow [] = +extern const char sHtmlResultTableRow[] = ""; extern const char sHtmlResultBodyLow[] = @@ -220,31 +220,30 @@ extern const char sJsonApiResultError[] = extern const char sJsonApiConnectionError[] = "{\"last_seen\":%llu,\"text\":\"%s\"}"; -extern const char sJsonApiFormat [] = -"{" +extern const char sJsonApiFormat[] = + "{" "\"version\":\"%s\"," "\"hashrate\":{" - "\"threads\":[%s]," - "\"total\":%s," - "\"highest\":%s" + "\"threads\":[%s]," + "\"total\":%s," + "\"highest\":%s" "}," "\"results\":{" - "\"diff_current\":%llu," - "\"shares_good\":%llu," - "\"shares_total\":%llu," - "\"avg_time\":%.1f," - "\"hashes_total\":%llu," - "\"best\":[%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu]," - "\"error_log\":[%s]" + "\"diff_current\":%llu," + "\"shares_good\":%llu," + "\"shares_total\":%llu," + "\"avg_time\":%.1f," + "\"hashes_total\":%llu," + "\"best\":[%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu]," + "\"error_log\":[%s]" "}," "\"connection\":{" - "\"pool\": \"%s\"," - "\"uptime\":%llu," - "\"ping\":%llu," - "\"error_log\":[%s]" + "\"pool\": \"%s\"," + "\"uptime\":%llu," + "\"ping\":%llu," + "\"error_log\":[%s]" "}" -"}"; - + "}"; diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 5e3384a63..c50211d1e 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -26,16 +26,15 @@ #include "xmrstak/misc/console.hpp" #include "xmrstak/misc/jext.hpp" -#include "xmrstak/misc/console.hpp" #include "xmrstak/misc/utility.hpp" +#include +#include +#include #include #include #include -#include #include -#include -#include #ifdef _WIN32 #define strcasecmp _stricmp @@ -44,18 +43,34 @@ #include #endif - using namespace rapidjson; /* * This enum needs to match index in oConfigValues, otherwise we will get a runtime error */ -enum configEnum { - aPoolList, sCurrency, bTlsSecureAlgo, iCallTimeout, iNetRetry, iGiveUpLimit, iVerboseLevel, bPrintMotd, iAutohashTime, - bDaemonMode, sOutputFile, iHttpdPort, 
sHttpLogin, sHttpPass, bPreferIpv4, bAesOverride, sUseSlowMem +enum configEnum +{ + aPoolList, + sCurrency, + bTlsSecureAlgo, + iCallTimeout, + iNetRetry, + iGiveUpLimit, + iVerboseLevel, + bPrintMotd, + iAutohashTime, + bDaemonMode, + sOutputFile, + iHttpdPort, + sHttpLogin, + sHttpPass, + bPreferIpv4, + bAesOverride, + sUseSlowMem }; -struct configVal { +struct configVal +{ configEnum iName; const char* sName; Type iType; @@ -64,68 +79,61 @@ struct configVal { // Same order as in configEnum, as per comment above // kNullType means any type configVal oConfigValues[] = { - { aPoolList, "pool_list", kArrayType }, - { sCurrency, "currency", kStringType }, - { bTlsSecureAlgo, "tls_secure_algo", kTrueType }, - { iCallTimeout, "call_timeout", kNumberType }, - { iNetRetry, "retry_time", kNumberType }, - { iGiveUpLimit, "giveup_limit", kNumberType }, - { iVerboseLevel, "verbose_level", kNumberType }, - { bPrintMotd, "print_motd", kTrueType }, - { iAutohashTime, "h_print_time", kNumberType }, - { bDaemonMode, "daemon_mode", kTrueType }, - { sOutputFile, "output_file", kStringType }, - { iHttpdPort, "httpd_port", kNumberType }, - { sHttpLogin, "http_login", kStringType }, - { sHttpPass, "http_pass", kStringType }, - { bPreferIpv4, "prefer_ipv4", kTrueType }, - { bAesOverride, "aes_override", kNullType }, - { sUseSlowMem, "use_slow_memory", kStringType } -}; - -constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); + {aPoolList, "pool_list", kArrayType}, + {sCurrency, "currency", kStringType}, + {bTlsSecureAlgo, "tls_secure_algo", kTrueType}, + {iCallTimeout, "call_timeout", kNumberType}, + {iNetRetry, "retry_time", kNumberType}, + {iGiveUpLimit, "giveup_limit", kNumberType}, + {iVerboseLevel, "verbose_level", kNumberType}, + {bPrintMotd, "print_motd", kTrueType}, + {iAutohashTime, "h_print_time", kNumberType}, + {bDaemonMode, "daemon_mode", kTrueType}, + {sOutputFile, "output_file", kStringType}, + {iHttpdPort, "httpd_port", kNumberType}, + 
{sHttpLogin, "http_login", kStringType}, + {sHttpPass, "http_pass", kStringType}, + {bPreferIpv4, "prefer_ipv4", kTrueType}, + {bAesOverride, "aes_override", kNullType}, + {sUseSlowMem, "use_slow_memory", kStringType}}; + +constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0])); xmrstak::coin_selection coins[] = { // name, userpool, devpool, default_pool_suggestion - { "aeon7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, "mine.aeon-pool.com:5555" }, - { "bbscoin", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, - { "bittube", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, "mining.bit.tube:13333" }, - { "cryptonight", {POW(cryptonight)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_bittube2", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_masari", {POW(cryptonight_masari)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_heavy", {POW(cryptonight_heavy)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_lite", {POW(cryptonight_lite)}, {POW(cryptonight_aeon)}, nullptr }, - { "cryptonight_lite_v7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, - { "cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)}, {POW(cryptonight_aeon)}, nullptr }, - { "cryptonight_r", {POW(cryptonight_r)}, {POW(cryptonight_r)}, nullptr }, - { "cryptonight_superfast", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_turtle", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr }, - { "cryptonight_v7", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_v8", {POW(cryptonight_monero_v8)}, {POW(cryptonight_r)}, nullptr }, - { "cryptonight_v8_double", {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_v8_half", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_v8_reversewaltz", 
{POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)},{POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" }, - { "cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_gpu)}, nullptr }, - { "freehaven", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr }, - { "graft", {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr }, - { "haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr }, - { "lethean", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, - { "masari", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, - { "monero", {POW(cryptonight_r)}, {POW(cryptonight_r)}, "pool.usxmrpool.com:3333" }, - { "qrl", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, - { "ryo", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" }, - { "stellite", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, - { "turtlecoin", {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, - { "plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr }, - { "zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr }, - { "xcash", {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr } -}; - -constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0])); + {"bbscoin", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr}, + {"bittube", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, "mining.bit.tube:13333"}, + {"cryptonight", {POW(cryptonight)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_bittube2", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, nullptr}, + 
{"cryptonight_masari", {POW(cryptonight_masari)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_heavy", {POW(cryptonight_heavy)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_lite", {POW(cryptonight_lite)}, {POW(cryptonight_aeon)}, nullptr}, + {"cryptonight_lite_v7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr}, + {"cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)}, {POW(cryptonight_aeon)}, nullptr}, + {"cryptonight_r", {POW(cryptonight_r)}, {POW(cryptonight_r)}, nullptr}, + {"cryptonight_superfast", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_turtle", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr}, + {"cryptonight_v7", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_v8", {POW(cryptonight_monero_v8)}, {POW(cryptonight_r)}, nullptr}, + {"cryptonight_v8_double", {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_v8_half", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_v8_reversewaltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"}, + {"cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_gpu)}, nullptr}, + {"graft", {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr}, + {"haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr}, + {"lethean", {POW(cryptonight_r)}, {POW(cryptonight_r)}, nullptr}, + {"masari", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr}, + {"qrl", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr}, + 
{"ryo", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"}, + {"torque", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr}, + {"plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr}, + {"zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr}}; + +constexpr size_t coin_algo_size = (sizeof(coins) / sizeof(coins[0])); inline bool checkType(Type have, Type want) { @@ -242,7 +250,10 @@ bool jconf::PrintMotd() uint64_t jconf::GetAutohashTime() { - return prv->configValues[iAutohashTime]->GetUint64(); + if (xmrstak::params::inst().h_print_time == -1) + return prv->configValues[iAutohashTime]->GetUint64(); + else + return uint64_t(xmrstak::params::inst().h_print_time); } uint16_t jconf::GetHttpdPort() @@ -270,12 +281,15 @@ bool jconf::DaemonMode() const char* jconf::GetOutputFile() { - return prv->configValues[sOutputFile]->GetString(); + if(xmrstak::params::inst().outputFile.length() > 0) + return xmrstak::params::inst().outputFile.c_str(); + else + return prv->configValues[sOutputFile]->GetString(); } void jconf::cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) { - memset(val, 0, sizeof(int32_t)*4); + memset(val, 0, sizeof(int32_t) * 4); #ifdef _WIN32 __cpuidex(val, eax, ecx); @@ -326,7 +340,7 @@ std::string jconf::GetMiningCoin() void jconf::GetAlgoList(std::string& list) { list.reserve(256); - for(size_t i=0; i < coin_algo_size; i++) + for(size_t i = 0; i < coin_algo_size; i++) { list += "\t- "; list += coins[i].coin_name; @@ -338,7 +352,7 @@ bool jconf::IsOnAlgoList(std::string& needle) { std::transform(needle.begin(), needle.end(), needle.begin(), ::tolower); - for(size_t i=0; i < coin_algo_size; i++) + for(size_t i = 0; i < coin_algo_size; i++) { if(needle == coins[i].coin_name) return true; @@ -350,7 +364,7 @@ const char* jconf::GetDefaultPool(const char* needle) { const char* default_example = "pool.example.com:3333"; - for(size_t i=0; i < coin_algo_size; 
i++) + for(size_t i = 0; i < coin_algo_size; i++) { if(strcmp(needle, coins[i].coin_name) == 0) { @@ -366,22 +380,22 @@ const char* jconf::GetDefaultPool(const char* needle) bool jconf::parse_file(const char* sFilename, bool main_conf) { - FILE * pFile; - char * buffer; + FILE* pFile; + char* buffer; size_t flen; pFile = fopen(sFilename, "rb"); - if (pFile == NULL) + if(pFile == NULL) { printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); return false; } - fseek(pFile,0,SEEK_END); + fseek(pFile, 0, SEEK_END); flen = ftell(pFile); rewind(pFile); - if(flen >= 64*1024) + if(flen >= 64 * 1024) { fclose(pFile); printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); @@ -396,7 +410,7 @@ bool jconf::parse_file(const char* sFilename, bool main_conf) } buffer = (char*)malloc(flen + 3); - if(fread(buffer+1, flen, 1, pFile) != 1) + if(fread(buffer + 1, flen, 1, pFile) != 1) { free(buffer); fclose(pFile); @@ -420,7 +434,7 @@ bool jconf::parse_file(const char* sFilename, bool main_conf) Document& root = main_conf ? 
prv->jsonDoc : prv->jsonDocPools; - root.Parse(buffer, flen+2); + root.Parse(buffer, flen + 2); free(buffer); if(root.HasParseError()) @@ -514,11 +528,11 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools) std::vector pool_weights; pool_weights.reserve(pool_cnt); - const char* aPoolValues[] = { "pool_address", "wallet_address", "rig_id", "pool_password", "use_nicehash", "use_tls", "tls_fingerprint", "pool_weight" }; - Type poolValTypes[] = { kStringType, kStringType, kStringType, kStringType, kTrueType, kTrueType, kStringType, kNumberType }; + const char* aPoolValues[] = {"pool_address", "wallet_address", "rig_id", "pool_password", "use_nicehash", "use_tls", "tls_fingerprint", "pool_weight"}; + Type poolValTypes[] = {kStringType, kStringType, kStringType, kStringType, kTrueType, kTrueType, kStringType, kNumberType}; - constexpr size_t pvcnt = sizeof(aPoolValues)/sizeof(aPoolValues[0]); - for(uint32_t i=0; i < pool_cnt; i++) + constexpr size_t pvcnt = sizeof(aPoolValues) / sizeof(aPoolValues[0]); + for(uint32_t i = 0; i < pool_cnt; i++) { const Value& oThdConf = prv->configValues[aPoolList]->GetArray()[i]; @@ -528,7 +542,7 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools) return false; } - for(uint32_t j=0; j < pvcnt; j++) + for(uint32_t j = 0; j < pvcnt; j++) { const Value* v; if((v = GetObjectMember(oThdConf, aPoolValues[j])) == nullptr) @@ -620,7 +634,7 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools) return false; } - for(size_t i=0; i < coin_algo_size; i++) + for(size_t i = 0; i < coin_algo_size; i++) { if(ctmp == coins[i].coin_name) { diff --git a/xmrstak/jconf.hpp b/xmrstak/jconf.hpp index 102b70f54..17cbf5b9a 100644 --- a/xmrstak/jconf.hpp +++ b/xmrstak/jconf.hpp @@ -1,26 +1,31 @@ #pragma once -#include "xmrstak/misc/environment.hpp" -#include "xmrstak/misc/coinDescription.hpp" #include "params.hpp" +#include "xmrstak/misc/coinDescription.hpp" +#include 
"xmrstak/misc/environment.hpp" #include #include class jconf { -public: + public: static jconf* inst() { auto& env = xmrstak::environment::inst(); if(env.pJconfConfig == nullptr) - env.pJconfConfig = new jconf; + { + std::unique_lock lck(env.update); + if(env.pJconfConfig == nullptr) + env.pJconfConfig = new jconf; + } return env.pJconfConfig; }; bool parse_config(const char* sFilename, const char* sFilenamePools); - struct pool_cfg { + struct pool_cfg + { const char* sPoolAddr; const char* sWalletAddr; const char* sRigId; @@ -38,7 +43,8 @@ class jconf uint64_t GetPoolCount(); bool GetPoolConfig(size_t id, pool_cfg& cfg); - enum slow_mem_cfg { + enum slow_mem_cfg + { always_use, no_mlck, print_warning, @@ -80,7 +86,7 @@ class jconf slow_mem_cfg GetSlowMemSetting(); -private: + private: jconf(); bool parse_file(const char* sFilename, bool main_conf); diff --git a/xmrstak/misc/coinDescription.hpp b/xmrstak/misc/coinDescription.hpp index 65dee143c..b3b119226 100644 --- a/xmrstak/misc/coinDescription.hpp +++ b/xmrstak/misc/coinDescription.hpp @@ -2,86 +2,88 @@ #include "xmrstak/backend/cryptonight.hpp" +#include #include #include #include -#include namespace xmrstak { - struct coinDescription - { - xmrstak_algo algo = {xmrstak_algo_id::invalid_algo}; - uint8_t fork_version = 0u; - xmrstak_algo algo_root = {xmrstak_algo_id::invalid_algo}; +struct coinDescription +{ + xmrstak_algo algo = {xmrstak_algo_id::invalid_algo}; + uint8_t fork_version = 0u; + xmrstak_algo algo_root = {xmrstak_algo_id::invalid_algo}; - coinDescription() = default; + coinDescription() = default; - coinDescription( - const xmrstak_algo in_algo, - const uint8_t in_fork_version = 0, - xmrstak_algo in_algo_root = xmrstak_algo_id::invalid_algo - ) : - algo(in_algo), algo_root(in_algo_root), fork_version(in_fork_version) - {} + coinDescription( + const xmrstak_algo in_algo, + const uint8_t in_fork_version = 0, + xmrstak_algo in_algo_root = xmrstak_algo_id::invalid_algo) : + algo(in_algo), + 
algo_root(in_algo_root), + fork_version(in_fork_version) + { + } - inline xmrstak_algo GetMiningAlgo() const { return algo; } - inline xmrstak_algo GetMiningAlgoRoot() const { return algo_root; } - inline uint8_t GetMiningForkVersion() const { return fork_version; } - }; + inline xmrstak_algo GetMiningAlgo() const { return algo; } + inline xmrstak_algo GetMiningAlgoRoot() const { return algo_root; } + inline uint8_t GetMiningForkVersion() const { return fork_version; } +}; - struct coin_selection - { - const char* coin_name = nullptr; - /* [0] -> user pool +struct coin_selection +{ + const char* coin_name = nullptr; + /* [0] -> user pool * [1] -> dev pool */ - coinDescription pool_coin[2]; - const char* default_pool = nullptr; + coinDescription pool_coin[2]; + const char* default_pool = nullptr; - coin_selection() = default; + coin_selection() = default; - coin_selection( - const char* in_coin_name, - const coinDescription user_coinDescription, - const coinDescription dev_coinDescription, - const char* in_default_pool - ) : - coin_name(in_coin_name), default_pool(in_default_pool) - { - pool_coin[0] = user_coinDescription; - pool_coin[1] = dev_coinDescription; - } + coin_selection( + const char* in_coin_name, + const coinDescription user_coinDescription, + const coinDescription dev_coinDescription, + const char* in_default_pool) : + coin_name(in_coin_name), + default_pool(in_default_pool) + { + pool_coin[0] = user_coinDescription; + pool_coin[1] = dev_coinDescription; + } - /** get coin description for the pool + /** get coin description for the pool * * @param poolId 0 select dev pool, else the user pool is selected */ - inline coinDescription GetDescription(size_t poolId) const { - coinDescription tmp = (poolId == 0 ? pool_coin[1] : pool_coin[0]); - return tmp; - } + inline coinDescription GetDescription(size_t poolId) const + { + coinDescription tmp = (poolId == 0 ? 
pool_coin[1] : pool_coin[0]); + return tmp; + } - /** return all POW algorithm for the current selected currency + /** return all POW algorithm for the current selected currency * * @return required POW algorithms without duplicated entries */ - inline std::vector GetAllAlgorithms() - { - std::vector allAlgos = { - GetDescription(0).GetMiningAlgo(), - GetDescription(0).GetMiningAlgoRoot(), - GetDescription(1).GetMiningAlgo(), - GetDescription(1).GetMiningAlgoRoot() - }; + inline std::vector GetAllAlgorithms() + { + std::vector allAlgos = { + GetDescription(0).GetMiningAlgo(), + GetDescription(0).GetMiningAlgoRoot(), + GetDescription(1).GetMiningAlgo(), + GetDescription(1).GetMiningAlgoRoot()}; - std::sort(allAlgos.begin(), allAlgos.end()); - std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo); - auto last = std::unique(allAlgos.begin(), allAlgos.end()); - // remove duplicated algorithms - allAlgos.erase(last, allAlgos.end()); + std::sort(allAlgos.begin(), allAlgos.end()); + std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo); + auto last = std::unique(allAlgos.begin(), allAlgos.end()); + // remove duplicated algorithms + allAlgos.erase(last, allAlgos.end()); - return allAlgos; - } - }; + return allAlgos; + } +}; } // namespace xmrstak diff --git a/xmrstak/misc/configEditor.hpp b/xmrstak/misc/configEditor.hpp index 3f79df44c..ae81f62c5 100644 --- a/xmrstak/misc/configEditor.hpp +++ b/xmrstak/misc/configEditor.hpp @@ -1,10 +1,10 @@ #pragma once #include -#include #include -#include #include +#include +#include #include "../version.hpp" @@ -17,16 +17,15 @@ struct configEditor configEditor() { - } - static bool file_exist( const std::string filename) + static bool file_exist(const std::string filename) { std::ifstream fstream(filename); return fstream.good(); } - void set( const std::string && content) + void set(const std::string&& content) { m_fileContent = content; } @@ -36,8 +35,7 @@ struct configEditor std::ifstream fstream(filename); m_fileContent 
= std::string( (std::istreambuf_iterator(fstream)), - std::istreambuf_iterator() - ); + std::istreambuf_iterator()); return fstream.good(); } @@ -70,7 +68,6 @@ struct configEditor { m_fileContent = std::regex_replace(m_fileContent, std::regex(search), substring); } - }; } // namespace xmrstak diff --git a/xmrstak/misc/console.cpp b/xmrstak/misc/console.cpp index c39237eab..529cc9453 100644 --- a/xmrstak/misc/console.cpp +++ b/xmrstak/misc/console.cpp @@ -23,11 +23,11 @@ #include "xmrstak/misc/console.hpp" -#include +#include +#include #include #include -#include -#include +#include #ifdef _WIN32 #include @@ -37,15 +37,15 @@ int get_key() DWORD mode, rd; HANDLE h; - if ((h = GetStdHandle(STD_INPUT_HANDLE)) == NULL) + if((h = GetStdHandle(STD_INPUT_HANDLE)) == NULL) return -1; - GetConsoleMode( h, &mode ); - SetConsoleMode( h, mode & ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT) ); + GetConsoleMode(h, &mode); + SetConsoleMode(h, mode & ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT)); int c = 0; - ReadConsole( h, &c, 1, &rd, NULL ); - SetConsoleMode( h, mode ); + ReadConsole(h, &c, 1, &rd, NULL); + SetConsoleMode(h, mode); return c; } @@ -90,20 +90,20 @@ void reset_colour() } #else +#include #include #include -#include int get_key() { struct termios oldattr, newattr; int ch; - tcgetattr( STDIN_FILENO, &oldattr ); + tcgetattr(STDIN_FILENO, &oldattr); newattr = oldattr; - newattr.c_lflag &= ~( ICANON | ECHO ); - tcsetattr( STDIN_FILENO, TCSANOW, &newattr ); + newattr.c_lflag &= ~(ICANON | ECHO); + tcsetattr(STDIN_FILENO, TCSANOW, &newattr); ch = getchar(); - tcsetattr( STDIN_FILENO, TCSANOW, &oldattr ); + tcsetattr(STDIN_FILENO, TCSANOW, &oldattr); return ch; } @@ -182,17 +182,17 @@ void printer::print_msg(verbosity verbose, const char* fmt, ...) 
va_list args; va_start(args, fmt); - vsnprintf(buf+bpos, sizeof(buf)-bpos, fmt, args); + vsnprintf(buf + bpos, sizeof(buf) - bpos, fmt, args); va_end(args); bpos = strlen(buf); - if(bpos+2 >= sizeof(buf)) + if(bpos + 2 >= sizeof(buf)) return; buf[bpos] = '\n'; - buf[bpos+1] = '\0'; + buf[bpos + 1] = '\0'; - print_str(buf); + print_str(buf); } void printer::print_str(const char* str) diff --git a/xmrstak/misc/console.hpp b/xmrstak/misc/console.hpp index 6df6597c6..12efef6c7 100644 --- a/xmrstak/misc/console.hpp +++ b/xmrstak/misc/console.hpp @@ -4,8 +4,17 @@ #include - -enum out_colours { K_RED, K_GREEN, K_BLUE, K_YELLOW, K_CYAN, K_MAGENTA, K_WHITE, K_NONE }; +enum out_colours +{ + K_RED, + K_GREEN, + K_BLUE, + K_YELLOW, + K_CYAN, + K_MAGENTA, + K_WHITE, + K_NONE +}; // Warning - on Linux get_key will detect control keys, but not on Windows. // We will only use it for alphanum keys anyway. @@ -21,16 +30,29 @@ inline long long unsigned int int_port(size_t i) return i; } -enum verbosity : size_t { L0 = 0, L1 = 1, L2 = 2, L3 = 3, L4 = 4, LDEBUG = 10, LINF = 100}; +enum verbosity : size_t +{ + L0 = 0, + L1 = 1, + L2 = 2, + L3 = 3, + L4 = 4, + LDEBUG = 10, + LINF = 100 +}; class printer { -public: + public: static inline printer* inst() { auto& env = xmrstak::environment::inst(); if(env.pPrinter == nullptr) - env.pPrinter = new printer; + { + std::unique_lock lck(env.update); + if(env.pPrinter == nullptr) + env.pPrinter = new printer; + } return env.pPrinter; }; @@ -39,7 +61,7 @@ class printer void print_str(const char* str); bool open_logfile(const char* file); -private: + private: printer(); std::mutex print_mutex; diff --git a/xmrstak/misc/environment.cpp b/xmrstak/misc/environment.cpp new file mode 100644 index 000000000..9f1be511d --- /dev/null +++ b/xmrstak/misc/environment.cpp @@ -0,0 +1,19 @@ +#include "environment.hpp" + +#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/cpu/crypto/cryptonight.h" +#include "xmrstak/params.hpp" +#include 
"xmrstak/misc/executor.hpp" +#include "xmrstak/jconf.hpp" + +namespace xmrstak +{ +void environment::init_singeltons() +{ + printer::inst(); + globalStates::inst(); + jconf::inst(); + executor::inst(); + params::inst(); +} +} diff --git a/xmrstak/misc/environment.hpp b/xmrstak/misc/environment.hpp index b67c85874..f37aedd61 100644 --- a/xmrstak/misc/environment.hpp +++ b/xmrstak/misc/environment.hpp @@ -1,5 +1,7 @@ #pragma once +#include + class printer; class jconf; class executor; @@ -19,7 +21,10 @@ struct environment if(env == nullptr) { if(init == nullptr) + { env = new environment; + env->init_singeltons(); + } else env = init; } @@ -36,6 +41,11 @@ struct environment jconf* pJconfConfig = nullptr; executor* pExecutor = nullptr; params* pParams = nullptr; + + std::mutex update; + +private: + void init_singeltons(); }; } // namespace xmrstak diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index 79d4731e6..0266312d1 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -21,31 +21,30 @@ * */ -#include "xmrstak/jconf.hpp" #include "executor.hpp" +#include "xmrstak/jconf.hpp" #include "xmrstak/net/jpsock.hpp" #include "telemetry.hpp" -#include "xmrstak/backend/miner_work.hpp" -#include "xmrstak/backend/globalStates.hpp" #include "xmrstak/backend/backendConnector.hpp" +#include "xmrstak/backend/globalStates.hpp" #include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/backend/miner_work.hpp" +#include "xmrstak/donate-level.hpp" +#include "xmrstak/http/webdesign.hpp" #include "xmrstak/jconf.hpp" #include "xmrstak/misc/console.hpp" -#include "xmrstak/donate-level.hpp" #include "xmrstak/version.hpp" -#include "xmrstak/http/webdesign.hpp" -#include -#include -#include #include -#include #include +#include +#include +#include +#include #include - #ifdef _WIN32 #define strncasecmp _strnicmp #endif // _WIN32 @@ -63,7 +62,7 @@ void executor::push_timed_event(ex_event&& ev, size_t sec) void executor::ex_clock_thd() { size_t tick = 0; - 
while (true) + while(true) { std::this_thread::sleep_for(std::chrono::milliseconds(size_t(iTickTime))); @@ -76,7 +75,7 @@ void executor::ex_clock_thd() // Service timed events std::unique_lock lck(timed_event_mutex); std::list::iterator ev = lTimedEvents.begin(); - while (ev != lTimedEvents.end()) + while(ev != lTimedEvents.end()) { ev->ticks_left--; if(ev->ticks_left == 0) @@ -96,7 +95,8 @@ bool executor::get_live_pools(std::vector& eval_pools, bool is_dev) size_t limit = jconf::inst()->GetGiveUpLimit(); size_t wait = jconf::inst()->GetNetRetry(); - if(limit == 0 || is_dev) limit = (-1); //No limit = limit of 2^64-1 + if(limit == 0 || is_dev) + limit = (-1); //No limit = limit of 2^64-1 size_t pool_count = 0; size_t over_limit = 0; @@ -330,7 +330,7 @@ void executor::on_sock_ready(size_t pool_id) { if(pool->have_call_error() && !pool->is_dev_pool()) { - std::string str = "Login error: " + pool->get_call_error(); + std::string str = "Login error: " + pool->get_call_error(); log_socket_error(pool, std::move(str)); } @@ -369,7 +369,8 @@ void executor::on_pool_have_job(size_t pool_id, pool_job& oPoolJob) dat.pool_id = pool_id; xmrstak::globalStates::inst().switch_work(xmrstak::miner_work(oPoolJob.sJobID, oPoolJob.bWorkBlob, - oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id, oPoolJob.iBlockHeight), dat); + oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id, oPoolJob.iBlockHeight), + dat); if(dat.pool_id != pool_id) { @@ -420,12 +421,11 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult) //Ignore errors silently if(pool->is_running() && pool->is_logged_in()) pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult, backend_name, - backend_hashcount, total_hashcount, oResult.algorithm - ); + backend_hashcount, total_hashcount, oResult.algorithm); return; } - if (!pool->is_running() || !pool->is_logged_in()) + if(!pool->is_running() || !pool->is_logged_in()) { log_result_error("[NETWORK ERROR]"); return; @@ 
-433,25 +433,42 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult) size_t t_start = get_timestamp_ms(); bool bResult = pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult, - backend_name, backend_hashcount, total_hashcount, oResult.algorithm - ); + backend_name, backend_hashcount, total_hashcount, oResult.algorithm); size_t t_len = get_timestamp_ms() - t_start; if(t_len > 0xFFFF) t_len = 0xFFFF; iPoolCallTimes.push_back((uint16_t)t_len); + std::string name(backend_name); + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if(bResult) { uint64_t* targets = (uint64_t*)oResult.bResult; log_result_ok(t64_to_diff(targets[3])); - printer::inst()->print_msg(L3, "Result accepted by the pool."); + + if (pvThreads->at(oResult.iThreadId)->backendType == xmrstak::iBackend::BackendType::CPU) + { + printer::inst()->print_msg(L3, "CPU: Share accepted. Pool: %s", pool->get_pool_addr()); + } + else + { + printer::inst()->print_msg(L3, "%s GPU %u: Share accepted. Pool: %s", name.c_str(), pvThreads->at(oResult.iThreadId)->iGpuIndex, pool->get_pool_addr()); + } } else { if(!pool->have_sock_error()) { - printer::inst()->print_msg(L3, "Result rejected by the pool."); + if (pvThreads->at(oResult.iThreadId)->backendType == xmrstak::iBackend::BackendType::CPU) + { + printer::inst()->print_msg(L3, "CPU: Share rejected. Pool: %s", pool->get_pool_addr()); + } + else + { + printer::inst()->print_msg(L3, "%s GPU %u: Share rejected. 
Pool: %s", name.c_str(), pvThreads->at(oResult.iThreadId)->iGpuIndex, pool->get_pool_addr()); + } std::string error = pool->get_call_error(); @@ -477,12 +494,14 @@ void disable_sigpipe() memset(&sa, 0, sizeof(sa)); sa.sa_handler = SIG_IGN; sa.sa_flags = 0; - if (sigaction(SIGPIPE, &sa, 0) == -1) + if(sigaction(SIGPIPE, &sa, 0) == -1) printer::inst()->print_msg(L1, "ERROR: Call to sigaction failed!"); } #else -inline void disable_sigpipe() {} +inline void disable_sigpipe() +{ +} #endif void executor::ex_main() @@ -496,7 +515,7 @@ void executor::ex_main() // \todo collect all backend threads pvThreads = xmrstak::BackendConnector::thread_starter(oWork); - if(pvThreads->size()==0) + if(pvThreads->size() == 0) { printer::inst()->print_msg(L1, "ERROR: No miner backend enabled."); win_exit(); @@ -508,11 +527,11 @@ void executor::ex_main() size_t pc = jconf::inst()->GetPoolCount(); bool dev_tls = true; bool already_have_cli_pool = false; - size_t i=0; + size_t i = 0; for(; i < pc; i++) { jconf::pool_cfg cfg; - jconf::inst()->GetPoolConfig(i, cfg); + jconf::inst()->GetPoolConfig(i, cfg); #ifdef CONF_NO_TLS if(cfg.tls) { @@ -520,7 +539,8 @@ void executor::ex_main() win_exit(); } #endif - if(!cfg.tls) dev_tls = false; + if(!cfg.tls) + dev_tls = false; if(!xmrstak::params::inst().poolURL.empty() && xmrstak::params::inst().poolURL == cfg.sPoolAddr) { @@ -531,11 +551,12 @@ void executor::ex_main() const char* rigid = params.userSetRigid ? params.poolRigid.c_str() : cfg.sRigId; const char* pwd = params.userSetPwd ? 
params.poolPasswd.c_str() : cfg.sPasswd; bool nicehash = cfg.nicehash || params.nicehashMode; + bool tls = params.poolUseTls; - pools.emplace_back(i+1, cfg.sPoolAddr, wallet, rigid, pwd, 9.9, false, params.poolUseTls, cfg.tls_fingerprint, nicehash); + pools.emplace_back(i + 1, cfg.sPoolAddr, wallet, rigid, pwd, 9.9, false, tls, cfg.tls_fingerprint, nicehash); } else - pools.emplace_back(i+1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sRigId, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash); + pools.emplace_back(i + 1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sRigId, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash); } if(!xmrstak::params::inst().poolURL.empty() && !already_have_cli_pool) @@ -547,7 +568,7 @@ void executor::ex_main() win_exit(); } - pools.emplace_back(i+1, params.poolURL.c_str(), params.poolUsername.c_str(), params.poolRigid.c_str(), params.poolPasswd.c_str(), 9.9, false, params.poolUseTls, "", params.nicehashMode); + pools.emplace_back(i + 1, params.poolURL.c_str(), params.poolUsername.c_str(), params.poolRigid.c_str(), params.poolPasswd.c_str(), 9.9, false, params.poolUseTls, "", params.nicehashMode); } switch(jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo()) @@ -604,10 +625,10 @@ void executor::ex_main() push_timed_event(ex_event(EV_HASHRATE_LOOP), jconf::inst()->GetAutohashTime()); size_t cnt = 0; - while (true) + while(true) { ev = oEventQ.pop(); - switch (ev.iName) + switch(ev.iName) { case EV_SOCK_READY: on_sock_ready(ev.iPoolId); @@ -638,9 +659,9 @@ void executor::ex_main() } case EV_PERF_TICK: - for (i = 0; i < pvThreads->size(); i++) + for(i = 0; i < pvThreads->size(); i++) telem->push_perf_value(i, pvThreads->at(i)->iHashCount.load(std::memory_order_relaxed), - pvThreads->at(i)->iTimestamp.load(std::memory_order_relaxed)); + pvThreads->at(i)->iTimestamp.load(std::memory_order_relaxed)); if((cnt++ & 0xF) == 0) //Every 16 ticks { @@ -648,7 +669,7 @@ void executor::ex_main() 
double fTelem; bool normal = true; - for (i = 0; i < pvThreads->size(); i++) + for(i = 0; i < pvThreads->size(); i++) { fTelem = telem->calc_telemetry_data(10000, i); if(std::isnormal(fTelem)) @@ -709,7 +730,7 @@ bool executor::motd_filter_console(std::string& motd) if(motd.size() > motd_max_length) return false; - motd.erase(std::remove_if(motd.begin(), motd.end(), [](int chr)->bool { return !((chr >= 0x20 && chr <= 0x7e) || chr == '\n');}), motd.end()); + motd.erase(std::remove_if(motd.begin(), motd.end(), [](int chr) -> bool { return !((chr >= 0x20 && chr <= 0x7e) || chr == '\n'); }), motd.end()); return motd.size() > 0; } @@ -721,7 +742,7 @@ bool executor::motd_filter_web(std::string& motd) std::string tmp; tmp.reserve(motd.size() + 128); - for(size_t i=0; i < motd.size(); i++) + for(size_t i = 0; i < motd.size(); i++) { char c = motd[i]; switch(c) @@ -763,7 +784,7 @@ void executor::hashrate_report(std::string& out) std::string motd; for(jpsock& pool : pools) { - motd.empty(); + motd.clear(); if(pool.get_pool_motd(motd) && motd_filter_console(motd)) { out.append("Message from ").append(pool.get_pool_addr()).append(":\n"); @@ -774,17 +795,15 @@ void executor::hashrate_report(std::string& out) } char num[32]; - double fTotal[3] = { 0.0, 0.0, 0.0}; + double fTotal[3] = {0.0, 0.0, 0.0}; - for( uint32_t b = 0; b < 4u; ++b) + for(uint32_t b = 0; b < 4u; ++b) { std::vector backEnds; std::copy_if(pvThreads->begin(), pvThreads->end(), std::back_inserter(backEnds), - [&](xmrstak::iBackend* backend) - { + [&](xmrstak::iBackend* backend) { return backend->backendType == b; - } - ); + }); size_t nthd = backEnds.size(); if(nthd != 0) @@ -801,8 +820,8 @@ void executor::hashrate_report(std::string& out) else out.append(1, '\n'); - double fTotalCur[3] = { 0.0, 0.0, 0.0}; - for (i = 0; i < nthd; i++) + double fTotalCur[3] = {0.0, 0.0, 0.0}; + for(i = 0; i < nthd; i++) { double fHps[3]; @@ -883,12 +902,11 @@ void executor::result_report(std::string& out) size_t iGoodRes = 
vMineResults[0].count, iTotalRes = iGoodRes; size_t ln = vMineResults.size(); - for(size_t i=1; i < ln; i++) + for(size_t i = 1; i < ln; i++) iTotalRes += vMineResults[i].count; out.append("RESULT REPORT\n"); - out.append("Currency : "). - append(jconf::inst()->GetMiningCoin()).append("\n"); + out.append("Currency : ").append(jconf::inst()->GetMiningCoin()).append("\n"); if(iTotalRes == 0) { out.append("You haven't found any results yet.\n"); @@ -904,8 +922,7 @@ void executor::result_report(std::string& out) snprintf(num, sizeof(num), " (%.1f %%)\n", 100.0 * iGoodRes / iTotalRes); out.append("Difficulty : ").append(std::to_string(iPoolDiff)).append(1, '\n'); - out.append("Good results : ").append(std::to_string(iGoodRes)).append(" / "). - append(std::to_string(iTotalRes)).append(num); + out.append("Good results : ").append(std::to_string(iGoodRes)).append(" / ").append(std::to_string(iTotalRes)).append(num); if(iPoolCallTimes.size() != 0) { @@ -916,10 +933,10 @@ void executor::result_report(std::string& out) out.append("Pool-side hashes : ").append(std::to_string(iPoolHashes)).append(2, '\n'); out.append("Top 10 best results found:\n"); - for(size_t i=0; i < 10; i += 2) + for(size_t i = 0; i < 10; i += 2) { snprintf(num, sizeof(num), "| %2llu | %16llu | %2llu | %16llu |\n", - int_port(i), int_port(iTopDiff[i]), int_port(i+1), int_port(iTopDiff[i+1])); + int_port(i), int_port(iTopDiff[i]), int_port(i + 1), int_port(iTopDiff[i + 1])); out.append(num); } @@ -927,7 +944,7 @@ void executor::result_report(std::string& out) if(ln > 1) { out.append("| Count | Error text | Last seen |\n"); - for(size_t i=1; i < ln; i++) + for(size_t i = 1; i < ln; i++) { snprintf(num, sizeof(num), "| %5llu | %-32.32s | %s |\n", int_port(vMineResults[i].count), vMineResults[i].msg.c_str(), time_format(date, sizeof(date), vMineResults[i].time)); @@ -958,11 +975,11 @@ void executor::connection_report(std::string& out) out.append("Connected since : \n"); size_t n_calls = iPoolCallTimes.size(); 
- if (n_calls > 1) + if(n_calls > 1) { //Not-really-but-good-enough median - std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end()); - out.append("Pool ping time : ").append(std::to_string(iPoolCallTimes[n_calls/2])).append(" ms\n"); + std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end()); + out.append("Pool ping time : ").append(std::to_string(iPoolCallTimes[n_calls / 2])).append(" ms\n"); } else out.append("Pool ping time : (n/a)\n"); @@ -972,7 +989,7 @@ void executor::connection_report(std::string& out) if(ln > 0) { out.append("| Date | Error text |\n"); - for(size_t i=0; i < ln; i++) + for(size_t i = 0; i < ln; i++) { snprintf(num, sizeof(num), "| %s | %-54.54s |\n", time_format(date, sizeof(date), vSocketLog[i].time), vSocketLog[i].msg.c_str()); @@ -1024,7 +1041,7 @@ void executor::http_hashrate_report(std::string& out) std::string motd; for(jpsock& pool : pools) { - motd.empty(); + motd.clear(); if(pool.get_pool_motd(motd) && motd_filter_web(motd)) { if(!have_motd) @@ -1045,11 +1062,11 @@ void executor::http_hashrate_report(std::string& out) snprintf(buffer, sizeof(buffer), sHtmlHashrateBodyHigh, (unsigned int)nthd + 3); out.append(buffer); - double fTotal[3] = { 0.0, 0.0, 0.0}; + double fTotal[3] = {0.0, 0.0, 0.0}; auto bTypePrev = static_cast(0); std::string name; size_t j = 0; - for(size_t i=0; i < nthd; i++) + for(size_t i = 0; i < nthd; i++) { double fHps[3]; char csThreadTag[25]; @@ -1065,14 +1082,13 @@ void executor::http_hashrate_report(std::string& out) } snprintf(csThreadTag, sizeof(csThreadTag), (99 < nthd) ? "[%s.%03u]:%03u" : ((9 < nthd) ? 
"[%s.%02u]:%02u" : "[%s.%u]:%u"), - name.c_str(), (unsigned int)(j), (unsigned int)i - ); + name.c_str(), (unsigned int)(j), (unsigned int)i); fHps[0] = telem->calc_telemetry_data(10000, i); fHps[1] = telem->calc_telemetry_data(60000, i); fHps[2] = telem->calc_telemetry_data(900000, i); - num_a[0] = num_b[0] = num_c[0] ='\0'; + num_a[0] = num_b[0] = num_c[0] = '\0'; hps_format(fHps[0], num_a, sizeof(num_a)); hps_format(fHps[1], num_b, sizeof(num_b)); hps_format(fHps[2], num_c, sizeof(num_c)); @@ -1085,7 +1101,7 @@ void executor::http_hashrate_report(std::string& out) out.append(buffer); } - num_a[0] = num_b[0] = num_c[0] = num_d[0] ='\0'; + num_a[0] = num_b[0] = num_c[0] = num_d[0] = '\0'; hps_format(fTotal[0], num_a, sizeof(num_a)); hps_format(fTotal[1], num_b, sizeof(num_b)); hps_format(fTotal[2], num_c, sizeof(num_c)); @@ -1102,13 +1118,13 @@ void executor::http_result_report(std::string& out) out.reserve(4096); - snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Result Report", ver_html, "Result Report"); + snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Result Report", ver_html, "Result Report"); out.append(buffer); size_t iGoodRes = vMineResults[0].count, iTotalRes = iGoodRes; size_t ln = vMineResults.size(); - for(size_t i=1; i < ln; i++) + for(size_t i = 1; i < ln; i++) iTotalRes += vMineResults[i].count; double fGoodResPrc = 0.0; @@ -1119,8 +1135,7 @@ void executor::http_result_report(std::string& out) if(iPoolCallTimes.size() > 0) { using namespace std::chrono; - fAvgResTime = ((double)duration_cast(system_clock::now() - tPoolConnTime).count()) - / iPoolCallTimes.size(); + fAvgResTime = ((double)duration_cast(system_clock::now() - tPoolConnTime).count()) / iPoolCallTimes.size(); } snprintf(buffer, sizeof(buffer), sHtmlResultBodyHigh, @@ -1132,7 +1147,7 @@ void executor::http_result_report(std::string& out) out.append(buffer); - for(size_t i=1; i < vMineResults.size(); i++) + for(size_t i = 1; i < vMineResults.size(); i++) { snprintf(buffer, 
sizeof(buffer), sHtmlResultTableRow, vMineResults[i].msg.c_str(), int_port(vMineResults[i].count), time_format(date, sizeof(date), vMineResults[i].time)); @@ -1149,7 +1164,7 @@ void executor::http_connection_report(std::string& out) out.reserve(4096); - snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Connection Report", ver_html, "Connection Report"); + snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Connection Report", ver_html, "Connection Report"); out.append(buffer); jpsock* pool = pick_pool_by_id(current_pool_id); @@ -1157,16 +1172,16 @@ void executor::http_connection_report(std::string& out) pool = pick_pool_by_id(last_usr_pool_id); const char* cdate = "not connected"; - if (pool != nullptr && pool->is_running() && pool->is_logged_in()) + if(pool != nullptr && pool->is_running() && pool->is_logged_in()) cdate = time_format(date, sizeof(date), tPoolConnTime); size_t n_calls = iPoolCallTimes.size(); unsigned int ping_time = 0; - if (n_calls > 1) + if(n_calls > 1) { //Not-really-but-good-enough median - std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end()); - ping_time = iPoolCallTimes[n_calls/2]; + std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end()); + ping_time = iPoolCallTimes[n_calls / 2]; } snprintf(buffer, sizeof(buffer), sHtmlConnectionBodyHigh, @@ -1175,8 +1190,7 @@ void executor::http_connection_report(std::string& out) cdate, ping_time); out.append(buffer); - - for(size_t i=0; i < vSocketLog.size(); i++) + for(size_t i = 0; i < vSocketLog.size(); i++) { snprintf(buffer, sizeof(buffer), sHtmlConnectionTableRow, time_format(date, sizeof(date), vSocketLog[i].time), vSocketLog[i].msg.c_str()); @@ -1205,12 +1219,13 @@ void executor::http_json_report(std::string& out) std::string hr_thds, res_error, cn_error; size_t nthd = pvThreads->size(); - double fTotal[3] = { 0.0, 0.0, 0.0}; + double fTotal[3] = {0.0, 0.0, 0.0}; hr_thds.reserve(nthd * 32); - 
for(size_t i=0; i < nthd; i++) + for(size_t i = 0; i < nthd; i++) { - if(i != 0) hr_thds.append(1, ','); + if(i != 0) + hr_thds.append(1, ','); double fHps[3]; fHps[0] = telem->calc_telemetry_data(10000, i); @@ -1238,7 +1253,7 @@ void executor::http_json_report(std::string& out) size_t iGoodRes = vMineResults[0].count, iTotalRes = iGoodRes; size_t ln = vMineResults.size(); - for(size_t i=1; i < ln; i++) + for(size_t i = 1; i < ln; i++) iTotalRes += vMineResults[i].count; jpsock* pool = pick_pool_by_id(current_pool_id); @@ -1258,10 +1273,11 @@ void executor::http_json_report(std::string& out) char buffer[2048]; res_error.reserve((vMineResults.size() - 1) * 128); - for(size_t i=1; i < vMineResults.size(); i++) + for(size_t i = 1; i < vMineResults.size(); i++) { using namespace std::chrono; - if(i != 1) res_error.append(1, ','); + if(i != 1) + res_error.append(1, ','); snprintf(buffer, sizeof(buffer), sJsonApiResultError, int_port(vMineResults[i].count), int_port(duration_cast(vMineResults[i].time.time_since_epoch()).count()), @@ -1271,18 +1287,19 @@ void executor::http_json_report(std::string& out) size_t n_calls = iPoolCallTimes.size(); size_t iPoolPing = 0; - if (n_calls > 1) + if(n_calls > 1) { //Not-really-but-good-enough median - std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end()); - iPoolPing = iPoolCallTimes[n_calls/2]; + std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end()); + iPoolPing = iPoolCallTimes[n_calls / 2]; } cn_error.reserve(vSocketLog.size() * 256); - for(size_t i=0; i < vSocketLog.size(); i++) + for(size_t i = 0; i < vSocketLog.size(); i++) { using namespace std::chrono; - if(i != 0) cn_error.append(1, ','); + if(i != 0) + cn_error.append(1, ','); snprintf(buffer, sizeof(buffer), sJsonApiConnectionError, int_port(duration_cast(vSocketLog[i].time.time_since_epoch()).count()), @@ -1291,7 +1308,7 @@ void executor::http_json_report(std::string& out) } size_t 
bb_size = 2048 + hr_thds.size() + res_error.size() + cn_error.size(); - std::unique_ptr bigbuf( new char[ bb_size ] ); + std::unique_ptr bigbuf(new char[bb_size]); int bb_len = snprintf(bigbuf.get(), bb_size, sJsonApiFormat, get_version_str().c_str(), hr_thds.c_str(), hr_buffer, a, @@ -1338,8 +1355,7 @@ void executor::get_http_report(ex_event_name ev_id, std::string& data) std::lock_guard lck(httpMutex); assert(pHttpString == nullptr); - assert(ev_id == EV_HTML_HASHRATE || ev_id == EV_HTML_RESULTS - || ev_id == EV_HTML_CONNSTAT || ev_id == EV_HTML_JSON); + assert(ev_id == EV_HTML_HASHRATE || ev_id == EV_HTML_RESULTS || ev_id == EV_HTML_CONNSTAT || ev_id == EV_HTML_JSON); pHttpString = &data; httpReady = std::promise(); diff --git a/xmrstak/misc/executor.hpp b/xmrstak/misc/executor.hpp index be5ee6c2f..385b2f4e3 100644 --- a/xmrstak/misc/executor.hpp +++ b/xmrstak/misc/executor.hpp @@ -1,18 +1,18 @@ #pragma once -#include "thdq.hpp" #include "telemetry.hpp" +#include "thdq.hpp" #include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/donate-level.hpp" #include "xmrstak/misc/environment.hpp" #include "xmrstak/net/msgstruct.hpp" -#include "xmrstak/donate-level.hpp" -#include #include +#include +#include +#include #include #include -#include -#include class jpsock; @@ -27,12 +27,16 @@ class minethd; class executor { -public: + public: static executor* inst() { auto& env = xmrstak::environment::inst(); if(env.pExecutor == nullptr) - env.pExecutor = new executor; + { + std::unique_lock lck(env.update); + if(env.pExecutor == nullptr) + env.pExecutor = new executor; + } return env.pExecutor; }; @@ -43,13 +47,15 @@ class executor inline void push_event(ex_event&& ev) { oEventQ.push(std::move(ev)); } void push_timed_event(ex_event&& ev, size_t sec); -private: + private: struct timed_event { ex_event event; size_t ticks_left; - timed_event(ex_event&& ev, size_t ticks) : event(std::move(ev)), ticks_left(ticks) {} + timed_event(ex_event&& ev, size_t ticks) : + 
event(std::move(ev)), + ticks_left(ticks) {} }; inline void set_timestamp() { dev_timestamp = get_timestamp(); }; @@ -119,7 +125,8 @@ class executor std::chrono::system_clock::time_point time; std::string msg; - sck_error_log(std::string&& err) : msg(std::move(err)) + sck_error_log(std::string&& err) : + msg(std::move(err)) { time = std::chrono::system_clock::now(); } @@ -134,12 +141,16 @@ class executor std::string msg; size_t count; - result_tally() : msg("[OK]"), count(0) + result_tally() : + msg("[OK]"), + count(0) { time = std::chrono::system_clock::now(); } - result_tally(std::string&& err) : msg(std::move(err)), count(1) + result_tally(std::string&& err) : + msg(std::move(err)), + count(1) { time = std::chrono::system_clock::now(); } @@ -161,7 +172,7 @@ class executor std::vector vMineResults; //More result statistics - std::array iTopDiff { { } }; //Initialize to zero + std::array iTopDiff{{}}; //Initialize to zero std::chrono::system_clock::time_point tPoolConnTime; size_t iPoolHashes = 0; @@ -195,4 +206,3 @@ class executor inline size_t sec_to_ticks(size_t sec) { return sec * (1000 / iTickTime); } }; - diff --git a/xmrstak/misc/home_dir.hpp b/xmrstak/misc/home_dir.hpp index 8eb0fa4ea..836c7cc4e 100644 --- a/xmrstak/misc/home_dir.hpp +++ b/xmrstak/misc/home_dir.hpp @@ -4,39 +4,40 @@ #ifdef _WIN32 #include +// this comment avoid that clang format reorders the includes #include namespace { - inline std::string get_home() +inline std::string get_home() +{ + char path[MAX_PATH + 1]; + // get folder "appdata\local" + if(SHGetSpecialFolderPathA(HWND_DESKTOP, path, CSIDL_LOCAL_APPDATA, FALSE)) { - char path[MAX_PATH + 1]; - // get folder "appdata\local" - if (SHGetSpecialFolderPathA(HWND_DESKTOP, path, CSIDL_LOCAL_APPDATA, FALSE)) - { - return path; - } - else - return "."; + return path; } -} // namespace anonymous + else + return "."; +} +} // namespace #else -#include -#include #include +#include +#include namespace { - inline std::string get_home() - { - 
const char *home = "."; +inline std::string get_home() +{ + const char* home = "."; - if ((home = getenv("HOME")) == nullptr) - home = getpwuid(getuid())->pw_dir; + if((home = getenv("HOME")) == nullptr) + home = getpwuid(getuid())->pw_dir; - return home; - } -} // namespace anonymous + return home; +} +} // namespace #endif // _WIN32 diff --git a/xmrstak/misc/jext.hpp b/xmrstak/misc/jext.hpp index 9936fa813..421508989 100644 --- a/xmrstak/misc/jext.hpp +++ b/xmrstak/misc/jext.hpp @@ -9,7 +9,7 @@ using namespace rapidjson; inline const Value* GetObjectMember(const Value& obj, const char* key) { Value::ConstMemberIterator itr = obj.FindMember(key); - if (itr != obj.MemberEnd()) + if(itr != obj.MemberEnd()) return &itr->value; else return nullptr; @@ -48,8 +48,8 @@ inline const Value* GetObjectMember(const Value& obj, const char* key) #elif defined(__NetBSD__) -#include #include +#include #if defined(__BSWAP_RENAME) && !defined(__bswap_32) #define bswap_32(x) bswap32(x) #define bswap_64(x) bswap64(x) diff --git a/xmrstak/misc/telemetry.cpp b/xmrstak/misc/telemetry.cpp index 47442df09..ea73e6281 100644 --- a/xmrstak/misc/telemetry.cpp +++ b/xmrstak/misc/telemetry.cpp @@ -24,9 +24,9 @@ #include "telemetry.hpp" #include "xmrstak/net/msgstruct.hpp" +#include #include #include -#include namespace xmrstak { @@ -36,9 +36,8 @@ telemetry::telemetry(size_t iThd) ppHashCounts = new uint64_t*[iThd]; ppTimestamps = new uint64_t*[iThd]; iBucketTop = new uint32_t[iThd]; - mtx = new std::mutex[iThd]; - for (size_t i = 0; i < iThd; i++) + for(size_t i = 0; i < iThd; i++) { ppHashCounts[i] = new uint64_t[iBucketSize]; ppTimestamps[i] = new uint64_t[iBucketSize]; @@ -51,31 +50,29 @@ telemetry::telemetry(size_t iThd) double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread) { - uint64_t iEarliestHashCnt = 0; uint64_t iEarliestStamp = 0; uint64_t iLatestStamp = 0; uint64_t iLatestHashCnt = 0; bool bHaveFullSet = false; - std::unique_lock lk(mtx[iThread]); uint64_t 
iTimeNow = get_timestamp_ms(); //Start at 1, buckettop points to next empty - for (size_t i = 1; i < iBucketSize; i++) + for(size_t i = 1; i < iBucketSize; i++) { size_t idx = (iBucketTop[iThread] - i) & iBucketMask; //overflow expected here - if (ppTimestamps[iThread][idx] == 0) + if(ppTimestamps[iThread][idx] == 0) break; //That means we don't have the data yet - if (iLatestStamp == 0) + if(iLatestStamp == 0) { iLatestStamp = ppTimestamps[iThread][idx]; iLatestHashCnt = ppHashCounts[iThread][idx]; } - if (iTimeNow - ppTimestamps[iThread][idx] > iLastMillisec) + if(iTimeNow - ppTimestamps[iThread][idx] > iLastMillisec) { bHaveFullSet = true; break; //We are out of the requested time period @@ -84,13 +81,12 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread) iEarliestStamp = ppTimestamps[iThread][idx]; iEarliestHashCnt = ppHashCounts[iThread][idx]; } - lk.unlock(); - if (!bHaveFullSet || iEarliestStamp == 0 || iLatestStamp == 0) + if(!bHaveFullSet || iEarliestStamp == 0 || iLatestStamp == 0) return nan(""); //Don't think that can happen, but just in case - if (iLatestStamp - iEarliestStamp == 0) + if(iLatestStamp - iEarliestStamp == 0) return nan(""); double fHashes, fTime; @@ -103,7 +99,6 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread) void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp) { - std::unique_lock lk(mtx[iThd]); size_t iTop = iBucketTop[iThd]; ppHashCounts[iThd][iTop] = iHashCount; ppTimestamps[iThd][iTop] = iTimestamp; diff --git a/xmrstak/misc/telemetry.hpp b/xmrstak/misc/telemetry.hpp index 580565de2..fb87bcd32 100644 --- a/xmrstak/misc/telemetry.hpp +++ b/xmrstak/misc/telemetry.hpp @@ -2,20 +2,18 @@ #include #include -#include namespace xmrstak { class telemetry { -public: + public: telemetry(size_t iThd); void push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp); double calc_telemetry_data(size_t iLastMillisec, size_t iThread); -private: - 
std::mutex* mtx; + private: constexpr static size_t iBucketSize = 2 << 11; //Power of 2 to simplify calculations constexpr static size_t iBucketMask = iBucketSize - 1; uint32_t* iBucketTop; diff --git a/xmrstak/misc/thdq.hpp b/xmrstak/misc/thdq.hpp index 7a4a5cfe4..2eef30bcf 100644 --- a/xmrstak/misc/thdq.hpp +++ b/xmrstak/misc/thdq.hpp @@ -1,31 +1,37 @@ #pragma once - + +#include +#include #include #include -#include -#include - + template class thdq { -public: + public: T pop() { std::unique_lock mlock(mutex_); - while (queue_.empty()) { cond_.wait(mlock); } + while(queue_.empty()) + { + cond_.wait(mlock); + } auto item = std::move(queue_.front()); queue_.pop(); return item; } - + void pop(T& item) { std::unique_lock mlock(mutex_); - while (queue_.empty()) { cond_.wait(mlock); } + while(queue_.empty()) + { + cond_.wait(mlock); + } item = queue_.front(); queue_.pop(); } - + void push(const T& item) { std::unique_lock mlock(mutex_); @@ -33,7 +39,7 @@ class thdq mlock.unlock(); cond_.notify_one(); } - + void push(T&& item) { std::unique_lock mlock(mutex_); @@ -41,9 +47,9 @@ class thdq mlock.unlock(); cond_.notify_one(); } - -private: + + private: std::queue queue_; std::mutex mutex_; std::condition_variable cond_; -}; +}; diff --git a/xmrstak/misc/uac.cpp b/xmrstak/misc/uac.cpp index 9f940933c..0e4f91c7b 100644 --- a/xmrstak/misc/uac.cpp +++ b/xmrstak/misc/uac.cpp @@ -1,6 +1,7 @@ #ifdef _WIN32 #include "xmrstak/misc/console.hpp" #include "xmrstak/params.hpp" +#include "xmrstak/jconf.hpp" #include #include @@ -9,24 +10,24 @@ BOOL IsElevated() { BOOL fRet = FALSE; HANDLE hToken = NULL; - if (OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken)) + if(OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken)) { TOKEN_ELEVATION Elevation; DWORD cbSize = sizeof(TOKEN_ELEVATION); - if (GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize)) + if(GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize)) fRet = 
Elevation.TokenIsElevated; } - if (hToken) + if(hToken) CloseHandle(hToken); return fRet; } BOOL SelfElevate(const std::string& my_path, const std::string& params) { - if (IsElevated()) + if(IsElevated()) return FALSE; - SHELLEXECUTEINFO shExecInfo = { 0 }; + SHELLEXECUTEINFO shExecInfo = {0}; shExecInfo.cbSize = sizeof(SHELLEXECUTEINFO); shExecInfo.fMask = SEE_MASK_NOCLOSEPROCESS; shExecInfo.hwnd = NULL; @@ -37,7 +38,7 @@ BOOL SelfElevate(const std::string& my_path, const std::string& params) shExecInfo.nShow = SW_SHOW; shExecInfo.hInstApp = NULL; - if (!ShellExecuteEx(&shExecInfo)) + if(!ShellExecuteEx(&shExecInfo)) return FALSE; // Loiter in the background to make scripting easier @@ -56,6 +57,9 @@ VOID RequestElevation() if(!xmrstak::params::inst().allowUAC) { printer::inst()->print_msg(L0, "The miner needs to run as administrator, but you passed --noUAC option. Please remove it or set use_slow_memory to always."); + if (::jconf::inst()->GetSlowMemSetting() == ::jconf::print_warning) + return; + win_exit(); return; } @@ -65,13 +69,13 @@ VOID RequestElevation() BOOL IsWindows10OrNewer() { - OSVERSIONINFOEX osvi = { 0 }; - osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); - osvi.dwMajorVersion = 10; - osvi.dwMinorVersion = 0; - DWORDLONG dwlConditionMask = 0; - VER_SET_CONDITION(dwlConditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL); - VER_SET_CONDITION(dwlConditionMask, VER_MINORVERSION, VER_GREATER_EQUAL); - return ::VerifyVersionInfo(&osvi, VER_MAJORVERSION | VER_MINORVERSION, dwlConditionMask); + OSVERSIONINFOEX osvi = {0}; + osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); + osvi.dwMajorVersion = 10; + osvi.dwMinorVersion = 0; + DWORDLONG dwlConditionMask = 0; + VER_SET_CONDITION(dwlConditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL); + VER_SET_CONDITION(dwlConditionMask, VER_MINORVERSION, VER_GREATER_EQUAL); + return ::VerifyVersionInfo(&osvi, VER_MAJORVERSION | VER_MINORVERSION, dwlConditionMask); } #endif diff --git a/xmrstak/misc/utility.cpp 
b/xmrstak/misc/utility.cpp index 5177d14c2..bf665bda3 100644 --- a/xmrstak/misc/utility.cpp +++ b/xmrstak/misc/utility.cpp @@ -1,21 +1,15 @@ -#include #include - +#include namespace xmrstak { - bool strcmp_i(const std::string& str1, const std::string& str2) - { - if(str1.size() != str2.size()) - return false; - else - return (str1.empty() | str2.empty()) ? - false : - std::equal(str1.begin(), str1.end(),str2.begin(), - [](char c1, char c2) - { - return ::tolower(c1) == ::tolower(c2); - } - ); - } +bool strcmp_i(const std::string& str1, const std::string& str2) +{ + if(str1.size() != str2.size()) + return false; + else + return (str1.empty() | str2.empty()) ? false : std::equal(str1.begin(), str1.end(), str2.begin(), [](char c1, char c2) { + return ::tolower(c1) == ::tolower(c2); + }); +} } // namespace xmrstak diff --git a/xmrstak/misc/utility.hpp b/xmrstak/misc/utility.hpp index 8f2e99fb8..0eb08993d 100644 --- a/xmrstak/misc/utility.hpp +++ b/xmrstak/misc/utility.hpp @@ -4,9 +4,9 @@ namespace xmrstak { - /** case insensitive string compare +/** case insensitive string compare * * @return true if both strings are equal, else false */ - bool strcmp_i(const std::string& str1, const std::string& str2); +bool strcmp_i(const std::string& str1, const std::string& str2); } // namespace xmrstak diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp index 786b18b4f..f9522962f 100644 --- a/xmrstak/net/jpsock.cpp +++ b/xmrstak/net/jpsock.cpp @@ -21,17 +21,17 @@ * */ -#include -#include #include +#include #include +#include #include "jpsock.hpp" -#include "socks.hpp" #include "socket.hpp" +#include "socks.hpp" -#include "xmrstak/misc/executor.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/executor.hpp" #include "xmrstak/misc/jext.hpp" #include "xmrstak/version.hpp" @@ -45,7 +45,9 @@ struct jpsock::call_rsp std::string sCallErr; uint64_t iMessageId; - call_rsp(Value* val) : pCallData(val), iMessageId(0) + call_rsp(Value* val) : + pCallData(val), + iMessageId(0) 
{ bHaveResponse = false; iCallId = 0; @@ -70,7 +72,7 @@ typedef GenericDocument, MemoryPoolAllocator<>, MemoryPoolAllocator<>> Me struct jpsock::opaque_private { - Value oCallValue; + Value oCallValue; MemoryPoolAllocator<> callAllocator; MemoryPoolAllocator<> recvAllocator; @@ -91,12 +93,24 @@ struct jpsock::opaque_private struct jpsock::opq_json_val { const Value* val; - opq_json_val(const Value* val) : val(val) {} + opq_json_val(const Value* val) : + val(val) {} }; jpsock::jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sRigId, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash) : - net_addr(sAddr), usr_login(sLogin), usr_rigid(sRigId), usr_pass(sPassword), tls_fp(tls_fp), pool_id(id), pool_weight(pool_weight), pool(dev_pool), nicehash(nicehash), - connect_time(0), connect_attempts(0), disconnect_time(0), quiet_close(false) + net_addr(sAddr), + usr_login(sLogin), + usr_rigid(sRigId), + usr_pass(sPassword), + tls_fp(tls_fp), + pool_id(id), + pool_weight(pool_weight), + pool(dev_pool), + nicehash(nicehash), + connect_time(0), + connect_attempts(0), + disconnect_time(0), + quiet_close(false) { sock_init(); @@ -245,7 +259,7 @@ bool jpsock::jpsock_thd_main() char buf[iSockBufferSize]; size_t datalen = 0; - while (true) + while(true) { int ret = sck->recv(buf + datalen, sizeof(buf) - datalen); @@ -254,7 +268,7 @@ bool jpsock::jpsock_thd_main() datalen += ret; - if (datalen >= sizeof(buf)) + if(datalen >= sizeof(buf)) { sck->close(false); return set_socket_error("RECEIVE error: data overflow"); @@ -262,12 +276,12 @@ bool jpsock::jpsock_thd_main() char* lnend; char* lnstart = buf; - while ((lnend = (char*)memchr(lnstart, '\n', datalen)) != nullptr) + while((lnend = (char*)memchr(lnstart, '\n', datalen)) != nullptr) { lnend++; int lnlen = lnend - lnstart; - if (!process_line(lnstart, lnlen)) + if(!process_line(lnstart, lnlen)) { sck->close(false); return false; @@ -278,7 +292,7 @@ bool 
jpsock::jpsock_thd_main() } //Got leftover data? Move it to the front - if (datalen > 0 && buf != lnstart) + if(datalen > 0 && buf != lnstart) memmove(buf, lnstart, datalen); } } @@ -291,18 +305,18 @@ bool jpsock::process_line(char* line, size_t len) ++iMessageCnt; /*NULL terminate the line instead of '\n', parsing will add some more NULLs*/ - line[len-1] = '\0'; + line[len - 1] = '\0'; //printf("RECV: %s\n", line); - if (prv->jsonDoc.ParseInsitu(line).HasParseError()) + if(prv->jsonDoc.ParseInsitu(line).HasParseError()) return set_socket_error("PARSE error: Invalid JSON"); - if (!prv->jsonDoc.IsObject()) + if(!prv->jsonDoc.IsObject()) return set_socket_error("PARSE error: Invalid root"); const Value* mt; - if (prv->jsonDoc.HasMember("method")) + if(prv->jsonDoc.HasMember("method")) { mt = GetObjectMember(prv->jsonDoc, "method"); @@ -329,7 +343,7 @@ bool jpsock::process_line(char* line, size_t len) { uint64_t iCallId; mt = GetObjectMember(prv->jsonDoc, "id"); - if (mt == nullptr || !mt->IsUint64()) + if(mt == nullptr || !mt->IsUint64()) return set_socket_error("PARSE error: Protocol error 3"); iCallId = mt->GetUint64(); @@ -338,10 +352,10 @@ bool jpsock::process_line(char* line, size_t len) const char* sError = nullptr; size_t iErrorLen = 0; - if (mt == nullptr || mt->IsNull()) + if(mt == nullptr || mt->IsNull()) { /* If there was no error we need a result */ - if ((mt = GetObjectMember(prv->jsonDoc, "result")) == nullptr) + if((mt = GetObjectMember(prv->jsonDoc, "result")) == nullptr) return set_socket_error("PARSE error: Protocol error 7"); } else @@ -359,7 +373,7 @@ bool jpsock::process_line(char* line, size_t len) } std::unique_lock mlock(call_mutex); - if (prv->oCallRsp.pCallData == nullptr) + if(prv->oCallRsp.pCallData == nullptr) { /*Server sent us a call reply without us making a call*/ mlock.unlock(); @@ -400,7 +414,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message mlock.unlock(); - if (!params->val->IsObject()) + 
if(!params->val->IsObject()) return set_socket_error("PARSE error: Job error 1"); const Value *blob, *jobid, *target, *motd, *blk_height; @@ -410,7 +424,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message motd = GetObjectMember(*params->val, "motd"); blk_height = GetObjectMember(*params->val, "height"); - if (jobid == nullptr || blob == nullptr || target == nullptr || + if(jobid == nullptr || blob == nullptr || target == nullptr || !jobid->IsString() || !blob->IsString() || !target->IsString()) { return set_socket_error("PARSE error: Job error 2"); @@ -421,7 +435,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message std::unique_lock lck(motd_mutex); if(motd->GetStringLength() > 0) { - pool_motd.resize(motd->GetStringLength()/2 + 1); + pool_motd.resize(motd->GetStringLength() / 2 + 1); if(!hex2bin(motd->GetString(), motd->GetStringLength(), (unsigned char*)&pool_motd.front())) pool_motd.clear(); } @@ -429,7 +443,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message pool_motd.clear(); } - if (jobid->GetStringLength() >= sizeof(pool_job::sJobID)) // Note >= + if(jobid->GetStringLength() >= sizeof(pool_job::sJobID)) // Note >= return set_socket_error("PARSE error: Job error 3"); pool_job oPoolJob; @@ -437,10 +451,10 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message const uint32_t iWorkLen = blob->GetStringLength() / 2; oPoolJob.iWorkLen = iWorkLen; - if (iWorkLen > sizeof(pool_job::bWorkBlob)) + if(iWorkLen > sizeof(pool_job::bWorkBlob)) return set_socket_error("PARSE error: Invalid job length. 
Are you sure you are mining the correct coin?"); - if (!hex2bin(blob->GetString(), iWorkLen * 2, oPoolJob.bWorkBlob)) + if(!hex2bin(blob->GetString(), iWorkLen * 2, oPoolJob.bWorkBlob)) return set_socket_error("PARSE error: Job error 4"); // lock reading of oCurrentJob @@ -479,7 +493,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message return set_socket_error("PARSE error: Job error 5"); iJobDiff = t64_to_diff(oPoolJob.iTarget); - + if(blk_height != nullptr && blk_height->IsUint64()) oPoolJob.iBlockHeight = bswap_64(blk_height->GetUint64()); @@ -589,10 +603,10 @@ bool jpsock::cmd_login() uint64_t messageId = 0; /*Normal error conditions (failed login etc..) will end here*/ - if (!cmd_ret_wait(cmd_buffer, oResult, messageId)) + if(!cmd_ret_wait(cmd_buffer, oResult, messageId)) return false; - if (!oResult.val->IsObject()) + if(!oResult.val->IsObject()) { set_socket_error("PARSE error: Login protocol error 1"); disconnect(); @@ -603,14 +617,14 @@ bool jpsock::cmd_login() const Value* job = GetObjectMember(*oResult.val, "job"); const Value* ext = GetObjectMember(*oResult.val, "extensions"); - if (id == nullptr || job == nullptr || !id->IsString()) + if(id == nullptr || job == nullptr || !id->IsString()) { set_socket_error("PARSE error: Login protocol error 2"); disconnect(); return false; } - if (id->GetStringLength() >= sizeof(sMinerId)) + if(id->GetStringLength() >= sizeof(sMinerId)) { set_socket_error("PARSE error: Login protocol error 3"); disconnect(); @@ -622,7 +636,7 @@ bool jpsock::cmd_login() if(ext != nullptr && ext->IsArray()) { - for(size_t i=0; i < ext->Size(); i++) + for(size_t i = 0; i < ext->Size(); i++) { const Value& jextname = ext->GetArray()[i]; @@ -693,7 +707,7 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes sResult[64] = '\0'; snprintf(cmd_buffer, sizeof(cmd_buffer), 
"{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s%s%s%s%s},\"id\":1}\n", - sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations,sMemory, sMemAlignBytes); + sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations, sMemory, sMemAlignBytes); uint64_t messageId = 0; opq_json_val oResult(nullptr); @@ -732,13 +746,13 @@ bool jpsock::get_pool_motd(std::string& strin) return false; } -inline unsigned char hf_hex2bin(char c, bool &err) +inline unsigned char hf_hex2bin(char c, bool& err) { - if (c >= '0' && c <= '9') + if(c >= '0' && c <= '9') return c - '0'; - else if (c >= 'a' && c <= 'f') + else if(c >= 'a' && c <= 'f') return c - 'a' + 0xA; - else if (c >= 'A' && c <= 'F') + else if(c >= 'A' && c <= 'F') return c - 'A' + 0xA; err = true; @@ -748,17 +762,18 @@ inline unsigned char hf_hex2bin(char c, bool &err) bool jpsock::hex2bin(const char* in, unsigned int len, unsigned char* out) { bool error = false; - for (unsigned int i = 0; i < len; i += 2) + for(unsigned int i = 0; i < len; i += 2) { out[i / 2] = (hf_hex2bin(in[i], error) << 4) | hf_hex2bin(in[i + 1], error); - if (error) return false; + if(error) + return false; } return true; } inline char hf_bin2hex(unsigned char c) { - if (c <= 0x9) + if(c <= 0x9) return '0' + c; else return 'a' - 0xA + c; @@ -766,7 +781,7 @@ inline char hf_bin2hex(unsigned char c) void jpsock::bin2hex(const unsigned char* in, unsigned int len, char* out) { - for (unsigned int i = 0; i < len; i++) + for(unsigned int i = 0; i < len; i++) { out[i * 2] = hf_bin2hex((in[i] & 0xF0) >> 4); out[i * 2 + 1] = hf_bin2hex(in[i] & 0x0F); diff --git a/xmrstak/net/jpsock.hpp b/xmrstak/net/jpsock.hpp index 949764813..4ad6ebbbc 100644 --- a/xmrstak/net/jpsock.hpp +++ b/xmrstak/net/jpsock.hpp @@ -1,15 +1,14 @@ #pragma once -#include "xmrstak/backend/iBackend.hpp" #include "msgstruct.hpp" +#include "xmrstak/backend/iBackend.hpp" #include 
"xmrstak/jconf.hpp" -#include #include #include -#include +#include #include - +#include /* Our pool can have two kinds of errors: - Parsing or connection error @@ -27,7 +26,7 @@ class base_socket; class jpsock { -public: + public: jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sRigId, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash); ~jpsock(); @@ -55,7 +54,12 @@ class jpsock inline bool is_logged_in() { return bLoggedIn; } inline bool is_dev_pool() { return pool; } inline size_t get_pool_id() { return pool_id; } - inline bool get_disconnects(size_t& att, size_t& time) { att = connect_attempts; time = disconnect_time != 0 ? get_timestamp() - disconnect_time + 1 : 0; return pool && usr_login[0]; } + inline bool get_disconnects(size_t& att, size_t& time) + { + att = connect_attempts; + time = disconnect_time != 0 ? get_timestamp() - disconnect_time + 1 : 0; + return pool && usr_login[0]; + } inline const char* get_pool_addr() { return net_addr.c_str(); } inline const char* get_tls_fp() { return tls_fp.c_str(); } inline const char* get_rigid() { return usr_rigid.c_str(); } @@ -77,7 +81,7 @@ class jpsock bool set_socket_error_strerr(const char* a); bool set_socket_error_strerr(const char* a, int res); -private: + private: std::string net_addr; std::string usr_login; std::string usr_rigid; @@ -142,4 +146,3 @@ class jpsock uint64_t iMessageCnt = 0; uint64_t iLastMessageId = 0; }; - diff --git a/xmrstak/net/msgstruct.hpp b/xmrstak/net/msgstruct.hpp index 33980bf42..3cfce3c6f 100644 --- a/xmrstak/net/msgstruct.hpp +++ b/xmrstak/net/msgstruct.hpp @@ -2,25 +2,29 @@ #include "xmrstak/backend/cryptonight.hpp" -#include -#include #include +#include +#include // Structures that we use to pass info between threads constructors are here just to make // the stack allocation take up less space, heap is a shared resource that needs locks too of course struct pool_job { - char sJobID[64]; - uint8_t 
bWorkBlob[128]; - uint64_t iTarget; - uint32_t iWorkLen; - uint32_t iSavedNonce; - uint64_t iBlockHeight = uint64_t(-1); - - pool_job() : iWorkLen(0), iSavedNonce(0) {} + char sJobID[64]; + uint8_t bWorkBlob[128]; + uint64_t iTarget; + uint32_t iWorkLen; + uint32_t iSavedNonce; + uint64_t iBlockHeight = uint64_t(-1); + + pool_job() : + iWorkLen(0), + iSavedNonce(0) {} pool_job(const char* sJobID, uint64_t iTarget, const uint8_t* bWorkBlob, uint32_t iWorkLen) : - iTarget(iTarget), iWorkLen(iWorkLen), iSavedNonce(0) + iTarget(iTarget), + iWorkLen(iWorkLen), + iSavedNonce(0) { assert(iWorkLen <= sizeof(pool_job::bWorkBlob)); memcpy(this->sJobID, sJobID, sizeof(pool_job::sJobID)); @@ -30,15 +34,17 @@ struct pool_job struct job_result { - uint8_t bResult[32]; - char sJobID[64]; - uint32_t iNonce; - uint32_t iThreadId; + uint8_t bResult[32]; + char sJobID[64]; + uint32_t iNonce; + uint32_t iThreadId; xmrstak_algo algorithm = {invalid_algo}; job_result() {} job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId, const xmrstak_algo& algo) : - iNonce(iNonce), iThreadId(iThreadId), algorithm(algo) + iNonce(iNonce), + iThreadId(iThreadId), + algorithm(algo) { memcpy(this->sJobID, sJobID, sizeof(job_result::sJobID)); memcpy(this->bResult, bResult, sizeof(job_result::bResult)); @@ -51,8 +57,12 @@ struct sock_err bool silent; sock_err() {} - sock_err(std::string&& err, bool silent) : sSocketError(std::move(err)), silent(silent) { } - sock_err(sock_err&& from) : sSocketError(std::move(from.sSocketError)), silent(from.silent) {} + sock_err(std::string&& err, bool silent) : + sSocketError(std::move(err)), + silent(silent) {} + sock_err(sock_err&& from) : + sSocketError(std::move(from.sSocketError)), + silent(from.silent) {} sock_err& operator=(sock_err&& from) { @@ -62,7 +72,7 @@ struct sock_err return *this; } - ~sock_err() { } + ~sock_err() {} sock_err(sock_err const&) = delete; sock_err& operator=(sock_err const&) = delete; @@ -73,13 +83,30 @@ 
struct gpu_res_err { size_t idx; // GPU index const char* error_str; - gpu_res_err(const char* error_str, size_t idx) : error_str(error_str), idx(idx) {} + gpu_res_err(const char* error_str, size_t idx) : + error_str(error_str), + idx(idx) {} }; -enum ex_event_name { EV_INVALID_VAL, EV_SOCK_READY, EV_SOCK_ERROR, EV_GPU_RES_ERROR, - EV_POOL_HAVE_JOB, EV_MINER_HAVE_RESULT, EV_PERF_TICK, EV_EVAL_POOL_CHOICE, - EV_USR_HASHRATE, EV_USR_RESULTS, EV_USR_CONNSTAT, EV_HASHRATE_LOOP, - EV_HTML_HASHRATE, EV_HTML_RESULTS, EV_HTML_CONNSTAT, EV_HTML_JSON }; +enum ex_event_name +{ + EV_INVALID_VAL, + EV_SOCK_READY, + EV_SOCK_ERROR, + EV_GPU_RES_ERROR, + EV_POOL_HAVE_JOB, + EV_MINER_HAVE_RESULT, + EV_PERF_TICK, + EV_EVAL_POOL_CHOICE, + EV_USR_HASHRATE, + EV_USR_RESULTS, + EV_USR_CONNSTAT, + EV_HASHRATE_LOOP, + EV_HTML_HASHRATE, + EV_HTML_RESULTS, + EV_HTML_CONNSTAT, + EV_HTML_JSON +}; /* This is how I learned to stop worrying and love c++11 =). @@ -96,20 +123,37 @@ struct ex_event ex_event_name iName; size_t iPoolId; - union - { + union { pool_job oPoolJob; job_result oJobResult; sock_err oSocketError; gpu_res_err oGpuError; }; - ex_event() { iName = EV_INVALID_VAL; iPoolId = 0;} - ex_event(const char* gpu_err, size_t gpu_idx, size_t id) : iName(EV_GPU_RES_ERROR), iPoolId(id), oGpuError(gpu_err, gpu_idx) {} - ex_event(std::string&& err, bool silent, size_t id) : iName(EV_SOCK_ERROR), iPoolId(id), oSocketError(std::move(err), silent) { } - ex_event(job_result dat, size_t id) : iName(EV_MINER_HAVE_RESULT), iPoolId(id), oJobResult(dat) {} - ex_event(pool_job dat, size_t id) : iName(EV_POOL_HAVE_JOB), iPoolId(id), oPoolJob(dat) {} - ex_event(ex_event_name ev, size_t id = 0) : iName(ev), iPoolId(id) {} + ex_event() + { + iName = EV_INVALID_VAL; + iPoolId = 0; + } + ex_event(const char* gpu_err, size_t gpu_idx, size_t id) : + iName(EV_GPU_RES_ERROR), + iPoolId(id), + oGpuError(gpu_err, gpu_idx) {} + ex_event(std::string&& err, bool silent, size_t id) : + iName(EV_SOCK_ERROR), + 
iPoolId(id), + oSocketError(std::move(err), silent) {} + ex_event(job_result dat, size_t id) : + iName(EV_MINER_HAVE_RESULT), + iPoolId(id), + oJobResult(dat) {} + ex_event(pool_job dat, size_t id) : + iName(EV_POOL_HAVE_JOB), + iPoolId(id), + oPoolJob(dat) {} + ex_event(ex_event_name ev, size_t id = 0) : + iName(ev), + iPoolId(id) {} // Delete the copy operators to make sure we are moving only what is needed ex_event(ex_event const&) = delete; @@ -123,7 +167,7 @@ struct ex_event switch(iName) { case EV_SOCK_ERROR: - new (&oSocketError) sock_err(std::move(from.oSocketError)); + new(&oSocketError) sock_err(std::move(from.oSocketError)); break; case EV_MINER_HAVE_RESULT: oJobResult = from.oJobResult; @@ -151,7 +195,7 @@ struct ex_event switch(iName) { case EV_SOCK_ERROR: - new (&oSocketError) sock_err(); + new(&oSocketError) sock_err(); oSocketError = std::move(from.oSocketError); break; case EV_MINER_HAVE_RESULT: diff --git a/xmrstak/net/socket.cpp b/xmrstak/net/socket.cpp index 6fcb454cd..6a6abac15 100644 --- a/xmrstak/net/socket.cpp +++ b/xmrstak/net/socket.cpp @@ -28,16 +28,17 @@ #include "xmrstak/misc/executor.hpp" #ifndef CONF_NO_TLS -#include #include #include +#include #ifndef OPENSSL_THREADS #error OpenSSL was compiled without thread support #endif #endif -plain_socket::plain_socket(jpsock* err_callback) : pCallback(err_callback) +plain_socket::plain_socket(jpsock* err_callback) : + pCallback(err_callback) { hSocket = INVALID_SOCKET; pSockAddr = nullptr; @@ -50,58 +51,58 @@ bool plain_socket::set_hostname(const char* sAddr) sock_closed = false; size_t ln = strlen(sAddr); - if (ln >= sizeof(sAddrMb)) + if(ln >= sizeof(sAddrMb)) return pCallback->set_socket_error("CONNECT error: Pool address overflow."); memcpy(sAddrMb, sAddr, ln); sAddrMb[ln] = '\0'; - if ((sTmp = strstr(sAddrMb, "//")) != nullptr) + if((sTmp = strstr(sAddrMb, "//")) != nullptr) { sTmp += 2; memmove(sAddrMb, sTmp, strlen(sTmp) + 1); } - if ((sPort = strchr(sAddrMb, ':')) == nullptr) + 
if((sPort = strchr(sAddrMb, ':')) == nullptr) return pCallback->set_socket_error("CONNECT error: Pool port number not specified, please use format :."); sPort[0] = '\0'; sPort++; - addrinfo hints = { 0 }; + addrinfo hints = {0}; hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_protocol = IPPROTO_TCP; pAddrRoot = nullptr; int err; - if ((err = getaddrinfo(sAddrMb, sPort, &hints, &pAddrRoot)) != 0) + if((err = getaddrinfo(sAddrMb, sPort, &hints, &pAddrRoot)) != 0) return pCallback->set_socket_error_strerr("CONNECT error: GetAddrInfo: ", err); - addrinfo *ptr = pAddrRoot; + addrinfo* ptr = pAddrRoot; std::vector ipv4; std::vector ipv6; - while (ptr != nullptr) + while(ptr != nullptr) { - if (ptr->ai_family == AF_INET) + if(ptr->ai_family == AF_INET) ipv4.push_back(ptr); - if (ptr->ai_family == AF_INET6) + if(ptr->ai_family == AF_INET6) ipv6.push_back(ptr); ptr = ptr->ai_next; } - if (ipv4.empty() && ipv6.empty()) + if(ipv4.empty() && ipv6.empty()) { freeaddrinfo(pAddrRoot); pAddrRoot = nullptr; return pCallback->set_socket_error("CONNECT error: I found some DNS records but no IPv4 or IPv6 addresses."); } - else if (!ipv4.empty() && ipv6.empty()) + else if(!ipv4.empty() && ipv6.empty()) pSockAddr = ipv4[rand() % ipv4.size()]; - else if (ipv4.empty() && !ipv6.empty()) + else if(ipv4.empty() && !ipv6.empty()) pSockAddr = ipv6[rand() % ipv6.size()]; - else if (!ipv4.empty() && !ipv6.empty()) + else if(!ipv4.empty() && !ipv6.empty()) { if(jconf::inst()->PreferIpv4()) pSockAddr = ipv4[rand() % ipv4.size()]; @@ -111,7 +112,7 @@ bool plain_socket::set_hostname(const char* sAddr) hSocket = socket(pSockAddr->ai_family, pSockAddr->ai_socktype, pSockAddr->ai_protocol); - if (hSocket == INVALID_SOCKET) + if(hSocket == INVALID_SOCKET) { freeaddrinfo(pAddrRoot); pAddrRoot = nullptr; @@ -120,7 +121,7 @@ bool plain_socket::set_hostname(const char* sAddr) int flag = 1; /* If it fails, it fails, we won't loose too much sleep over it */ - setsockopt(hSocket, 
IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int)); + setsockopt(hSocket, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(int)); return true; } @@ -133,7 +134,7 @@ bool plain_socket::connect() freeaddrinfo(pAddrRoot); pAddrRoot = nullptr; - if (ret != 0) + if(ret != 0) return pCallback->set_socket_error_strerr("CONNECT error: "); else return true; @@ -158,10 +159,10 @@ bool plain_socket::send(const char* buf) { size_t pos = 0; size_t slen = strlen(buf); - while (pos != slen) + while(pos != slen) { int ret = ::send(hSocket, buf + pos, slen - pos, 0); - if (ret == SOCKET_ERROR) + if(ret == SOCKET_ERROR) { pCallback->set_socket_error_strerr("SEND error: "); return false; @@ -184,7 +185,8 @@ void plain_socket::close(bool free) } #ifndef CONF_NO_TLS -tls_socket::tls_socket(jpsock* err_callback) : pCallback(err_callback) +tls_socket::tls_socket(jpsock* err_callback) : + pCallback(err_callback) { } @@ -193,7 +195,7 @@ void tls_socket::print_error() BIO* err_bio = BIO_new(BIO_s_mem()); ERR_print_errors(err_bio); - char *buf = nullptr; + char* buf = nullptr; size_t len = BIO_get_mem_data(err_bio, &buf); if(buf == nullptr) @@ -247,7 +249,7 @@ bool tls_socket::set_hostname(const char* sAddr) int flag = 1; /* If it fails, it fails, we won't loose too much sleep over it */ - setsockopt(BIO_get_fd(bio, nullptr), IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int)); + setsockopt(BIO_get_fd(bio, nullptr), IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(int)); if(BIO_set_conn_hostname(bio, sAddr) != 1) { @@ -327,7 +329,7 @@ bool tls_socket::connect() BIO_flush(b64); const char* conf_md = pCallback->get_tls_fp(); - char *b64_md = nullptr; + char* b64_md = nullptr; size_t b64_len = BIO_get_mem_data(bmem, &b64_md); if(strlen(conf_md) == 0) @@ -393,4 +395,3 @@ void tls_socket::close(bool free) } } #endif - diff --git a/xmrstak/net/socket.hpp b/xmrstak/net/socket.hpp index b09142d56..88b665adf 100644 --- a/xmrstak/net/socket.hpp +++ b/xmrstak/net/socket.hpp @@ -1,26 +1,26 @@ #pragma 
once -#include #include "socks.hpp" +#include class jpsock; class base_socket { -public: + public: virtual bool set_hostname(const char* sAddr) = 0; virtual bool connect() = 0; virtual int recv(char* buf, unsigned int len) = 0; virtual bool send(const char* buf) = 0; virtual void close(bool free) = 0; -protected: + protected: std::atomic sock_closed; }; class plain_socket : public base_socket { -public: + public: plain_socket(jpsock* err_callback); bool set_hostname(const char* sAddr); @@ -29,10 +29,10 @@ class plain_socket : public base_socket bool send(const char* buf); void close(bool free); -private: + private: jpsock* pCallback; - addrinfo *pSockAddr; - addrinfo *pAddrRoot; + addrinfo* pSockAddr; + addrinfo* pAddrRoot; SOCKET hSocket; }; @@ -42,7 +42,7 @@ typedef struct ssl_st SSL; class tls_socket : public base_socket { -public: + public: tls_socket(jpsock* err_callback); bool set_hostname(const char* sAddr); @@ -51,7 +51,7 @@ class tls_socket : public base_socket bool send(const char* buf); void close(bool free); -private: + private: void init_ctx(); void print_error(); diff --git a/xmrstak/net/socks.hpp b/xmrstak/net/socks.hpp index 86749e527..600e4d276 100644 --- a/xmrstak/net/socks.hpp +++ b/xmrstak/net/socks.hpp @@ -2,18 +2,19 @@ #ifdef _WIN32 #ifndef _WIN32_WINNT -#define _WIN32_WINNT 0x0601 /* Windows 7 */ +#define _WIN32_WINNT 0x0601 /* Windows 7 */ #endif + #include #include +// this comment disable clang include reordering for windows.h #include - inline void sock_init() { static bool bWSAInit = false; - if (!bWSAInit) + if(!bWSAInit) { WSADATA wsaData; WSAStartup(MAKEWORD(2, 2), &wsaData); @@ -56,20 +57,20 @@ inline const char* sock_gai_strerror(int err, char* buf, size_t len) #else /* Assume that any non-Windows platform uses POSIX-style sockets instead. 
*/ -#include #include -#include /* Needed for getaddrinfo() and freeaddrinfo() */ -#include /* Needed for close() */ #include -#include +#include /* Needed for getaddrinfo() and freeaddrinfo() */ #include /* Needed for IPPROTO_TCP */ #include +#include +#include +#include /* Needed for close() */ inline void sock_init() {} typedef int SOCKET; -#define INVALID_SOCKET (-1) -#define SOCKET_ERROR (-1) +#define INVALID_SOCKET (-1) +#define SOCKET_ERROR (-1) inline void sock_close(SOCKET s) { diff --git a/xmrstak/params.hpp b/xmrstak/params.hpp index 146aaa42f..5bfbac381 100644 --- a/xmrstak/params.hpp +++ b/xmrstak/params.hpp @@ -15,7 +15,11 @@ struct params { auto& env = environment::inst(); if(env.pParams == nullptr) - env.pParams = new params; + { + std::unique_lock lck(env.update); + if(env.pParams == nullptr) + env.pParams = new params; + } return *env.pParams; } @@ -25,6 +29,8 @@ struct params bool AMDCache; bool useNVIDIA; bool useCPU; + std::string amdGpus; + std::string nvidiaGpus; // user selected OpenCL vendor std::string openCLVendor; @@ -50,6 +56,9 @@ struct params std::string configFileNVIDIA; std::string configFileCPU; + std::string outputFile; + int h_print_time = -1; + bool allowUAC = true; std::string minerArg0; std::string minerArgs; @@ -73,8 +82,8 @@ struct params rootAMDCacheDir(get_home() + "/.openclcache/"), configFileCPU("cpu.txt"), configFileNVIDIA("nvidia.txt") - {} - + { + } }; } // namespace xmrstak diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index ea3a276aa..eb57a3f04 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -19,19 +19,17 @@ POOLCONF], /* * Currency to mine. 
Supported values: * - * aeon7 (use this for Aeon's new PoW) * bbscoin (automatic switch with block version 3 to cryptonight_v7) * bittube (uses cryptonight_bittube2 algorithm) - * freehaven * graft * haven (automatic switch with block version 3 to cryptonight_haven) - * intense + * lethean * masari - * monero (use this to support Monero's Oct 2018 fork) * qrl - Quantum Resistant Ledger * ryo * turtlecoin * plenteum + * torque * xcash * * Native algorithms which do not depend on any block versions: @@ -49,7 +47,7 @@ POOLCONF], * cryptonight_v7 * cryptonight_v8 * cryptonight_v8_double (used by xcash) - * cryptonight_v8_half (used by masari and stellite) + * cryptonight_v8_half (used by masari and torque) * cryptonight_v8_reversewaltz (used by graft) * cryptonight_v8_zelerius * # 4MiB scratchpad memory diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index c9fa175ac..51c4e4e63 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -2,7 +2,9 @@ //! git will put "#define GIT_ARCHIVE 1" on the next line inside archives. 
$Format:%n#define GIT_ARCHIVE 1$ #if defined(GIT_ARCHIVE) && !defined(GIT_COMMIT_HASH) -#define GIT_COMMIT_HASH $Format:%h$ +#define GIT_COMMIT_HASH \ + $Format: \ + % h$ #endif #ifndef GIT_COMMIT_HASH @@ -18,7 +20,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.10.2" +#define XMR_STAK_VERSION "2.10.8" #if defined(_WIN32) #define OS_TYPE "win" @@ -35,10 +37,10 @@ #define XMRSTAK_PP_TOSTRING1(str) #str #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str) -#define VERSION_LONG XMR_STAK_NAME "/" XMR_STAK_VERSION "/" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) "/" XMRSTAK_PP_TOSTRING(GIT_BRANCH) "/" OS_TYPE "/" XMRSTAK_PP_TOSTRING(BACKEND_TYPE) "/" +#define VERSION_LONG XMR_STAK_NAME "/" XMR_STAK_VERSION "/" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) "/" XMRSTAK_PP_TOSTRING(GIT_BRANCH) "/" OS_TYPE "/" XMRSTAK_PP_TOSTRING(BACKEND_TYPE) "/" #define VERSION_SHORT XMR_STAK_NAME " " XMR_STAK_VERSION " " XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) #define VERSION_HTML "v" XMR_STAK_VERSION "-" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) -const char ver_long[] = VERSION_LONG; +const char ver_long[] = VERSION_LONG; const char ver_short[] = VERSION_SHORT; const char ver_html[] = VERSION_HTML; diff --git a/xmrstak/version.hpp b/xmrstak/version.hpp index cdf82f30d..85905f01c 100644 --- a/xmrstak/version.hpp +++ b/xmrstak/version.hpp @@ -1,8 +1,8 @@ #pragma once +#include "donate-level.hpp" #include #include -#include "donate-level.hpp" extern const char ver_long[]; extern const char ver_short[]; @@ -10,7 +10,7 @@ extern const char ver_html[]; inline std::string get_version_str() { - return std::string(ver_long) + std::to_string(uint32_t(fDevDonationLevel * 1000)) ; + return std::string(ver_long) + std::to_string(uint32_t(fDevDonationLevel * 1000)); } inline std::string get_version_str_short()
Error text
CountLast seen
Error text
CountLast seen
%s
%llu%s