From 95f7a6b335bc911999e9bc6975b4857998503a19 Mon Sep 17 00:00:00 2001 From: Timothy Baldridge Date: Sun, 24 Jul 2022 21:47:33 -0600 Subject: [PATCH] Some failed attempts and seeing if we could make hashing go faster on the GPU --- .../Program.cs | 28 ++- ...abbajack.Hashing.xxHash64.Benchmark.csproj | 7 +- Wabbajack.Hashing.xxHash64.GPU/Algorithm.cs | 181 ++++++++++++++++++ .../AlgorithmState.cs | 40 ++++ .../Wabbajack.Hashing.xxHash64.GPU.csproj | 13 ++ Wabbajack.Hashing.xxHash64.Test/GPUTests.cs | 39 ++++ .../Wabbajack.Hashing.xxHash64.Test.csproj | 1 + Wabbajack.sln | 7 + 8 files changed, 311 insertions(+), 5 deletions(-) create mode 100644 Wabbajack.Hashing.xxHash64.GPU/Algorithm.cs create mode 100644 Wabbajack.Hashing.xxHash64.GPU/AlgorithmState.cs create mode 100644 Wabbajack.Hashing.xxHash64.GPU/Wabbajack.Hashing.xxHash64.GPU.csproj create mode 100644 Wabbajack.Hashing.xxHash64.Test/GPUTests.cs diff --git a/Wabbajack.Hashing.xxHash64.Benchmark/Program.cs b/Wabbajack.Hashing.xxHash64.Benchmark/Program.cs index 9fa0c598..cdc48740 100644 --- a/Wabbajack.Hashing.xxHash64.Benchmark/Program.cs +++ b/Wabbajack.Hashing.xxHash64.Benchmark/Program.cs @@ -2,6 +2,11 @@ using System.Data.HashFunction.xxHash; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Running; +using ILGPU; +using ILGPU.Runtime; +using ILGPU.Runtime.Cuda; +using ILGPU.Runtime.OpenCL; +using Wabbajack.Hashing.xxHash64.GPU; namespace Wabbajack.Hashing.xxHash64.Benchmark; @@ -9,7 +14,7 @@ internal class Program { private static void Main(string[] args) { - BenchmarkRunner.Run(); + BenchmarkRunner.Run(); } } @@ -18,10 +23,17 @@ internal class Program public class xxHashBenchmark { private readonly byte[] _data; + private readonly Context _context; + private readonly Accelerator _gpu; + private readonly Accelerator _cpu; public xxHashBenchmark() { - _data = new byte[1024 * 1024]; + _data = new byte[1024 * 1024 * 1024]; + + _context = Context.CreateDefault(); + _gpu = _context.GetPreferredDevice(false).CreateAccelerator(_context); + _cpu = _context.GetPreferredDevice(true).CreateAccelerator(_context); } [Benchmark] @@ -37,6 +49,18 @@ public class xxHashBenchmark var config = new xxHashConfig {HashSizeInBits = 64}; BitConverter.ToUInt64(xxHashFactory.Instance.Create(config).ComputeHash(_data).Hash); } + + [Benchmark] + public void GPUCode() + { + Algorithm.HashBytes(_gpu, _data); + } + + [Benchmark] + public void CPUCode() + { + Algorithm.HashBytes(_cpu, _data); + } } [MemoryDiagnoser] diff --git a/Wabbajack.Hashing.xxHash64.Benchmark/Wabbajack.Hashing.xxHash64.Benchmark.csproj b/Wabbajack.Hashing.xxHash64.Benchmark/Wabbajack.Hashing.xxHash64.Benchmark.csproj index 9d9e5074..b9e64a7b 100644 --- a/Wabbajack.Hashing.xxHash64.Benchmark/Wabbajack.Hashing.xxHash64.Benchmark.csproj +++ b/Wabbajack.Hashing.xxHash64.Benchmark/Wabbajack.Hashing.xxHash64.Benchmark.csproj @@ -8,12 +8,13 @@ - - + + - + + diff --git a/Wabbajack.Hashing.xxHash64.GPU/Algorithm.cs b/Wabbajack.Hashing.xxHash64.GPU/Algorithm.cs new file mode 100644 index 00000000..3f89297b --- /dev/null +++ b/Wabbajack.Hashing.xxHash64.GPU/Algorithm.cs @@ -0,0 +1,181 @@ +using ILGPU; +using ILGPU.Runtime; +using ILGPU.Util; + +namespace Wabbajack.Hashing.xxHash64.GPU; + +public class Algorithm +{ + private static readonly ulong[] Primes64 = + { + 11400714785074694791UL, + 14029467366897019727UL, + 1609587929392839161UL, + 9650029242287828579UL, + 2870177450012600261UL + }; + + private const ulong Prime0 = 11400714785074694791UL; + private const ulong Prime1 = 14029467366897019727UL; + private const ulong Prime2 = 1609587929392839161UL; + private const ulong Prime3 = 9650029242287828579UL; + private const ulong Prime4 = 2870177450012600261UL; + + private const ulong Seed = 0L; + + public ulong HashBytes(byte[] data) + { + return HashBytes(Accelerator.Current, data); + } + + public static ulong HashBytes(Accelerator accelerator, byte[] data) + { + var initialSize = (data.Length >> 5) << 5; + + var gpuData = accelerator.Allocate1D(initialSize); + gpuData.CopyFromCPU(data); + + ulong seed = 0; + + var state = accelerator.Allocate1D(4); + var tmpState = new ulong[4]; + tmpState[0] = seed + Primes64[0] + Primes64[1]; + tmpState[1] = seed + Primes64[1]; + tmpState[2] = seed; + tmpState[3] = seed - Primes64[0]; + state.CopyFromCPU(tmpState); + + if (initialSize > 0) + { + var transformKernal = accelerator.LoadAutoGroupedStreamKernel, ArrayView, int>(TransformByteGroupsInternal); + transformKernal(new Index1D(4), state.View, gpuData.View, initialSize); + } + + var cpuData = new ulong[4]; + state.View.CopyToCPU(cpuData); + return FinalizeHashValueInternal(cpuData, data.AsSpan(initialSize..), (ulong)initialSize); + } + + private static void TransformByteGroupsInternal(Index1D index, ArrayView state, ArrayView dataIn, int size) + { + var data = dataIn.Cast(); + var temp = state[index.X]; + + var tempPrime0 = Prime0; + var tempPrime1 = Prime1; + + for (var idx = index.X; idx < data.Length; idx += 4) + { + temp += data[idx] * tempPrime1; + temp = RotateLeft(temp, 31); + temp *= tempPrime0; + } + + state[index.X] = temp; + } + + private static ulong FinalizeHashValueInternal(ulong[] hashState, ReadOnlySpan data, ulong bytesProcessed) + { + ulong hashValue; + { + if (bytesProcessed > 0) + { + var tempA = hashState[0]; + var tempB = hashState[1]; + var tempC = hashState[2]; + var tempD = hashState[3]; + + + hashValue = RotateLeft(tempA, 1) + RotateLeft(tempB, 7) + RotateLeft(tempC, 12) + RotateLeft(tempD, 18); + + // A + tempA *= Primes64[1]; + tempA = RotateLeft(tempA, 31); + tempA *= Primes64[0]; + + hashValue ^= tempA; + hashValue = hashValue * Primes64[0] + Primes64[3]; + + // B + tempB *= Primes64[1]; + tempB = RotateLeft(tempB, 31); + tempB *= Primes64[0]; + + hashValue ^= tempB; + hashValue = hashValue * Primes64[0] + Primes64[3]; + + // C + tempC *= Primes64[1]; + tempC = RotateLeft(tempC, 31); + tempC *= Primes64[0]; + + hashValue ^= tempC; + hashValue = hashValue * Primes64[0] + Primes64[3]; + + // D + tempD *= Primes64[1]; + tempD = RotateLeft(tempD, 31); + tempD *= Primes64[0]; + + hashValue ^= tempD; + hashValue = hashValue * Primes64[0] + Primes64[3]; + } + else + { + hashValue = Seed + Primes64[4]; + } + } + + var remainderLength = data.Length; + + hashValue += bytesProcessed + (ulong) remainderLength; + + if (remainderLength > 0) + { + // In 8-byte chunks, process all full chunks + for (var x = 0; x < data.Length / 8; ++x) + { + hashValue ^= RotateLeft(BitConverter.ToUInt64(data[(x * 8)..]) * Primes64[1], 31) * Primes64[0]; + hashValue = RotateLeft(hashValue, 27) * Primes64[0] + Primes64[3]; + } + + // Process a 4-byte chunk if it exists + if (remainderLength % 8 >= 4) + { + var startOffset = remainderLength - remainderLength % 8; + + hashValue ^= BitConverter.ToUInt32(data[startOffset..]) * Primes64[0]; + hashValue = RotateLeft(hashValue, 23) * Primes64[1] + Primes64[2]; + } + + // Process last 4 bytes in 1-byte chunks (only runs if data.Length % 4 != 0) + { + var startOffset = remainderLength - remainderLength % 4; + var endOffset = remainderLength; + + for (var currentOffset = startOffset; currentOffset < endOffset; currentOffset += 1) + { + hashValue ^= data[currentOffset] * Primes64[4]; + hashValue = RotateLeft(hashValue, 11) * Primes64[0]; + } + } + } + + hashValue ^= hashValue >> 33; + hashValue *= Primes64[1]; + hashValue ^= hashValue >> 29; + hashValue *= Primes64[2]; + hashValue ^= hashValue >> 32; + + return hashValue; + } + + private static ulong RotateLeft(ulong operand, int shiftCount) + { + shiftCount &= 0x3f; + + return + (operand << shiftCount) | + (operand >> (64 - shiftCount)); + } +} \ No newline at end of file diff --git a/Wabbajack.Hashing.xxHash64.GPU/AlgorithmState.cs b/Wabbajack.Hashing.xxHash64.GPU/AlgorithmState.cs new file mode 100644 index 00000000..368adc10 --- /dev/null +++ b/Wabbajack.Hashing.xxHash64.GPU/AlgorithmState.cs @@ -0,0 +1,40 @@ +namespace Wabbajack.Hashing.xxHash64.GPU; + +public struct AlgorithmState +{ + private static readonly IReadOnlyList Primes64 = + new[] + { + 11400714785074694791UL, + 14029467366897019727UL, + 1609587929392839161UL, + 9650029242287828579UL, + 2870177450012600261UL + }; + + private const ulong Prime0 = 11400714785074694791UL; + private const ulong Prime1 = 14029467366897019727UL; + private const ulong Prime2 = 1609587929392839161UL; + private const ulong Prime3 = 9650029242287828579UL; + private const ulong Prime4 = 2870177450012600261UL; + + + internal readonly ulong Seed; + + internal ulong A; + internal ulong B; + internal ulong C; + internal ulong D; + + internal ulong BytesProcessed; + + public AlgorithmState(ulong seed) + { + Seed = seed; + A = Seed + Primes64[0] + Primes64[1]; + B = Seed + Primes64[1]; + C = Seed; + D = Seed - Primes64[0]; + BytesProcessed = 0; + } +} \ No newline at end of file diff --git a/Wabbajack.Hashing.xxHash64.GPU/Wabbajack.Hashing.xxHash64.GPU.csproj b/Wabbajack.Hashing.xxHash64.GPU/Wabbajack.Hashing.xxHash64.GPU.csproj new file mode 100644 index 00000000..73f20ea0 --- /dev/null +++ b/Wabbajack.Hashing.xxHash64.GPU/Wabbajack.Hashing.xxHash64.GPU.csproj @@ -0,0 +1,13 @@ + + + + net6.0 + enable + enable + + + + + + + diff --git a/Wabbajack.Hashing.xxHash64.Test/GPUTests.cs b/Wabbajack.Hashing.xxHash64.Test/GPUTests.cs new file mode 100644 index 00000000..189b0dff --- /dev/null +++ b/Wabbajack.Hashing.xxHash64.Test/GPUTests.cs @@ -0,0 +1,39 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using ILGPU; +using ILGPU.Runtime; +using Wabbajack.Hashing.xxHash64.GPU; +using Xunit; + +namespace Wabbajack.Hashing.xxHash64.Test; + +public class GPUTests +{ + static GPUTests() + { + CurrentContext = Context.Create(b => b.Default().StaticFields(StaticFieldMode.MutableStaticFields | StaticFieldMode.IgnoreStaticFieldStores)); + } + + [Theory] + [MemberData(nameof(Accelerators))] + public void CanHashData(Accelerator acc, byte[] data) + { + var result = Algorithm.HashBytes(acc, data); + Assert.Equal(Hash.FromBase64("vBY6OyblpIw="), Hash.FromULong(result)); + } + + + public static Context CurrentContext { get; set; } + + + public static IEnumerable Accelerators() + { + var random = new Random(42); + var data = new byte[1024 * 1024 * 1024]; + random.NextBytes(data); + return CurrentContext.Devices.Select(c => { return new object[] {c.CreateAccelerator(CurrentContext), data}; }); + } +} \ No newline at end of file diff --git a/Wabbajack.Hashing.xxHash64.Test/Wabbajack.Hashing.xxHash64.Test.csproj b/Wabbajack.Hashing.xxHash64.Test/Wabbajack.Hashing.xxHash64.Test.csproj index d474e30b..bd87e3e4 100644 --- a/Wabbajack.Hashing.xxHash64.Test/Wabbajack.Hashing.xxHash64.Test.csproj +++ b/Wabbajack.Hashing.xxHash64.Test/Wabbajack.Hashing.xxHash64.Test.csproj @@ -23,6 +23,7 @@ + diff --git a/Wabbajack.sln b/Wabbajack.sln index a52c3c6f..397fdf7c 100644 --- a/Wabbajack.sln +++ b/Wabbajack.sln @@ -143,6 +143,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.App.Wpf", "Wabbaj EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.VFS.Interfaces", "Wabbajack.VFS.Interfaces\Wabbajack.VFS.Interfaces.csproj", "{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.Hashing.xxHash64.GPU", "Wabbajack.Hashing.xxHash64.GPU\Wabbajack.Hashing.xxHash64.GPU.csproj", "{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -393,6 +395,10 @@ Global {E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Debug|Any CPU.Build.0 = Debug|Any CPU {E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Release|Any CPU.ActiveCfg = Release|Any CPU {E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Release|Any CPU.Build.0 = Release|Any CPU + {7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Debug|Any CPU.Build.0 = Debug|Any CPU + {7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Release|Any CPU.ActiveCfg = Release|Any CPU + {7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -442,6 +448,7 @@ Global {B10BB6D6-B3FC-4A76-8A07-6A0A0ADDE198} = {98B731EE-4FC0-4482-A069-BCBA25497871} {7FC4F129-F0FA-46B7-B7C4-532E371A6326} = {98B731EE-4FC0-4482-A069-BCBA25497871} {E4BDB22D-11A4-452F-8D10-D9CA9777EA22} = {F677890D-5109-43BC-97C7-C4CD47C8EE0C} + {7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10} = {B953DCDB-6D18-483F-BC38-1E4B1D3E12B5} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {0AA30275-0F38-4A7D-B645-F5505178DDE8}