mirror of
https://github.com/wabbajack-tools/wabbajack.git
synced 2024-08-30 18:42:17 +00:00
Some failed attempts and seeing if we could make hashing go faster on the GPU
This commit is contained in:
parent
25041ab5b3
commit
95f7a6b335
@ -2,6 +2,11 @@
|
|||||||
using System.Data.HashFunction.xxHash;
|
using System.Data.HashFunction.xxHash;
|
||||||
using BenchmarkDotNet.Attributes;
|
using BenchmarkDotNet.Attributes;
|
||||||
using BenchmarkDotNet.Running;
|
using BenchmarkDotNet.Running;
|
||||||
|
using ILGPU;
|
||||||
|
using ILGPU.Runtime;
|
||||||
|
using ILGPU.Runtime.Cuda;
|
||||||
|
using ILGPU.Runtime.OpenCL;
|
||||||
|
using Wabbajack.Hashing.xxHash64.GPU;
|
||||||
|
|
||||||
namespace Wabbajack.Hashing.xxHash64.Benchmark;
|
namespace Wabbajack.Hashing.xxHash64.Benchmark;
|
||||||
|
|
||||||
@ -9,7 +14,7 @@ internal class Program
|
|||||||
{
|
{
|
||||||
private static void Main(string[] args)
|
private static void Main(string[] args)
|
||||||
{
|
{
|
||||||
BenchmarkRunner.Run<Base64EncoderBenchmark>();
|
BenchmarkRunner.Run<xxHashBenchmark>();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -18,10 +23,17 @@ internal class Program
|
|||||||
public class xxHashBenchmark
|
public class xxHashBenchmark
|
||||||
{
|
{
|
||||||
private readonly byte[] _data;
|
private readonly byte[] _data;
|
||||||
|
private readonly Context _context;
|
||||||
|
private readonly Accelerator _gpu;
|
||||||
|
private readonly Accelerator _cpu;
|
||||||
|
|
||||||
public xxHashBenchmark()
|
public xxHashBenchmark()
|
||||||
{
|
{
|
||||||
_data = new byte[1024 * 1024];
|
_data = new byte[1024 * 1024 * 1024];
|
||||||
|
|
||||||
|
_context = Context.CreateDefault();
|
||||||
|
_gpu = _context.GetPreferredDevice(false).CreateAccelerator(_context);
|
||||||
|
_cpu = _context.GetPreferredDevice(true).CreateAccelerator(_context);
|
||||||
}
|
}
|
||||||
|
|
||||||
[Benchmark]
|
[Benchmark]
|
||||||
@ -37,6 +49,18 @@ public class xxHashBenchmark
|
|||||||
var config = new xxHashConfig {HashSizeInBits = 64};
|
var config = new xxHashConfig {HashSizeInBits = 64};
|
||||||
BitConverter.ToUInt64(xxHashFactory.Instance.Create(config).ComputeHash(_data).Hash);
|
BitConverter.ToUInt64(xxHashFactory.Instance.Create(config).ComputeHash(_data).Hash);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Benchmarks <c>Algorithm.HashBytes</c> over the shared data buffer using the <c>_gpu</c> accelerator.
/// </summary>
/// <returns>The computed hash, returned so the measured work has an observable result.</returns>
[Benchmark]
public ulong GPUCode()
{
    return Algorithm.HashBytes(_gpu, _data);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Benchmarks <c>Algorithm.HashBytes</c> over the shared data buffer using the <c>_cpu</c> accelerator.
/// </summary>
/// <returns>The computed hash, returned so the measured work has an observable result.</returns>
[Benchmark]
public ulong CPUCode()
{
    return Algorithm.HashBytes(_cpu, _data);
}
|
||||||
}
|
}
|
||||||
|
|
||||||
[MemoryDiagnoser]
|
[MemoryDiagnoser]
|
||||||
|
@ -8,12 +8,13 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="BenchmarkDotNet" Version="0.13.1"/>
|
<PackageReference Include="BenchmarkDotNet" Version="0.13.1" />
|
||||||
<PackageReference Include="System.Data.HashFunction.xxHash" Version="2.0.0"/>
|
<PackageReference Include="System.Data.HashFunction.xxHash" Version="2.0.0" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64\Wabbajack.Hashing.xxHash64.csproj"/>
|
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64.GPU\Wabbajack.Hashing.xxHash64.GPU.csproj" />
|
||||||
|
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64\Wabbajack.Hashing.xxHash64.csproj" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
181
Wabbajack.Hashing.xxHash64.GPU/Algorithm.cs
Normal file
181
Wabbajack.Hashing.xxHash64.GPU/Algorithm.cs
Normal file
@ -0,0 +1,181 @@
|
|||||||
|
using ILGPU;
|
||||||
|
using ILGPU.Runtime;
|
||||||
|
using ILGPU.Util;
|
||||||
|
|
||||||
|
namespace Wabbajack.Hashing.xxHash64.GPU;
|
||||||
|
|
||||||
|
/// <summary>
/// Experimental xxHash64 (seed 0) implementation: the bulk of the input is consumed in
/// 32-byte stripes by an ILGPU kernel, while the trailing bytes and the final mixing
/// steps are computed on the host.
/// </summary>
public class Algorithm
{
    // The five 64-bit primes used by the hash. They are compile-time constants; the
    // kernel below references only these constants and never a static array.
    private const ulong Prime0 = 11400714785074694791UL;
    private const ulong Prime1 = 14029467366897019727UL;
    private const ulong Prime2 = 1609587929392839161UL;
    private const ulong Prime3 = 9650029242287828579UL;
    private const ulong Prime4 = 2870177450012600261UL;

    private const ulong Seed = 0UL;

    // Number of independent 64-bit accumulators, and the bytes one stripe feeds them.
    private const int LaneCount = 4;
    private const int StripeSize = LaneCount * sizeof(ulong);

    /// <summary>
    /// Hashes <paramref name="data"/> using <see cref="Accelerator.Current"/>.
    /// </summary>
    /// <param name="data">Bytes to hash.</param>
    /// <returns>The 64-bit hash value.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="data"/> is null, or there is no current accelerator.
    /// </exception>
    public ulong HashBytes(byte[] data)
    {
        return HashBytes(Accelerator.Current, data);
    }

    /// <summary>
    /// Hashes <paramref name="data"/>, running the stripe loop on <paramref name="accelerator"/>.
    /// </summary>
    /// <param name="accelerator">Accelerator that executes the stripe kernel.</param>
    /// <param name="data">Bytes to hash; any length, including empty.</param>
    /// <returns>The 64-bit hash value.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static ulong HashBytes(Accelerator accelerator, byte[] data)
    {
        ArgumentNullException.ThrowIfNull(accelerator);
        ArgumentNullException.ThrowIfNull(data);

        // Longest prefix made of whole stripes; the rest is handled on the host.
        var stripedLength = data.Length - data.Length % StripeSize;

        var lanes = new ulong[LaneCount];
        unchecked
        {
            // Initial accumulator values; these intentionally wrap around.
            lanes[0] = Seed + Prime0 + Prime1;
            lanes[1] = Seed + Prime1;
            lanes[2] = Seed;
            lanes[3] = Seed - Prime0;
        }

        // The lanes are only consulted when at least one stripe was processed, so inputs
        // shorter than a stripe never need to touch the accelerator.
        if (stripedLength > 0)
        {
            // The device buffer holds the whole array so the full-array copy always fits;
            // the kernel is handed only the stripe-aligned prefix.
            using var gpuData = accelerator.Allocate1D<byte>(data.Length);
            using var state = accelerator.Allocate1D<ulong>(LaneCount);

            gpuData.CopyFromCPU(data);
            state.CopyFromCPU(lanes);

            ArrayView<byte> allBytes = gpuData.View;
            var transformKernel = accelerator.LoadAutoGroupedStreamKernel<Index1D, ArrayView<ulong>, ArrayView<byte>, int>(TransformByteGroupsInternal);
            transformKernel(new Index1D(LaneCount), state.View, allBytes.SubView(0, stripedLength), stripedLength);

            // Wait for the kernel before reading the accumulators back.
            accelerator.Synchronize();
            state.View.CopyToCPU(lanes);
        }

        return FinalizeHashValueInternal(lanes, data.AsSpan(stripedLength), (ulong) stripedLength);
    }

    /// <summary>
    /// Kernel body: thread <c>index.X</c> owns accumulator <c>state[index.X]</c> and folds in
    /// every <see cref="LaneCount"/>-th 64-bit word of <paramref name="dataIn"/>, starting at
    /// word <c>index.X</c>.
    /// </summary>
    /// <param name="index">Thread index; selects the lane.</param>
    /// <param name="state">The four lane accumulators, updated in place.</param>
    /// <param name="dataIn">Input bytes, reinterpreted as 64-bit words.</param>
    /// <param name="size">Byte length passed by the caller; not read by the kernel.</param>
    private static void TransformByteGroupsInternal(Index1D index, ArrayView<ulong> state, ArrayView<byte> dataIn, int size)
    {
        var words = dataIn.Cast<ulong>();
        var lane = state[index.X];

        unchecked
        {
            for (var idx = index.X; idx < words.Length; idx += LaneCount)
            {
                lane += words[idx] * Prime1;
                lane = RotateLeft(lane, 31);
                lane *= Prime0;
            }
        }

        state[index.X] = lane;
    }

    /// <summary>
    /// Combines the lane accumulators, mixes in the unprocessed tail and applies the
    /// final bit-mixing steps.
    /// </summary>
    /// <param name="hashState">The four lane accumulators; ignored when <paramref name="bytesProcessed"/> is zero.</param>
    /// <param name="data">Tail bytes that were not consumed by the stripe loop.</param>
    /// <param name="bytesProcessed">Number of bytes already folded into <paramref name="hashState"/>.</param>
    /// <returns>The finished 64-bit hash value.</returns>
    private static ulong FinalizeHashValueInternal(ulong[] hashState, ReadOnlySpan<byte> data, ulong bytesProcessed)
    {
        unchecked
        {
            ulong hashValue;
            if (bytesProcessed > 0)
            {
                hashValue = RotateLeft(hashState[0], 1) + RotateLeft(hashState[1], 7) +
                            RotateLeft(hashState[2], 12) + RotateLeft(hashState[3], 18);

                for (var lane = 0; lane < LaneCount; lane++)
                {
                    hashValue = MergeLane(hashValue, hashState[lane]);
                }
            }
            else
            {
                hashValue = Seed + Prime4;
            }

            // Total input length = bytes handled by the stripe loop + tail bytes.
            hashValue += bytesProcessed + (ulong) data.Length;

            var offset = 0;

            // All complete 8-byte chunks of the tail.
            for (; offset + sizeof(ulong) <= data.Length; offset += sizeof(ulong))
            {
                hashValue ^= RotateLeft(BitConverter.ToUInt64(data[offset..]) * Prime1, 31) * Prime0;
                hashValue = RotateLeft(hashValue, 27) * Prime0 + Prime3;
            }

            // At most one 4-byte chunk can remain after the 8-byte chunks.
            if (offset + sizeof(uint) <= data.Length)
            {
                hashValue ^= BitConverter.ToUInt32(data[offset..]) * Prime0;
                hashValue = RotateLeft(hashValue, 23) * Prime1 + Prime2;
                offset += sizeof(uint);
            }

            // Whatever is left (0-3 bytes), one byte at a time.
            for (; offset < data.Length; offset++)
            {
                hashValue ^= data[offset] * Prime4;
                hashValue = RotateLeft(hashValue, 11) * Prime0;
            }

            // Final mixing.
            hashValue ^= hashValue >> 33;
            hashValue *= Prime1;
            hashValue ^= hashValue >> 29;
            hashValue *= Prime2;
            hashValue ^= hashValue >> 32;

            return hashValue;
        }
    }

    /// <summary>Folds one lane accumulator into the combined hash value.</summary>
    private static ulong MergeLane(ulong hashValue, ulong lane)
    {
        unchecked
        {
            lane *= Prime1;
            lane = RotateLeft(lane, 31);
            lane *= Prime0;

            hashValue ^= lane;
            return hashValue * Prime0 + Prime3;
        }
    }

    /// <summary>
    /// Rotates <paramref name="operand"/> left by <paramref name="shiftCount"/> bits
    /// (the count is reduced modulo 64).
    /// </summary>
    private static ulong RotateLeft(ulong operand, int shiftCount)
    {
        shiftCount &= 0x3f;

        return
            (operand << shiftCount) |
            (operand >> (64 - shiftCount));
    }
}
|
40
Wabbajack.Hashing.xxHash64.GPU/AlgorithmState.cs
Normal file
40
Wabbajack.Hashing.xxHash64.GPU/AlgorithmState.cs
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
namespace Wabbajack.Hashing.xxHash64.GPU;
|
||||||
|
|
||||||
|
/// <summary>
/// Mutable running state of an xxHash64 computation: the seed, four accumulators
/// (<see cref="A"/>..<see cref="D"/>) and the number of bytes consumed so far.
/// </summary>
public struct AlgorithmState
{
    // Only the first two hash primes are needed to derive the initial accumulators.
    private const ulong Prime0 = 11400714785074694791UL;
    private const ulong Prime1 = 14029467366897019727UL;

    internal readonly ulong Seed;

    internal ulong A;
    internal ulong B;
    internal ulong C;
    internal ulong D;

    internal ulong BytesProcessed;

    /// <summary>
    /// Creates the initial state for the given seed with no bytes processed.
    /// </summary>
    /// <param name="seed">Seed value mixed into the four accumulators.</param>
    public AlgorithmState(ulong seed)
    {
        Seed = seed;

        // The initial values rely on 64-bit wrap-around, so overflow checking is
        // disabled explicitly rather than depending on project settings.
        unchecked
        {
            A = seed + Prime0 + Prime1;
            B = seed + Prime1;
            C = seed;
            D = seed - Prime0;
        }

        BytesProcessed = 0;
    }
}
|
@ -0,0 +1,13 @@
|
|||||||
|
<Project Sdk="Microsoft.NET.Sdk">
|
||||||
|
|
||||||
|
<PropertyGroup>
|
||||||
|
<TargetFramework>net6.0</TargetFramework>
|
||||||
|
<ImplicitUsings>enable</ImplicitUsings>
|
||||||
|
<Nullable>enable</Nullable>
|
||||||
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<PackageReference Include="ILGPU" Version="1.2.0" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
</Project>
|
39
Wabbajack.Hashing.xxHash64.Test/GPUTests.cs
Normal file
39
Wabbajack.Hashing.xxHash64.Test/GPUTests.cs
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using ILGPU;
|
||||||
|
using ILGPU.Runtime;
|
||||||
|
using Wabbajack.Hashing.xxHash64.GPU;
|
||||||
|
using Xunit;
|
||||||
|
|
||||||
|
namespace Wabbajack.Hashing.xxHash64.Test;
|
||||||
|
|
||||||
|
/// <summary>
/// Runs <c>Algorithm.HashBytes</c> on an accelerator created from every device in the
/// shared ILGPU context and checks the result against a fixed expected hash.
/// </summary>
public class GPUTests
{
    static GPUTests()
    {
        CurrentContext = Context.Create(b => b.Default().StaticFields(StaticFieldMode.MutableStaticFields | StaticFieldMode.IgnoreStaticFieldStores));
    }

    /// <summary>
    /// Hashes <paramref name="data"/> on <paramref name="acc"/> and compares with the expected value.
    /// </summary>
    /// <param name="acc">Accelerator produced by <see cref="Accelerators"/>; disposed when the row finishes.</param>
    /// <param name="data">Input bytes produced by <see cref="Accelerators"/>.</param>
    [Theory]
    [MemberData(nameof(Accelerators))]
    public void CanHashData(Accelerator acc, byte[] data)
    {
        // Each data row carries an accelerator created just for it, so release it here
        // instead of leaking one accelerator per device.
        using (acc)
        {
            var result = Algorithm.HashBytes(acc, data);

            Assert.Equal(Hash.FromBase64("vBY6OyblpIw="), Hash.FromULong(result));
        }
    }

    /// <summary>ILGPU context shared by all rows; created once in the static constructor.</summary>
    public static Context CurrentContext { get; set; }

    /// <summary>
    /// Theory data: one row per device in <see cref="CurrentContext"/>, each pairing a new
    /// accelerator with the same pseudo-random buffer.
    /// </summary>
    public static IEnumerable<object[]> Accelerators()
    {
        // The expected hash in CanHashData is tied to exactly this seed and buffer size,
        // so neither may change without updating the expected value.
        var random = new Random(42);
        var data = new byte[1024 * 1024 * 1024];
        random.NextBytes(data);

        return CurrentContext.Devices.Select(device => new object[] {device.CreateAccelerator(CurrentContext), data});
    }
}
|
@ -23,6 +23,7 @@
|
|||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64.GPU\Wabbajack.Hashing.xxHash64.GPU.csproj" />
|
||||||
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64\Wabbajack.Hashing.xxHash64.csproj" />
|
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64\Wabbajack.Hashing.xxHash64.csproj" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
|
@ -143,6 +143,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.App.Wpf", "Wabbaj
|
|||||||
EndProject
|
EndProject
|
||||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.VFS.Interfaces", "Wabbajack.VFS.Interfaces\Wabbajack.VFS.Interfaces.csproj", "{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}"
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.VFS.Interfaces", "Wabbajack.VFS.Interfaces\Wabbajack.VFS.Interfaces.csproj", "{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}"
|
||||||
EndProject
|
EndProject
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.Hashing.xxHash64.GPU", "Wabbajack.Hashing.xxHash64.GPU\Wabbajack.Hashing.xxHash64.GPU.csproj", "{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}"
|
||||||
|
EndProject
|
||||||
Global
|
Global
|
||||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
Debug|Any CPU = Debug|Any CPU
|
Debug|Any CPU = Debug|Any CPU
|
||||||
@ -393,6 +395,10 @@ Global
|
|||||||
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Release|Any CPU.Build.0 = Release|Any CPU
|
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
GlobalSection(SolutionProperties) = preSolution
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
HideSolutionNode = FALSE
|
HideSolutionNode = FALSE
|
||||||
@ -442,6 +448,7 @@ Global
|
|||||||
{B10BB6D6-B3FC-4A76-8A07-6A0A0ADDE198} = {98B731EE-4FC0-4482-A069-BCBA25497871}
|
{B10BB6D6-B3FC-4A76-8A07-6A0A0ADDE198} = {98B731EE-4FC0-4482-A069-BCBA25497871}
|
||||||
{7FC4F129-F0FA-46B7-B7C4-532E371A6326} = {98B731EE-4FC0-4482-A069-BCBA25497871}
|
{7FC4F129-F0FA-46B7-B7C4-532E371A6326} = {98B731EE-4FC0-4482-A069-BCBA25497871}
|
||||||
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22} = {F677890D-5109-43BC-97C7-C4CD47C8EE0C}
|
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22} = {F677890D-5109-43BC-97C7-C4CD47C8EE0C}
|
||||||
|
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10} = {B953DCDB-6D18-483F-BC38-1E4B1D3E12B5}
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||||
SolutionGuid = {0AA30275-0F38-4A7D-B645-F5505178DDE8}
|
SolutionGuid = {0AA30275-0F38-4A7D-B645-F5505178DDE8}
|
||||||
|
Loading…
Reference in New Issue
Block a user