mirror of
https://github.com/wabbajack-tools/wabbajack.git
synced 2024-08-30 18:42:17 +00:00
Some failed attempts and seeing if we could make hashing go faster on the GPU
This commit is contained in:
parent
25041ab5b3
commit
95f7a6b335
@ -2,6 +2,11 @@
|
||||
using System.Data.HashFunction.xxHash;
|
||||
using BenchmarkDotNet.Attributes;
|
||||
using BenchmarkDotNet.Running;
|
||||
using ILGPU;
|
||||
using ILGPU.Runtime;
|
||||
using ILGPU.Runtime.Cuda;
|
||||
using ILGPU.Runtime.OpenCL;
|
||||
using Wabbajack.Hashing.xxHash64.GPU;
|
||||
|
||||
namespace Wabbajack.Hashing.xxHash64.Benchmark;
|
||||
|
||||
@ -9,7 +14,7 @@ internal class Program
|
||||
{
|
||||
private static void Main(string[] args)
|
||||
{
|
||||
BenchmarkRunner.Run<Base64EncoderBenchmark>();
|
||||
BenchmarkRunner.Run<xxHashBenchmark>();
|
||||
}
|
||||
}
|
||||
|
||||
@ -18,10 +23,17 @@ internal class Program
|
||||
public class xxHashBenchmark
|
||||
{
|
||||
private readonly byte[] _data;
|
||||
private readonly Context _context;
|
||||
private readonly Accelerator _gpu;
|
||||
private readonly Accelerator _cpu;
|
||||
|
||||
public xxHashBenchmark()
|
||||
{
|
||||
_data = new byte[1024 * 1024];
|
||||
_data = new byte[1024 * 1024 * 1024];
|
||||
|
||||
_context = Context.CreateDefault();
|
||||
_gpu = _context.GetPreferredDevice(false).CreateAccelerator(_context);
|
||||
_cpu = _context.GetPreferredDevice(true).CreateAccelerator(_context);
|
||||
}
|
||||
|
||||
[Benchmark]
|
||||
@ -37,6 +49,18 @@ public class xxHashBenchmark
|
||||
var config = new xxHashConfig {HashSizeInBits = 64};
|
||||
BitConverter.ToUInt64(xxHashFactory.Instance.Create(config).ComputeHash(_data).Hash);
|
||||
}
|
||||
|
||||
/// <summary>
/// Benchmarks <c>Algorithm.HashBytes</c> over the shared <c>_data</c> buffer using the
/// <c>_gpu</c> accelerator (the one created from <c>GetPreferredDevice(false)</c> in the constructor).
/// The resulting hash is discarded.
/// </summary>
[Benchmark]
public void GPUCode() => Algorithm.HashBytes(_gpu, _data);
|
||||
|
||||
/// <summary>
/// Benchmarks <c>Algorithm.HashBytes</c> over the shared <c>_data</c> buffer using the
/// <c>_cpu</c> accelerator (the one created from <c>GetPreferredDevice(true)</c> in the constructor).
/// The resulting hash is discarded.
/// </summary>
[Benchmark]
public void CPUCode() => Algorithm.HashBytes(_cpu, _data);
|
||||
}
|
||||
|
||||
[MemoryDiagnoser]
|
||||
|
@ -8,12 +8,13 @@
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="BenchmarkDotNet" Version="0.13.1"/>
|
||||
<PackageReference Include="System.Data.HashFunction.xxHash" Version="2.0.0"/>
|
||||
<PackageReference Include="BenchmarkDotNet" Version="0.13.1" />
|
||||
<PackageReference Include="System.Data.HashFunction.xxHash" Version="2.0.0" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64\Wabbajack.Hashing.xxHash64.csproj"/>
|
||||
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64.GPU\Wabbajack.Hashing.xxHash64.GPU.csproj" />
|
||||
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64\Wabbajack.Hashing.xxHash64.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
181
Wabbajack.Hashing.xxHash64.GPU/Algorithm.cs
Normal file
181
Wabbajack.Hashing.xxHash64.GPU/Algorithm.cs
Normal file
@ -0,0 +1,181 @@
|
||||
using ILGPU;
|
||||
using ILGPU.Runtime;
|
||||
using ILGPU.Util;
|
||||
|
||||
namespace Wabbajack.Hashing.xxHash64.GPU;
|
||||
|
||||
/// <summary>
/// xxHash64 (fixed seed 0) whose 32-byte stripe phase runs as an ILGPU kernel on a
/// caller-supplied accelerator; the trailing bytes and the final mixing run on the host.
/// </summary>
public class Algorithm
{
    private static readonly ulong[] Primes64 =
    {
        11400714785074694791UL,
        14029467366897019727UL,
        1609587929392839161UL,
        9650029242287828579UL,
        2870177450012600261UL
    };

    // Compile-time copies of Primes64[0..4]. The kernel reads Prime0/Prime1 so that it
    // does not have to touch the static array.
    private const ulong Prime0 = 11400714785074694791UL;
    private const ulong Prime1 = 14029467366897019727UL;
    private const ulong Prime2 = 1609587929392839161UL;
    private const ulong Prime3 = 9650029242287828579UL;
    private const ulong Prime4 = 2870177450012600261UL;

    private const ulong Seed = 0L;

    // Number of 64-bit accumulator lanes; one kernel thread is launched per lane and
    // one stripe is LaneCount * 8 = 32 bytes.
    private const int LaneCount = 4;

    /// <summary>
    /// Hashes <paramref name="data"/> on the accelerator returned by <c>Accelerator.Current</c>.
    /// </summary>
    /// <param name="data">Bytes to hash.</param>
    /// <returns>The 64-bit hash value.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="data"/> is null, or <c>Accelerator.Current</c> returned null.
    /// </exception>
    public ulong HashBytes(byte[] data)
    {
        // NOTE(review): presumably Accelerator.Current is the accelerator bound to the
        // calling thread and may be null when none is bound - confirm against ILGPU docs.
        return HashBytes(Accelerator.Current, data);
    }

    /// <summary>
    /// Hashes <paramref name="data"/>. The largest prefix whose length is a multiple of
    /// 32 bytes is processed by a kernel on <paramref name="accelerator"/>; the remaining
    /// 0-31 bytes and the final mixing are processed on the host.
    /// </summary>
    /// <param name="accelerator">Accelerator used for the stripe phase.</param>
    /// <param name="data">Bytes to hash.</param>
    /// <returns>The 64-bit hash value.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static ulong HashBytes(Accelerator accelerator, byte[] data)
    {
        ArgumentNullException.ThrowIfNull(accelerator);
        ArgumentNullException.ThrowIfNull(data);

        // Length of the prefix made of whole 32-byte stripes.
        var initialSize = (data.Length >> 5) << 5;

        // Kept in a non-const local so the wrapping sums below are not compile-time
        // constant expressions (which would be rejected for overflowing).
        ulong seed = Seed;

        var laneState = new ulong[LaneCount];
        unchecked
        {
            laneState[0] = seed + Primes64[0] + Primes64[1];
            laneState[1] = seed + Primes64[1];
            laneState[2] = seed;
            laneState[3] = seed - Primes64[0];
        }

        // Inputs shorter than one stripe never need the device; finalization ignores
        // the lane state when no stripe bytes were processed.
        if (initialSize > 0)
        {
            // The buffer holds the whole array so the array-to-buffer copy has matching
            // lengths; the kernel is only handed the stripe-aligned prefix of it.
            using var gpuData = accelerator.Allocate1D<byte>(data.Length);
            gpuData.CopyFromCPU(data);

            using var state = accelerator.Allocate1D<ulong>(LaneCount);
            state.CopyFromCPU(laneState);

            ArrayView<byte> deviceBytes = gpuData.View;

            var transformKernel = accelerator.LoadAutoGroupedStreamKernel<Index1D, ArrayView<ulong>, ArrayView<byte>, int>(TransformByteGroupsInternal);
            transformKernel(new Index1D(LaneCount), state.View, deviceBytes.SubView(0, initialSize), initialSize);

            state.View.CopyToCPU(laneState);
        }

        return FinalizeHashValueInternal(laneState, data.AsSpan(initialSize..), (ulong)initialSize);
    }

    /// <summary>
    /// Kernel body: thread <paramref name="index"/> owns accumulator lane <c>index.X</c> and
    /// folds every fourth 64-bit word of the input, starting at word <c>index.X</c>, into it.
    /// </summary>
    /// <param name="index">Lane index of this thread (launched with extent 4).</param>
    /// <param name="state">The four lane accumulators; read at start, written back at end.</param>
    /// <param name="dataIn">Input bytes, reinterpreted as 64-bit words.</param>
    /// <param name="size">Byte length supplied by the host; not read by the kernel body.</param>
    private static void TransformByteGroupsInternal(Index1D index, ArrayView<ulong> state, ArrayView<byte> dataIn, int size)
    {
        var data = dataIn.Cast<ulong>();
        var temp = state[index.X];

        var tempPrime0 = Prime0;
        var tempPrime1 = Prime1;

        unchecked
        {
            for (var idx = index.X; idx < data.Length; idx += LaneCount)
            {
                temp += data[idx] * tempPrime1;
                temp = RotateLeft(temp, 31);
                temp *= tempPrime0;
            }
        }

        state[index.X] = temp;
    }

    /// <summary>
    /// Combines the four lane accumulators (when any stripe bytes were processed), mixes in
    /// the total length and the trailing bytes, and applies the final shift/multiply mixing.
    /// </summary>
    /// <param name="hashState">The four lane accumulators.</param>
    /// <param name="data">Trailing bytes that were not covered by the stripe phase.</param>
    /// <param name="bytesProcessed">Number of bytes already consumed by the stripe phase.</param>
    /// <returns>The finished 64-bit hash value.</returns>
    private static ulong FinalizeHashValueInternal(ulong[] hashState, ReadOnlySpan<byte> data, ulong bytesProcessed)
    {
        // All arithmetic here is modulo 2^64 by design.
        unchecked
        {
            ulong hashValue;

            if (bytesProcessed > 0)
            {
                hashValue = RotateLeft(hashState[0], 1) + RotateLeft(hashState[1], 7) +
                            RotateLeft(hashState[2], 12) + RotateLeft(hashState[3], 18);

                for (var lane = 0; lane < LaneCount; lane++)
                {
                    hashValue = MergeLane(hashValue, hashState[lane]);
                }
            }
            else
            {
                hashValue = Seed + Primes64[4];
            }

            var remainderLength = data.Length;
            hashValue += bytesProcessed + (ulong) remainderLength;

            var offset = 0;

            // Whole 8-byte chunks.
            for (; remainderLength - offset >= 8; offset += 8)
            {
                hashValue ^= RotateLeft(BitConverter.ToUInt64(data[offset..]) * Primes64[1], 31) * Primes64[0];
                hashValue = RotateLeft(hashValue, 27) * Primes64[0] + Primes64[3];
            }

            // At most one 4-byte chunk.
            if (remainderLength - offset >= 4)
            {
                hashValue ^= BitConverter.ToUInt32(data[offset..]) * Primes64[0];
                hashValue = RotateLeft(hashValue, 23) * Primes64[1] + Primes64[2];
                offset += 4;
            }

            // Up to three single bytes.
            for (; offset < remainderLength; offset++)
            {
                hashValue ^= data[offset] * Primes64[4];
                hashValue = RotateLeft(hashValue, 11) * Primes64[0];
            }

            hashValue ^= hashValue >> 33;
            hashValue *= Primes64[1];
            hashValue ^= hashValue >> 29;
            hashValue *= Primes64[2];
            hashValue ^= hashValue >> 32;

            return hashValue;
        }
    }

    /// <summary>Folds one lane accumulator into the running hash value.</summary>
    private static ulong MergeLane(ulong hashValue, ulong lane)
    {
        unchecked
        {
            lane *= Primes64[1];
            lane = RotateLeft(lane, 31);
            lane *= Primes64[0];

            hashValue ^= lane;
            return hashValue * Primes64[0] + Primes64[3];
        }
    }

    /// <summary>
    /// Rotates <paramref name="operand"/> left by <paramref name="shiftCount"/> bits
    /// (the count is reduced modulo 64). Used by both the host code and the kernel.
    /// </summary>
    private static ulong RotateLeft(ulong operand, int shiftCount)
    {
        shiftCount &= 0x3f;

        return
            (operand << shiftCount) |
            (operand >> (64 - shiftCount));
    }
}
|
40
Wabbajack.Hashing.xxHash64.GPU/AlgorithmState.cs
Normal file
40
Wabbajack.Hashing.xxHash64.GPU/AlgorithmState.cs
Normal file
@ -0,0 +1,40 @@
|
||||
namespace Wabbajack.Hashing.xxHash64.GPU;
|
||||
|
||||
/// <summary>
/// Mutable running state for a seeded hash computation: the seed, four 64-bit
/// accumulators and a count of bytes processed so far.
/// </summary>
public struct AlgorithmState
{
    private const ulong Prime0 = 11400714785074694791UL;
    private const ulong Prime1 = 14029467366897019727UL;
    private const ulong Prime2 = 1609587929392839161UL;
    private const ulong Prime3 = 9650029242287828579UL;
    private const ulong Prime4 = 2870177450012600261UL;

    // Seed the accumulators were derived from; fixed at construction.
    internal readonly ulong Seed;

    // The four accumulators.
    internal ulong A;
    internal ulong B;
    internal ulong C;
    internal ulong D;

    // Starts at zero; this type never updates it itself.
    internal ulong BytesProcessed;

    /// <summary>
    /// Creates the starting state for <paramref name="seed"/>: the accumulators are
    /// set to seed + Prime0 + Prime1, seed + Prime1, seed and seed - Prime0.
    /// </summary>
    /// <param name="seed">Seed value mixed into every accumulator.</param>
    public AlgorithmState(ulong seed)
    {
        Seed = seed;

        A = seed + Prime0 + Prime1;
        B = seed + Prime1;
        C = seed;
        D = seed - Prime0;

        BytesProcessed = 0;
    }
}
|
@ -0,0 +1,13 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="ILGPU" Version="1.2.0" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
39
Wabbajack.Hashing.xxHash64.Test/GPUTests.cs
Normal file
39
Wabbajack.Hashing.xxHash64.Test/GPUTests.cs
Normal file
@ -0,0 +1,39 @@
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using ILGPU;
|
||||
using ILGPU.Runtime;
|
||||
using Wabbajack.Hashing.xxHash64.GPU;
|
||||
using Xunit;
|
||||
|
||||
namespace Wabbajack.Hashing.xxHash64.Test;
|
||||
|
||||
/// <summary>
/// Runs <c>Algorithm.HashBytes</c> on every device exposed by the shared ILGPU context
/// and checks the result against a fixed expected hash.
/// </summary>
public class GPUTests
{
    static GPUTests()
    {
        const StaticFieldMode fieldMode =
            StaticFieldMode.MutableStaticFields | StaticFieldMode.IgnoreStaticFieldStores;

        CurrentContext = Context.Create(builder => builder.Default().StaticFields(fieldMode));
    }

    /// <summary>ILGPU context shared by every test case; created once in the static constructor.</summary>
    public static Context CurrentContext { get; set; }

    /// <summary>
    /// Hashes <paramref name="data"/> on <paramref name="acc"/> and compares the result
    /// with the expected hash for the payload produced by <see cref="Accelerators"/>.
    /// </summary>
    [Theory]
    [MemberData(nameof(Accelerators))]
    public void CanHashData(Accelerator acc, byte[] data)
    {
        var actual = Algorithm.HashBytes(acc, data);

        var expected = Hash.FromBase64("vBY6OyblpIw=");
        Assert.Equal(expected, Hash.FromULong(actual));
    }

    /// <summary>
    /// Theory data: one row per device in <see cref="CurrentContext"/>, each holding a new
    /// accelerator for that device and the same 1 GiB buffer filled from <c>Random(42)</c>.
    /// </summary>
    public static IEnumerable<object[]> Accelerators()
    {
        // The payload is built once, up front, and shared by every row; only the
        // accelerators are created lazily as the sequence is enumerated.
        var payload = new byte[1024 * 1024 * 1024];
        new Random(42).NextBytes(payload);

        return CurrentContext.Devices.Select(
            device => new object[] {device.CreateAccelerator(CurrentContext), payload});
    }
}
|
@ -23,6 +23,7 @@
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64.GPU\Wabbajack.Hashing.xxHash64.GPU.csproj" />
|
||||
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64\Wabbajack.Hashing.xxHash64.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
|
@ -143,6 +143,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.App.Wpf", "Wabbaj
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.VFS.Interfaces", "Wabbajack.VFS.Interfaces\Wabbajack.VFS.Interfaces.csproj", "{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.Hashing.xxHash64.GPU", "Wabbajack.Hashing.xxHash64.GPU\Wabbajack.Hashing.xxHash64.GPU.csproj", "{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
@ -393,6 +395,10 @@ Global
|
||||
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
@ -442,6 +448,7 @@ Global
|
||||
{B10BB6D6-B3FC-4A76-8A07-6A0A0ADDE198} = {98B731EE-4FC0-4482-A069-BCBA25497871}
|
||||
{7FC4F129-F0FA-46B7-B7C4-532E371A6326} = {98B731EE-4FC0-4482-A069-BCBA25497871}
|
||||
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22} = {F677890D-5109-43BC-97C7-C4CD47C8EE0C}
|
||||
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10} = {B953DCDB-6D18-483F-BC38-1E4B1D3E12B5}
|
||||
EndGlobalSection
|
||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||
SolutionGuid = {0AA30275-0F38-4A7D-B645-F5505178DDE8}
|
||||
|
Loading…
Reference in New Issue
Block a user