Some failed attempts at seeing whether we could make hashing go faster on the GPU

This commit is contained in:
Timothy Baldridge 2022-07-24 21:47:33 -06:00
parent 25041ab5b3
commit 95f7a6b335
8 changed files with 311 additions and 5 deletions

View File

@ -2,6 +2,11 @@
using System.Data.HashFunction.xxHash;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using ILGPU;
using ILGPU.Runtime;
using ILGPU.Runtime.Cuda;
using ILGPU.Runtime.OpenCL;
using Wabbajack.Hashing.xxHash64.GPU;
namespace Wabbajack.Hashing.xxHash64.Benchmark;
@ -9,7 +14,7 @@ internal class Program
{
/// <summary>
/// Benchmark entry point: hands each benchmark class to BenchmarkDotNet's runner.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
private static void Main(string[] args)
{
// NOTE(review): this text appears to come from a diff view with the +/- markers removed,
// so one of the two Run calls below may be the pre-change line - confirm which is intended.
// As written, the Base64 encoder suite runs first, followed by the xxHash suite.
BenchmarkRunner.Run<Base64EncoderBenchmark>();
BenchmarkRunner.Run<xxHashBenchmark>();
}
}
@ -18,10 +23,17 @@ internal class Program
public class xxHashBenchmark
{
private readonly byte[] _data;
private readonly Context _context;
private readonly Accelerator _gpu;
private readonly Accelerator _cpu;
/// <summary>
/// Allocates the benchmark input buffer and creates the ILGPU context and the two
/// accelerators that the GPU/CPU benchmark methods hash with.
/// </summary>
public xxHashBenchmark()
{
    // 1 GiB input buffer. It is assigned exactly once: the former 1 MiB allocation was
    // overwritten on the very next statement and only produced garbage.
    _data = new byte[1024 * 1024 * 1024];

    _context = Context.CreateDefault();

    // NOTE(review): the boolean passed to GetPreferredDevice presumably means "prefer the
    // CPU device" (false for _gpu, true for _cpu) - confirm against the ILGPU documentation.
    _gpu = _context.GetPreferredDevice(false).CreateAccelerator(_context);
    _cpu = _context.GetPreferredDevice(true).CreateAccelerator(_context);
}
[Benchmark]
@ -37,6 +49,18 @@ public class xxHashBenchmark
var config = new xxHashConfig {HashSizeInBits = 64};
BitConverter.ToUInt64(xxHashFactory.Instance.Create(config).ComputeHash(_data).Hash);
}
/// <summary>
/// Benchmark case: hashes the shared input buffer with <c>Algorithm.HashBytes</c>
/// on the accelerator stored in <c>_gpu</c>. The resulting hash is discarded.
/// </summary>
[Benchmark]
public void GPUCode() => Algorithm.HashBytes(_gpu, _data);
/// <summary>
/// Benchmark case: hashes the shared input buffer with <c>Algorithm.HashBytes</c>
/// on the accelerator stored in <c>_cpu</c>. The resulting hash is discarded.
/// </summary>
[Benchmark]
public void CPUCode() => Algorithm.HashBytes(_cpu, _data);
}
[MemoryDiagnoser]

View File

@ -13,6 +13,7 @@
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64.GPU\Wabbajack.Hashing.xxHash64.GPU.csproj" />
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64\Wabbajack.Hashing.xxHash64.csproj" />
</ItemGroup>

View File

@ -0,0 +1,181 @@
using ILGPU;
using ILGPU.Runtime;
using ILGPU.Util;
namespace Wabbajack.Hashing.xxHash64.GPU;
/// <summary>
/// Experimental xxHash64 implementation that runs the bulk "stripe" phase on an ILGPU
/// accelerator and the finalization (lane merge, tail bytes, avalanche) on the host.
/// </summary>
public class Algorithm
{
    private const ulong Prime0 = 11400714785074694791UL;
    private const ulong Prime1 = 14029467366897019727UL;
    private const ulong Prime2 = 1609587929392839161UL;
    private const ulong Prime3 = 9650029242287828579UL;
    private const ulong Prime4 = 2870177450012600261UL;
    private const ulong Seed = 0UL;

    /// <summary>Bytes consumed per stripe: four lanes of eight bytes each.</summary>
    private const int StripeSize = 32;

    /// <summary>Number of independent accumulator lanes (and of kernel threads launched).</summary>
    private const int LaneCount = 4;

    /// <summary>
    /// Hashes <paramref name="data"/> on the accelerator bound to the current thread.
    /// </summary>
    /// <param name="data">Bytes to hash.</param>
    /// <returns>The 64-bit hash value.</returns>
    /// <exception cref="InvalidOperationException">
    /// <c>Accelerator.Current</c> is null for the calling thread.
    /// </exception>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    public ulong HashBytes(byte[] data)
    {
        var accelerator = Accelerator.Current
                          ?? throw new InvalidOperationException("No ILGPU accelerator is bound to the current thread.");
        return HashBytes(accelerator, data);
    }

    /// <summary>
    /// Hashes <paramref name="data"/>, processing every complete 32-byte stripe on
    /// <paramref name="accelerator"/> and the remaining 0-31 bytes on the host.
    /// </summary>
    /// <param name="accelerator">Accelerator that executes the stripe kernel.</param>
    /// <param name="data">Bytes to hash.</param>
    /// <returns>The 64-bit hash value (seed 0).</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static ulong HashBytes(Accelerator accelerator, byte[] data)
    {
        ArgumentNullException.ThrowIfNull(accelerator);
        ArgumentNullException.ThrowIfNull(data);

        // Length of the longest prefix made of whole stripes.
        var initialSize = (data.Length / StripeSize) * StripeSize;

        var lanes = new ulong[LaneCount];
        unchecked
        {
            // Wrap-around is intended here; without 'unchecked' these constant
            // expressions would be rejected at compile time.
            lanes[0] = Seed + Prime0 + Prime1;
            lanes[1] = Seed + Prime1;
            lanes[2] = Seed;
            lanes[3] = Seed - Prime0;
        }

        // With no complete stripe the lane state is ignored by the finalizer,
        // so the accelerator is not touched at all.
        if (initialSize > 0)
        {
            // The device buffer holds exactly initialSize bytes, so only that many bytes may
            // be uploaded. An unaligned input costs one host-side copy of the aligned prefix.
            var stripes = initialSize == data.Length ? data : data[..initialSize];

            // Device buffers are released as soon as the lane state has been read back.
            using var gpuData = accelerator.Allocate1D<byte>(initialSize);
            gpuData.CopyFromCPU(stripes);

            using var state = accelerator.Allocate1D<ulong>(LaneCount);
            state.CopyFromCPU(lanes);

            var transformKernel = accelerator.LoadAutoGroupedStreamKernel<Index1D, ArrayView<ulong>, ArrayView<byte>, int>(TransformByteGroupsInternal);

            // One thread per lane: each lane's accumulator depends on its own previous
            // value, so a lane is walked sequentially by a single thread.
            transformKernel(new Index1D(LaneCount), state.View, gpuData.View, initialSize);

            state.View.CopyToCPU(lanes);
        }

        return FinalizeHashValueInternal(lanes, data.AsSpan(initialSize..), (ulong) initialSize);
    }

    /// <summary>
    /// Kernel body. Thread <c>index.X</c> owns lane <c>index.X</c>: it folds every fourth
    /// 64-bit word of the input, starting at its own lane number, into that lane's state.
    /// </summary>
    /// <param name="index">Thread index; also the lane number.</param>
    /// <param name="state">Lane accumulators, read on entry and written back on exit.</param>
    /// <param name="dataIn">Stripe-aligned input bytes, reinterpreted as 64-bit words.</param>
    /// <param name="size">Input size in bytes. Not read; the loop bound comes from the view length.</param>
    private static void TransformByteGroupsInternal(Index1D index, ArrayView<ulong> state, ArrayView<byte> dataIn, int size)
    {
        unchecked
        {
            var data = dataIn.Cast<ulong>();
            var temp = state[index.X];
            var tempPrime0 = Prime0;
            var tempPrime1 = Prime1;

            for (var idx = index.X; idx < data.Length; idx += LaneCount)
            {
                temp += data[idx] * tempPrime1;
                temp = RotateLeft(temp, 31);
                temp *= tempPrime0;
            }

            state[index.X] = temp;
        }
    }

    /// <summary>
    /// Produces the final hash from the lane state and the bytes that did not fill a stripe.
    /// </summary>
    /// <param name="hashState">The four lane accumulators; only read when <paramref name="bytesProcessed"/> is non-zero.</param>
    /// <param name="data">Trailing bytes that were not part of a complete stripe.</param>
    /// <param name="bytesProcessed">Number of bytes already folded into <paramref name="hashState"/>.</param>
    /// <returns>The finished 64-bit hash.</returns>
    private static ulong FinalizeHashValueInternal(ulong[] hashState, ReadOnlySpan<byte> data, ulong bytesProcessed)
    {
        unchecked
        {
            ulong hashValue;
            if (bytesProcessed > 0)
            {
                hashValue = RotateLeft(hashState[0], 1) + RotateLeft(hashState[1], 7) +
                            RotateLeft(hashState[2], 12) + RotateLeft(hashState[3], 18);

                for (var lane = 0; lane < LaneCount; lane++)
                {
                    hashValue = MergeLane(hashValue, hashState[lane]);
                }
            }
            else
            {
                hashValue = Seed + Prime4;
            }

            // Total input length = striped bytes + tail bytes.
            hashValue += bytesProcessed + (ulong) data.Length;

            var offset = 0;

            // Every complete 8-byte chunk of the tail.
            for (; offset + 8 <= data.Length; offset += 8)
            {
                hashValue ^= RotateLeft(BitConverter.ToUInt64(data[offset..]) * Prime1, 31) * Prime0;
                hashValue = RotateLeft(hashValue, 27) * Prime0 + Prime3;
            }

            // At most one 4-byte chunk.
            if (offset + 4 <= data.Length)
            {
                hashValue ^= BitConverter.ToUInt32(data[offset..]) * Prime0;
                hashValue = RotateLeft(hashValue, 23) * Prime1 + Prime2;
                offset += 4;
            }

            // Up to three leftover single bytes.
            for (; offset < data.Length; offset++)
            {
                hashValue ^= data[offset] * Prime4;
                hashValue = RotateLeft(hashValue, 11) * Prime0;
            }

            // Final avalanche.
            hashValue ^= hashValue >> 33;
            hashValue *= Prime1;
            hashValue ^= hashValue >> 29;
            hashValue *= Prime2;
            hashValue ^= hashValue >> 32;
            return hashValue;
        }
    }

    /// <summary>Runs one more round on <paramref name="lane"/> and mixes it into <paramref name="hashValue"/>.</summary>
    private static ulong MergeLane(ulong hashValue, ulong lane)
    {
        unchecked
        {
            lane *= Prime1;
            lane = RotateLeft(lane, 31);
            lane *= Prime0;
            hashValue ^= lane;
            return hashValue * Prime0 + Prime3;
        }
    }

    /// <summary>
    /// Rotates <paramref name="operand"/> left by <paramref name="shiftCount"/> bits (taken modulo 64).
    /// Written with plain shifts because it is also called from the kernel body.
    /// </summary>
    private static ulong RotateLeft(ulong operand, int shiftCount)
    {
        shiftCount &= 0x3f;
        return
            (operand << shiftCount) |
            (operand >> (64 - shiftCount));
    }
}

View File

@ -0,0 +1,40 @@
namespace Wabbajack.Hashing.xxHash64.GPU;
/// <summary>
/// Working state of a seeded xxHash64 computation: the seed, four 64-bit
/// accumulators and a running count of consumed bytes.
/// </summary>
public struct AlgorithmState
{
    private const ulong Prime0 = 11400714785074694791UL;
    private const ulong Prime1 = 14029467366897019727UL;
    private const ulong Prime2 = 1609587929392839161UL;
    private const ulong Prime3 = 9650029242287828579UL;
    private const ulong Prime4 = 2870177450012600261UL;

    // Same five primes, addressable by index.
    private static readonly IReadOnlyList<ulong> Primes64 =
        new[] { Prime0, Prime1, Prime2, Prime3, Prime4 };

    /// <summary>Seed the accumulators were derived from.</summary>
    internal readonly ulong Seed;

    /// <summary>First accumulator; starts at seed + prime 0 + prime 1.</summary>
    internal ulong A;

    /// <summary>Second accumulator; starts at seed + prime 1.</summary>
    internal ulong B;

    /// <summary>Third accumulator; starts at the seed itself.</summary>
    internal ulong C;

    /// <summary>Fourth accumulator; starts at seed - prime 0.</summary>
    internal ulong D;

    /// <summary>Byte counter; starts at zero.</summary>
    internal ulong BytesProcessed;

    /// <summary>
    /// Creates the initial state for <paramref name="seed"/>. All sums and
    /// differences wrap around at 64 bits.
    /// </summary>
    /// <param name="seed">Hash seed.</param>
    public AlgorithmState(ulong seed)
    {
        var first = Primes64[0];
        var second = Primes64[1];

        Seed = seed;
        BytesProcessed = 0UL;

        A = seed + first + second;
        B = seed + second;
        C = seed;
        D = seed - first;
    }
}

View File

@ -0,0 +1,13 @@
<!-- SDK-style project file. NOTE(review): presumably Wabbajack.Hashing.xxHash64.GPU.csproj - confirm. -->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<!-- Builds against .NET 6 with implicit global usings and nullable reference types switched on. -->
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<!-- ILGPU is the only package dependency declared here. -->
<PackageReference Include="ILGPU" Version="1.2.0" />
</ItemGroup>
</Project>

View File

@ -0,0 +1,39 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using ILGPU;
using ILGPU.Runtime;
using Wabbajack.Hashing.xxHash64.GPU;
using Xunit;
namespace Wabbajack.Hashing.xxHash64.Test;
/// <summary>
/// Runs the ILGPU-backed xxHash64 implementation on every device the shared context
/// reports and checks the result against a known hash of the fixed test buffer.
/// </summary>
public class GPUTests
{
    /// <summary>Creates the ILGPU context shared by all tests in this class.</summary>
    static GPUTests()
    {
        CurrentContext = Context.Create(b => b.Default().StaticFields(StaticFieldMode.MutableStaticFields | StaticFieldMode.IgnoreStaticFieldStores));
    }

    /// <summary>
    /// Hashes <paramref name="data"/> on <paramref name="acc"/> and compares the result
    /// with the expected hash. The accelerator is disposed when the test finishes.
    /// </summary>
    /// <param name="acc">Accelerator created for this data row by <see cref="Accelerators"/>.</param>
    /// <param name="data">Bytes to hash.</param>
    [Theory]
    [MemberData(nameof(Accelerators))]
    public void CanHashData(Accelerator acc, byte[] data)
    {
        // Every data row gets its own freshly created accelerator and nothing else
        // holds on to it, so release it here instead of leaking it.
        using (acc)
        {
            var result = Algorithm.HashBytes(acc, data);

            Assert.Equal(Hash.FromBase64("vBY6OyblpIw="), Hash.FromULong(result));
        }
    }

    /// <summary>ILGPU context used to enumerate devices and create accelerators.</summary>
    public static Context CurrentContext { get; set; }

    /// <summary>
    /// Theory data: one row per device of <see cref="CurrentContext"/>, each holding a new
    /// accelerator for that device and the same 1 GiB buffer of seeded pseudo-random bytes.
    /// </summary>
    public static IEnumerable<object[]> Accelerators()
    {
        // Fixed seed keeps the buffer, and therefore the expected hash, reproducible.
        var random = new Random(42);
        var data = new byte[1024 * 1024 * 1024];
        random.NextBytes(data);

        return CurrentContext.Devices.Select(device => new object[] { device.CreateAccelerator(CurrentContext), data });
    }
}

View File

@ -23,6 +23,7 @@
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64.GPU\Wabbajack.Hashing.xxHash64.GPU.csproj" />
<ProjectReference Include="..\Wabbajack.Hashing.xxHash64\Wabbajack.Hashing.xxHash64.csproj" />
</ItemGroup>

View File

@ -143,6 +143,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.App.Wpf", "Wabbaj
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.VFS.Interfaces", "Wabbajack.VFS.Interfaces\Wabbajack.VFS.Interfaces.csproj", "{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wabbajack.Hashing.xxHash64.GPU", "Wabbajack.Hashing.xxHash64.GPU\Wabbajack.Hashing.xxHash64.GPU.csproj", "{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@ -393,6 +395,10 @@ Global
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22}.Release|Any CPU.Build.0 = Release|Any CPU
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Debug|Any CPU.Build.0 = Debug|Any CPU
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Release|Any CPU.ActiveCfg = Release|Any CPU
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@ -442,6 +448,7 @@ Global
{B10BB6D6-B3FC-4A76-8A07-6A0A0ADDE198} = {98B731EE-4FC0-4482-A069-BCBA25497871}
{7FC4F129-F0FA-46B7-B7C4-532E371A6326} = {98B731EE-4FC0-4482-A069-BCBA25497871}
{E4BDB22D-11A4-452F-8D10-D9CA9777EA22} = {F677890D-5109-43BC-97C7-C4CD47C8EE0C}
{7A06F752-2D2E-412F-BEDF-D2B4A1A0DE10} = {B953DCDB-6D18-483F-BC38-1E4B1D3E12B5}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {0AA30275-0F38-4A7D-B645-F5505178DDE8}