diff --git a/invokeai/app/invocations/flux_text_to_image.py b/invokeai/app/invocations/flux_text_to_image.py
index 93d763428b..8b947b9a8d 100644
--- a/invokeai/app/invocations/flux_text_to_image.py
+++ b/invokeai/app/invocations/flux_text_to_image.py
@@ -79,8 +79,6 @@ class FluxTextToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
         inference_dtype = torch.bfloat16
 
         # Prepare input noise.
-        # TODO(ryand): Does the seed behave the same on different devices? Should we re-implement this to always use a
-        # CPU RNG?
         x = get_noise(
             num_samples=1,
             height=self.height,
diff --git a/invokeai/backend/flux/sampling.py b/invokeai/backend/flux/sampling.py
index 9917d63a8b..82abc0e561 100644
--- a/invokeai/backend/flux/sampling.py
+++ b/invokeai/backend/flux/sampling.py
@@ -20,16 +20,19 @@ def get_noise(
     dtype: torch.dtype,
     seed: int,
 ):
+    # We always generate noise on the same device and dtype then cast to ensure consistency across devices/dtypes.
+    rand_device = "cpu"
+    rand_dtype = torch.float16
     return torch.randn(
         num_samples,
         16,
         # allow for packing
         2 * math.ceil(height / 16),
         2 * math.ceil(width / 16),
-        device=device,
-        dtype=dtype,
-        generator=torch.Generator(device=device).manual_seed(seed),
-    )
+        device=rand_device,
+        dtype=rand_dtype,
+        generator=torch.Generator(device=rand_device).manual_seed(seed),
+    ).to(device=device, dtype=dtype)
 
 
 def prepare(t5: HFEncoder, clip: HFEncoder, img: Tensor, prompt: str | list[str]) -> dict[str, Tensor]: