Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 30 additions & 16 deletions src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,23 @@ public void Shuffle(ReadOnlySpan<byte> source, Span<byte> destination)

SimdUtils.Shuffle.InverseMMShuffle(this.Control, out uint p3, out uint p2, out uint p1, out uint p0);

Span<byte> temp = stackalloc byte[4];
ref byte t = ref MemoryMarshal.GetReference(temp);
ref uint tu = ref Unsafe.As<byte, uint>(ref t);

for (nuint i = 0, j = 0; i < (uint)source.Length; i += 3, j += 4)
{
ref byte s = ref Unsafe.Add(ref sBase, i);
tu = Unsafe.As<byte, uint>(ref s) | 0xFF000000;

Unsafe.Add(ref dBase, j + 0) = Unsafe.Add(ref t, p0);
Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref t, p1);
Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref t, p2);
Unsafe.Add(ref dBase, j + 3) = Unsafe.Add(ref t, p3);
// Expanding 3-byte pixels to 4 bytes can overwrite the next source
// triplet when spans overlap. Assemble the padded pixel first, then
// shuffle from the staged uint.
uint packed =
Unsafe.Add(ref sBase, i + 0u) |
((uint)Unsafe.Add(ref sBase, i + 1u) << 8) |
((uint)Unsafe.Add(ref sBase, i + 2u) << 16) |
0xFF000000;

ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);

Unsafe.Add(ref dBase, j + 0u) = Unsafe.Add(ref pBase, p0);
Unsafe.Add(ref dBase, j + 1u) = Unsafe.Add(ref pBase, p1);
Unsafe.Add(ref dBase, j + 2u) = Unsafe.Add(ref pBase, p2);
Unsafe.Add(ref dBase, j + 3u) = Unsafe.Add(ref pBase, p3);
}
}
}
Expand All @@ -65,18 +69,28 @@ public void Shuffle(ReadOnlySpan<byte> source, Span<byte> destination)

while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd))
{
Unsafe.As<byte, uint>(ref dBase) = Unsafe.As<byte, uint>(ref sBase) | 0xFF000000;
// The fast scalar path reads one extra byte past the source triplet.
// Keep that widened read in a local before writing the expanded pixel
// so overlapping destinations cannot change what was read.
uint packed = Unsafe.As<byte, uint>(ref sBase) | 0xFF000000;

Unsafe.As<byte, uint>(ref dBase) = packed;

sBase = ref Unsafe.Add(ref sBase, 3);
dBase = ref Unsafe.Add(ref dBase, 4);
}

while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd))
{
Unsafe.Add(ref dBase, 0) = Unsafe.Add(ref sBase, 0);
Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1);
Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2);
Unsafe.Add(ref dBase, 3) = byte.MaxValue;
// The final triplet cannot use the widened read above, so assemble
// the same padded uint byte-by-byte before the overlapping store.
uint packed =
Unsafe.Add(ref sBase, 0u) |
((uint)Unsafe.Add(ref sBase, 1u) << 8) |
((uint)Unsafe.Add(ref sBase, 2u) << 16) |
0xFF000000;

Unsafe.As<byte, uint>(ref dBase) = packed;

sBase = ref Unsafe.Add(ref sBase, 3);
dBase = ref Unsafe.Add(ref dBase, 4);
Expand Down
16 changes: 13 additions & 3 deletions src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,19 @@ public void Shuffle(ReadOnlySpan<byte> source, Span<byte> destination)

for (nuint i = 0; i < (uint)source.Length; i += 3)
{
Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i);
Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
// The scalar remainder can run in-place after the vector body. Load
// the full 3-byte pixel into a register-sized value before stores so
// channel swaps cannot corrupt later reads from the same pixel.
uint packed =
Unsafe.Add(ref sBase, i + 0u) |
((uint)Unsafe.Add(ref sBase, i + 1u) << 8) |
((uint)Unsafe.Add(ref sBase, i + 2u) << 16);

ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);

Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
}
}
}
14 changes: 10 additions & 4 deletions src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,16 @@ public void Shuffle(ReadOnlySpan<byte> source, Span<byte> destination)

for (nuint i = 0; i < (uint)source.Length; i += 4)
{
Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i);
Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
// The generic path may be used with source and destination pointing
// at the same pixel. Load all channels first so subsequent stores
// index only staged bytes, matching the specialized uint shuffles.
uint packed = Unsafe.As<byte, uint>(ref Unsafe.Add(ref sBase, i));
ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);

Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
Unsafe.Add(ref dBase, i + 3u) = Unsafe.Add(ref pBase, p3);
}
}
}
Expand Down
34 changes: 26 additions & 8 deletions src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,15 @@ public void Shuffle(ReadOnlySpan<byte> source, Span<byte> destination)

for (nuint i = 0, j = 0; i < (uint)destination.Length; i += 3, j += 4)
{
Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + j);
Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + j);
Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + j);
// Shrinking 4-byte pixels to 3 bytes can still be called in-place by
// tail code. Read the complete source pixel first, then write only
// the requested channels into the destination triplet.
uint packed = Unsafe.As<byte, uint>(ref Unsafe.Add(ref sBase, j));
ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);

Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
}
}
}
Expand All @@ -61,18 +67,30 @@ public void Shuffle(ReadOnlySpan<byte> source, Span<byte> destination)

while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd))
{
Unsafe.Add(ref dBase, 0) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 0));
Unsafe.Add(ref dBase, 1) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 1));
Unsafe.Add(ref dBase, 2) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 2));
Unsafe.Add(ref dBase, 3) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 3));
// Stage the four source pixels before the 3-byte stores. Even
// though this path preserves XYZ order, the packed loads must happen
// before destination writes when the spans overlap.
uint packed0 = Unsafe.Add(ref sBase, 0u);
uint packed1 = Unsafe.Add(ref sBase, 1u);
uint packed2 = Unsafe.Add(ref sBase, 2u);
uint packed3 = Unsafe.Add(ref sBase, 3u);

Unsafe.Add(ref dBase, 0u) = Unsafe.As<uint, Byte3>(ref packed0);
Unsafe.Add(ref dBase, 1u) = Unsafe.As<uint, Byte3>(ref packed1);
Unsafe.Add(ref dBase, 2u) = Unsafe.As<uint, Byte3>(ref packed2);
Unsafe.Add(ref dBase, 3u) = Unsafe.As<uint, Byte3>(ref packed3);

sBase = ref Unsafe.Add(ref sBase, 4);
dBase = ref Unsafe.Add(ref dBase, 4);
}

while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd))
{
Unsafe.Add(ref dBase, 0) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 0));
// Same overlap rule as the unrolled loop: take the 4-byte source
// pixel before storing the 3-byte destination value.
uint packed = Unsafe.Add(ref sBase, 0u);

Unsafe.Add(ref dBase, 0u) = Unsafe.As<uint, Byte3>(ref packed);

sBase = ref Unsafe.Add(ref sBase, 1);
dBase = ref Unsafe.Add(ref dBase, 1);
Expand Down
14 changes: 10 additions & 4 deletions src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

Expand Down Expand Up @@ -150,10 +151,15 @@ private static void Shuffle4Remainder(

for (nuint i = 0; i < (uint)source.Length; i += 4)
{
Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i);
Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
// Stage the scalar tail in a local Vector4 so p0..p3 index source
// values that were captured before any overlapping destination writes.
Vector4 v = Unsafe.As<float, Vector4>(ref Unsafe.Add(ref sBase, i));
ref float pBase = ref Unsafe.As<Vector4, float>(ref v);

Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
Unsafe.Add(ref dBase, i + 3u) = Unsafe.Add(ref pBase, p3);
}
}

Expand Down
31 changes: 30 additions & 1 deletion src/ImageSharp/Compression/Zlib/ZlibInflateStream.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,19 @@ internal sealed class ZlibInflateStream : Stream
/// </summary>
private readonly Func<int> getData;

/// <summary>
/// When true, the inflated payload is treated as a raw DEFLATE stream with no zlib
/// CMF/FLG header (and no Adler-32 trailer). This is required to decode IDATs in
/// Apple's proprietary CgBI PNG variant.
/// </summary>
private readonly bool noHeader;

/// <summary>
/// Initializes a new instance of the <see cref="ZlibInflateStream"/> class.
/// </summary>
/// <param name="innerStream">The inner raw stream.</param>
public ZlibInflateStream(BufferedReadStream innerStream)
: this(innerStream, GetDataNoOp)
: this(innerStream, GetDataNoOp, noHeader: false)
{
}

Expand All @@ -67,9 +74,23 @@ public ZlibInflateStream(BufferedReadStream innerStream)
/// <param name="innerStream">The inner raw stream.</param>
/// <param name="getData">A delegate to get more data from the inner stream.</param>
public ZlibInflateStream(BufferedReadStream innerStream, Func<int> getData)
: this(innerStream, getData, noHeader: false)
{
}

/// <summary>
/// Initializes a new instance of the <see cref="ZlibInflateStream"/> class.
/// </summary>
/// <param name="innerStream">The inner raw stream.</param>
/// <param name="getData">A delegate to get more data from the inner stream.</param>
/// <param name="noHeader">
/// When <see langword="true"/>, the payload is treated as raw DEFLATE with no zlib header.
/// </param>
public ZlibInflateStream(BufferedReadStream innerStream, Func<int> getData, bool noHeader)
{
this.innerStream = innerStream;
this.getData = getData;
this.noHeader = noHeader;
}

/// <inheritdoc/>
Expand Down Expand Up @@ -210,6 +231,14 @@ protected override void Dispose(bool disposing)
[MemberNotNullWhen(true, nameof(CompressedStream))]
private bool InitializeInflateStream(bool isCriticalChunk)
{
// Apple CgBI IDATs omit the zlib CMF/FLG header and the Adler-32 trailer,
// wrapping a raw DEFLATE payload directly. Skip the header parsing in that mode.
if (this.noHeader)
{
this.CompressedStream = new DeflateStream(this, CompressionMode.Decompress, true);
return true;
}

// Read the zlib header : http://tools.ietf.org/html/rfc1950
// CMF(Compression Method and flags)
// This byte is divided into a 4 - bit compression method and a
Expand Down
Loading
Loading