// Source: FloppyDisk / asw-render-pipelines-universal
							using Unity.Collections;
using Unity.Collections.LowLevel.Unsafe;
using System.Runtime.CompilerServices;
using Unity.Mathematics;
using static Unity.Mathematics.math;

namespace UnityEngine.Rendering.Universal.Internal
{
    // This structure is designed to be Burst friendly.
    // It can be copied by value.
    internal struct DeferredTiler
    {
        // Precomputed per-light data consumed by the tile culling loops
        // (CullIntermediateLights() and CullFinalLights()).
        internal struct PrePunctualLight
        {
            // Light position in view space.
            public float3 posVS;
            // Radius of the light's bounding sphere, in world units.
            public float radius;
            // Distance between closest bound of the light and the camera. Used for sorting lights front-to-back.
            public float minDist;
            // Projected position of the sphere centre on the screen (near plane).
            public float2 screenPos;
            // Index into renderingData.lightData.visibleLights native array.
            public ushort visLightIndex;
        }

        // Outcome of testing a light sphere against a single tile plane (see ClipPartial()).
        enum ClipResult
        {
            // The tested plane alone could not decide; the remaining planes still have to be tested.
            Unknown,
            // The sphere is accepted as overlapping the tile.
            In,
            // The sphere lies completely outside the tested plane.
            Out,
        }

        // Tile dimensions in pixels, as passed to the constructor.
        int m_TilePixelWidth;
        int m_TilePixelHeight;
        // Number of tiles along X and Y. Zero until PrecomputeTiles() derives them from the render size.
        int m_TileXCount;
        int m_TileYCount;
        // Fixed header size, in uint, of each tile entry in m_TileHeaders.
        // Only the finest tiler needs to store extra per-tile information (light list depth range, bitmask for 2.5D culling).
        int m_TileHeaderSize;
        // Indicative average number of lights per tile. Only used when sizing m_TileData for the first time (see Setup()).
        int m_AvgLightPerTile;
        // 0, 1 or 2 (see DeferredConfig.kTilerDepth)
        int m_TilerLevel;

        // Camera frustum planes, adjusted to account for tile size.
        FrustumPlanes m_FrustumPlanes;
        // True when dealing with an orthographic projection.
        bool m_IsOrthographic;

        // Atomic counters are put in a NativeArray so they can be accessed/shared from jobs.
        // [0] maxLightPerTile: Only valid for finest tiler: max light counter per tile. Reset every frame.
        // [1] tileDataSize: reset every frame.
        // [2] tileDataCapacity: extra amount of memory required by each tiler (depends on number of lights visible). Externally maintained.
        [Unity.Collections.LowLevel.Unsafe.NativeDisableContainerSafetyRestriction]
        NativeArray<int> m_Counters;

        // Stores all visible light indices for all tiles.
        // (Currently) contains sequential blocks of ushort values (light indices and optionally lightDepthRange), one block per tile.
        // For example for platforms using 16x16px tiles:
        // in a finest        tiler DeferredLights.m_Tilers[0] ( 16x16px  tiles), each tile will use a block of  1 *  1 * 32 =   32 ushort values
        // in an intermediate tiler DeferredLights.m_Tilers[1] ( 64x64px  tiles), each tile will use a block of  4 *  4 * 32 =  512 ushort values
        // in a coarsest      tiler DeferredLights.m_Tilers[2] (256x256px tiles), each tile will use a block of 16 * 16 * 32 = 8192 ushort values
        [Unity.Collections.LowLevel.Unsafe.NativeDisableContainerSafetyRestriction]
        NativeArray<ushort> m_TileData;

        // Stores the tile headers (fixed size per tile, see m_TileHeaderSize):
        // light offset, light count, optionally additional per-tile "header" values.
        [Unity.Collections.LowLevel.Unsafe.NativeDisableContainerSafetyRestriction]
        NativeArray<uint> m_TileHeaders;

        // Precomputed per-tile data (tile side planes), filled by PrecomputeTiles().
        [Unity.Collections.LowLevel.Unsafe.NativeDisableContainerSafetyRestriction]
        NativeArray<PreTile> m_PreTiles;

        // Creates a tiler for tiles of the given pixel size.
        // No native memory is allocated here: tile counts stay at zero until PrecomputeTiles() runs
        // and the native arrays stay un-created until Setup() / PrecomputeTiles() assign them.
        public DeferredTiler(int tilePixelWidth, int tilePixelHeight, int avgLightPerTile, int tilerLevel)
        {
            // Configuration supplied by the caller.
            m_TilerLevel = tilerLevel;
            m_TilePixelWidth = tilePixelWidth;
            m_TilePixelHeight = tilePixelHeight;
            m_AvgLightPerTile = avgLightPerTile;

            // Finest tiler (at index 0) computes extra tile data stored into the header, so it requires more space.
            // See CullFinalLights() vs CullIntermediateLights().
            if (tilerLevel == 0)
            {
                // Finest tiler: lightListOffset, lightCount, listDepthRange, listBitMask
                m_TileHeaderSize = 4;
            }
            else
            {
                // Coarse tilers: lightListOffset, lightCount
                m_TileHeaderSize = 2;
            }

            // Per-camera state, filled in later by PrecomputeTiles().
            m_TileXCount = 0;
            m_TileYCount = 0;
            m_IsOrthographic = false;
            m_FrustumPlanes = default(FrustumPlanes);

            // Native containers start out un-created.
            m_Counters = default(NativeArray<int>);
            m_TileData = default(NativeArray<ushort>);
            m_TileHeaders = default(NativeArray<uint>);
            m_PreTiles = default(NativeArray<PreTile>);
        }

        // Level of this tiler (0 is the finest).
        public int TilerLevel => m_TilerLevel;

        // Number of tiles along the X axis (valid after PrecomputeTiles()).
        public int TileXCount => m_TileXCount;

        // Number of tiles along the Y axis (valid after PrecomputeTiles()).
        public int TileYCount => m_TileYCount;

        // Tile width in pixels.
        public int TilePixelWidth => m_TilePixelWidth;

        // Tile height in pixels.
        public int TilePixelHeight => m_TilePixelHeight;

        // Number of uint values stored per tile in TileHeaders.
        public int TileHeaderSize => m_TileHeaderSize;

        // Counter [0]: maximum light count found in a tile, or 0 when the counters are not allocated.
        public int MaxLightPerTile => m_Counters.IsCreated ? m_Counters[0] : 0;

        // Counter [2]: tile data capacity, or 0 when the counters are not allocated.
        public int TileDataCapacity => m_Counters.IsCreated ? m_Counters[2] : 0;

        // Per-tile light lists (see GetTileOffsetAndCount() to locate the list of a tile).
        public NativeArray<ushort> Tiles => m_TileData;

        // Fixed-size per-tile headers.
        public NativeArray<uint> TileHeaders => m_TileHeaders;

        // Reads the first two header values of tile (i, j): the offset of its light list and its light count.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void GetTileOffsetAndCount(int i, int j, out int offset, out int count)
        {
            int header = GetTileHeaderOffset(i, j);
            uint rawOffset = m_TileHeaders[header];
            uint rawCount = m_TileHeaders[header + 1];

            offset = (int)rawOffset;
            count = (int)rawCount;
        }

        // Index, in m_TileHeaders, of the first header value of tile (i, j). Tiles are stored row by row.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public int GetTileHeaderOffset(int i, int j)
        {
            int tileIndex = i + j * m_TileXCount;
            return tileIndex * m_TileHeaderSize;
        }

        // Allocates the counters, the tile data buffer and the tile headers, then resets the counters.
        // tileDataCapacity: number of ushort values to reserve for tile data; when it is zero or negative
        // the capacity falls back to (tile count * m_AvgLightPerTile).
        public void Setup(int tileDataCapacity)
        {
            int tileCount = m_TileXCount * m_TileYCount;
            int capacity = tileDataCapacity > 0 ? tileDataCapacity : tileCount * m_AvgLightPerTile;

            // Memory is left uninitialized: counters are set below and headers are written by the culling functions.
            const NativeArrayOptions kOptions = NativeArrayOptions.UninitializedMemory;
            m_Counters = new NativeArray<int>(3, Allocator.Temp, kOptions);
            m_TileData = new NativeArray<ushort>(capacity, Allocator.Temp, kOptions);
            m_TileHeaders = new NativeArray<uint>(tileCount * m_TileHeaderSize, Allocator.Temp, kOptions);

            m_Counters[0] = 0;        // maxLightPerTile
            m_Counters[1] = 0;        // tileDataSize
            m_Counters[2] = capacity; // tileDataCapacity
        }

        // Disposes the native arrays allocated by Setup(), skipping those that were never created.
        // m_PreTiles is not disposed here; it is obtained from DeferredShaderData in PrecomputeTiles().
        public void OnCameraCleanup()
        {
            if (m_TileHeaders.IsCreated)
            {
                m_TileHeaders.Dispose();
            }

            if (m_TileData.IsCreated)
            {
                m_TileData.Dispose();
            }

            if (m_Counters.IsCreated)
            {
                m_Counters.Dispose();
            }
        }

        // Computes the tile grid for the given render size and precomputes the 4 side planes of every tile.
        // proj:           camera projection matrix; its frustum planes are decomposed and stored in m_FrustumPlanes.
        // isOrthographic: selects how the tile planes are built (see below).
        // renderWidth / renderHeight: render target size in pixels.
        public void PrecomputeTiles(Matrix4x4 proj, bool isOrthographic, int renderWidth, int renderHeight)
        {
            // Number of tiles needed to cover the render target (rounded up).
            m_TileXCount = (renderWidth + m_TilePixelWidth - 1) / m_TilePixelWidth;
            m_TileYCount = (renderHeight + m_TilePixelHeight - 1) / m_TilePixelHeight;

            m_PreTiles = DeferredShaderData.instance.GetPreTiles(m_TilerLevel, m_TileXCount * m_TileYCount);

            // Tiles have a fixed pixel size, so the tile grid may extend past the screen:
            // round the render size up to a whole number of tiles.
            int paddedWidth = Align(renderWidth, m_TilePixelWidth);
            int paddedHeight = Align(renderHeight, m_TilePixelHeight);

            // Push the right and bottom clipping planes outwards by the same ratio.
            m_FrustumPlanes = proj.decomposeProjection;
            m_FrustumPlanes.right = m_FrustumPlanes.left + (m_FrustumPlanes.right - m_FrustumPlanes.left) * (paddedWidth / (float)renderWidth);
            m_FrustumPlanes.bottom = m_FrustumPlanes.top + (m_FrustumPlanes.bottom - m_FrustumPlanes.top) * (paddedHeight / (float)renderHeight);
            m_IsOrthographic = isOrthographic;

            // Tile size in world units.
            float tileWidthWS = (m_FrustumPlanes.right - m_FrustumPlanes.left) / m_TileXCount;
            float tileHeightWS = (m_FrustumPlanes.top - m_FrustumPlanes.bottom) / m_TileYCount;

            // Camera view space is always OpenGL RH coordinates system: the near plane sits at -zNear.
            float nearZ = -m_FrustumPlanes.zNear;
            // A second depth, one unit further along the view direction, used to build the orthographic planes.
            float behindNearZ = nearZ - 1.0f;

            for (int row = 0; row < m_TileYCount; ++row)
            {
                float tileTop = m_FrustumPlanes.top - tileHeightWS * row;
                float tileBottom = tileTop - tileHeightWS;

                for (int col = 0; col < m_TileXCount; ++col)
                {
                    float tileLeft = m_FrustumPlanes.left + tileWidthWS * col;
                    float tileRight = tileLeft + tileWidthWS;

                    // Tile corners on the near plane.
                    float3 bottomLeft = new float3(tileLeft, tileBottom, nearZ);
                    float3 topLeft = new float3(tileLeft, tileTop, nearZ);
                    float3 topRight = new float3(tileRight, tileTop, nearZ);
                    float3 bottomRight = new float3(tileRight, tileBottom, nearZ);

                    PreTile preTile;
                    if (isOrthographic)
                    {
                        // Each plane is defined by a corner, the same corner one unit further in depth, and the next corner.
                        preTile.planeLeft = MakePlane(bottomLeft, new float3(tileLeft, tileBottom, behindNearZ), topLeft);
                        preTile.planeRight = MakePlane(topRight, new float3(tileRight, tileTop, behindNearZ), bottomRight);
                        preTile.planeBottom = MakePlane(bottomRight, new float3(tileRight, tileBottom, behindNearZ), bottomLeft);
                        preTile.planeTop = MakePlane(topLeft, new float3(tileLeft, tileTop, behindNearZ), topRight);
                    }
                    else
                    {
                        // In view space with perspective projection, all planes pass by (0,0,0),
                        // so two corners are enough to define each plane.
                        preTile.planeLeft = MakePlane(bottomLeft, topLeft);
                        preTile.planeRight = MakePlane(topRight, bottomRight);
                        preTile.planeBottom = MakePlane(bottomRight, bottomLeft);
                        preTile.planeTop = MakePlane(topLeft, topRight);
                    }

                    m_PreTiles[col + row * m_TileXCount] = preTile;
                }
            }
        }

        // Culls a range of lights against a rectangle of tiles and writes the resulting per-tile light lists.
        //
        // This differs from CullIntermediateLights in 3 ways:
        // - tile-frustums/light intersection use different algorithm
        // - depth range of the light shape intersecting the tile-frustums is output in the tile list header section
        // - light indices written out are indexing visible_lights, rather than the array of PrePunctualLights.
        //
        // punctualLights:  precomputed light data, indexed by the values stored in lightIndices.
        // lightIndices:    entries [lightStartIndex, lightStartIndex + lightCount) are the lights to test.
        // istart / iend:   horizontal tile range (iend is exclusive).
        // jstart / jend:   vertical tile range (jend is exclusive).
        //
        // For every tile, 4 header values are written:
        //   [0] offset of the tile's light list in m_TileData (0 when the list is empty or could not be stored)
        //   [1] number of lights in the list
        //   [2] two half floats (a in the low 16 bits, b in the high 16 bits) mapping a depth to a bit index: depth * a + b
        //   [3] union of the depth bitmasks of all lights in the list
        // The light list itself is made of the visible light indices followed by one packed depth range per light
        // (first bit in the low byte, bit count in the high byte).
        unsafe public void CullFinalLights(ref NativeArray<PrePunctualLight> punctualLights,
            ref NativeArray<ushort> lightIndices, int lightStartIndex, int lightCount,
            int istart, int iend, int jstart, int jend)
        {
            // Interestingly, 2-3% faster when using unsafe arrays.
            PrePunctualLight* _punctualLights = (PrePunctualLight*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(punctualLights);
            ushort* _lightIndices = (ushort*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(lightIndices);
            uint* _tileHeaders = (uint*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(m_TileHeaders);

            if (lightCount == 0)
            {
                // Nothing to cull: clear the headers of the requested tiles.
                for (int j = jstart; j < jend; ++j)
                    for (int i = istart; i < iend; ++i)
                    {
                        int headerOffset = GetTileHeaderOffset(i, j);
                        _tileHeaders[headerOffset + 0] = 0;
                        _tileHeaders[headerOffset + 1] = 0;
                        _tileHeaders[headerOffset + 2] = 0;
                        _tileHeaders[headerOffset + 3] = 0;
                    }
                return;
            }

            // Store culled lights in temporary buffer. Additionally store depth range of each light for a given tile too.
            // the depth range is a 32bit mask, but packed into a 16bits value since the range of the light is continuous
            // (only need to store first bit enabled, and count of enabled bits).
            ushort* tiles = stackalloc ushort[lightCount * 2];
            float2* depthRanges = stackalloc float2[lightCount];

            int maxLightPerTile = 0; // for stats
            int lightEndIndex = lightStartIndex + lightCount;
            float2 tileSize = new float2((m_FrustumPlanes.right - m_FrustumPlanes.left) / m_TileXCount, (m_FrustumPlanes.top - m_FrustumPlanes.bottom) / m_TileYCount);
            float2 tileExtents = tileSize * 0.5f;
            float2 tileExtentsInv = new float2(1.0f / tileExtents.x, 1.0f / tileExtents.y);

            for (int j = jstart; j < jend; ++j)
            {
                float tileYCentre = m_FrustumPlanes.top - (tileExtents.y + j * tileSize.y);

                for (int i = istart; i < iend; ++i)
                {
                    float tileXCentre = m_FrustumPlanes.left + tileExtents.x + i * tileSize.x;
                    // The tile centre is the same for every light tested against this tile.
                    float2 tileCentre = new float2(tileXCentre, tileYCentre);

                    int culledLightCount = 0;

                    // For the current tile's light list, min&max depth range (absolute values).
                    float listMinDepth = float.MaxValue;
                    float listMaxDepth = -float.MaxValue;

                    // Duplicate the inner loop twice. Testing for the orthographic case inside the inner loop would cost an extra 8% otherwise.
                    // Missing C++ template argument here!
                    if (!m_IsOrthographic)
                    {
                        for (int vi = lightStartIndex; vi < lightEndIndex; ++vi)
                        {
                            ushort lightIndex = _lightIndices[vi];
                            PrePunctualLight ppl = _punctualLights[lightIndex];

                            // Offset tileCentre toward the light to calculate a more conservative minMax depth bound,
                            // but it must remain inside the tile and must not pass further than the light centre.
                            float2 dir = ppl.screenPos - tileCentre;
                            float2 d = abs(dir * tileExtentsInv);

                            float sInv = 1.0f / max3(d.x, d.y, 1.0f);
                            // Perspective: the line starts at the view-space origin and goes through the offset point on the near plane.
                            float3 tileOffCentre = new float3(tileCentre.x + dir.x * sInv, tileCentre.y + dir.y * sInv, -m_FrustumPlanes.zNear);
                            float3 tileOrigin = new float3(0.0f);

                            float t0, t1;
                            // This is more expensive than Clip() but allows computing the min&max depth range for the part of the light inside the tile.
                            if (!IntersectionLineSphere(ppl.posVS, ppl.radius, tileOrigin, tileOffCentre, out t0, out t1))
                                continue;

                            listMinDepth = listMinDepth < t0 ? listMinDepth : t0;
                            listMaxDepth = listMaxDepth > t1 ? listMaxDepth : t1;
                            depthRanges[culledLightCount] = new float2(t0, t1);
                            // Because this always outputs to the finest tiles, contrary to CullIntermediateLights(),
                            // the results are indices into visibleLights, instead of indices into punctualLights.
                            tiles[culledLightCount] = ppl.visLightIndex;
                            ++culledLightCount;
                        }
                    }
                    else
                    {
                        for (int vi = lightStartIndex; vi < lightEndIndex; ++vi)
                        {
                            ushort lightIndex = _lightIndices[vi];
                            PrePunctualLight ppl = _punctualLights[lightIndex];

                            // Offset tileCentre toward the light to calculate a more conservative minMax depth bound,
                            // but it must remain inside the tile and must not pass further than the light centre.
                            float2 dir = ppl.screenPos - tileCentre;
                            float2 d = abs(dir * tileExtentsInv);

                            float sInv = 1.0f / max3(d.x, d.y, 1.0f);
                            // Orthographic: the line starts at the offset point (z = 0) and runs along the view axis.
                            float3 tileOffCentre = new float3(0, 0, -m_FrustumPlanes.zNear);
                            float3 tileOrigin = new float3(tileCentre.x + dir.x * sInv, tileCentre.y + dir.y * sInv, 0.0f);

                            float t0, t1;
                            // This is more expensive than Clip() but allows computing the min&max depth range for the part of the light inside the tile.
                            if (!IntersectionLineSphere(ppl.posVS, ppl.radius, tileOrigin, tileOffCentre, out t0, out t1))
                                continue;

                            listMinDepth = listMinDepth < t0 ? listMinDepth : t0;
                            listMaxDepth = listMaxDepth > t1 ? listMaxDepth : t1;
                            depthRanges[culledLightCount] = new float2(t0, t1);
                            // Because this always outputs to the finest tiles, contrary to CullIntermediateLights(),
                            // the results are indices into visibleLights, instead of indices into punctualLights.
                            tiles[culledLightCount] = ppl.visLightIndex;
                            ++culledLightCount;
                        }
                    }

                    // Post-multiply by zNear to get actual world unit absolute depth values, then clamp to valid depth range.
                    listMinDepth = max2(listMinDepth * m_FrustumPlanes.zNear, m_FrustumPlanes.zNear);
                    listMaxDepth = min2(listMaxDepth * m_FrustumPlanes.zNear, m_FrustumPlanes.zFar);

                    // Calculate bitmask for 2.5D culling.
                    uint bitMask = 0;
                    float depthRangeInv = 1.0f / (listMaxDepth - listMinDepth);
                    for (int culledLightIndex = 0; culledLightIndex < culledLightCount; ++culledLightIndex)
                    {
                        float lightMinDepth = max2(depthRanges[culledLightIndex].x * m_FrustumPlanes.zNear, m_FrustumPlanes.zNear);
                        float lightMaxDepth = min2(depthRanges[culledLightIndex].y * m_FrustumPlanes.zNear, m_FrustumPlanes.zFar);
                        int firstBit = (int)((lightMinDepth - listMinDepth) * 32.0f * depthRangeInv);
                        int lastBit = (int)((lightMaxDepth - listMinDepth) * 32.0f * depthRangeInv);
                        int bitCount = min(lastBit - firstBit + 1, 32 - firstBit);
                        bitMask |= (uint)((0xFFFFFFFF >> (32 - bitCount)) << firstBit);

                        // Packed depth range of this light, stored after the block of light indices.
                        tiles[culledLightCount + culledLightIndex] = (ushort)((uint)firstBit | (uint)(bitCount << 8));
                    }

                    // As listMinDepth and listMaxDepth are used to calculate the geometry 2.5D bitmask,
                    // we can optimize the shader execution (TileDepthInfo.shader) by refactoring the calculation.
                    //   int bitIndex = 32.0h * (geoDepth - listMinDepth) / (listMaxDepth - listMinDepth);
                    // Equivalent to:
                    //   a =                 32.0 / (listMaxDepth - listMinDepth)
                    //   b = -listMinDepth * 32.0 / (listMaxDepth - listMinDepth)
                    //   int bitIndex = geoDepth * a + b;
                    float a = 32.0f * depthRangeInv;
                    float b = -listMinDepth * a;

                    // AddTileData() resets tileDataSize to 0 when the tile data buffer is full.
                    int tileDataSize = culledLightCount * 2;
                    int tileOffset = culledLightCount > 0 ? AddTileData(tiles, ref tileDataSize) : 0;

                    int headerOffset = GetTileHeaderOffset(i, j);
                    _tileHeaders[headerOffset + 0] = (uint)tileOffset;
                    _tileHeaders[headerOffset + 1] = (uint)(tileDataSize == 0 ? 0 : culledLightCount);
                    _tileHeaders[headerOffset + 2] = _f32tof16(a) | (_f32tof16(b) << 16);
                    _tileHeaders[headerOffset + 3] = bitMask;

                    maxLightPerTile = max(maxLightPerTile, culledLightCount);
                }
            }

            // Publish the maximum with a compare-and-swap loop: the counters are shared between jobs,
            // so a plain read-modify-write could overwrite a larger value stored concurrently.
            int* _counters = (int*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(m_Counters);
            int observedMax = _counters[0];
            while (maxLightPerTile > observedMax)
            {
                int previous = System.Threading.Interlocked.CompareExchange(ref _counters[0], maxLightPerTile, observedMax);
                if (previous == observedMax)
                    break;
                observedMax = previous;
            }
        }

        // TODO: finer culling for spot lights
        //
        // Culls a range of lights against a rectangle of tiles using the precomputed tile planes.
        // punctualLights: precomputed light data, indexed by the values stored in lightIndices.
        // lightIndices:   entries [lightStartIndex, lightStartIndex + lightCount) are the lights to test.
        // istart / iend, jstart / jend: tile range to process (end indices are exclusive).
        // For every tile, the header receives the offset of its light list in m_TileData and its light count;
        // the list contains the surviving values of lightIndices.
        unsafe public void CullIntermediateLights(ref NativeArray<PrePunctualLight> punctualLights,
            ref NativeArray<ushort> lightIndices, int lightStartIndex, int lightCount,
            int istart, int iend, int jstart, int jend)
        {
            // Interestingly, 2-3% faster when using unsafe arrays.
            PrePunctualLight* lightPtr = (PrePunctualLight*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(punctualLights);
            ushort* indexPtr = (ushort*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(lightIndices);
            uint* headerPtr = (uint*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks(m_TileHeaders);

            if (lightCount == 0)
            {
                // No light to test: every requested tile gets an empty list.
                for (int row = jstart; row < jend; ++row)
                {
                    for (int col = istart; col < iend; ++col)
                    {
                        int emptyHeader = GetTileHeaderOffset(col, row);
                        headerPtr[emptyHeader] = 0;
                        headerPtr[emptyHeader + 1] = 0;
                    }
                }
                return;
            }

            // Scratch buffer receiving the lights that survive culling for the current tile.
            ushort* survivors = stackalloc ushort[lightCount];
            int lightEndIndex = lightStartIndex + lightCount;

            for (int row = jstart; row < jend; ++row)
            {
                for (int col = istart; col < iend; ++col)
                {
                    PreTile preTile = m_PreTiles[col + row * m_TileXCount];
                    int survivorCount = 0;

                    for (int vi = lightStartIndex; vi < lightEndIndex; ++vi)
                    {
                        ushort lightIndex = indexPtr[vi];
                        PrePunctualLight* light = lightPtr + lightIndex;

                        // This is slightly faster than IntersectionLineSphere().
                        if (Clip(ref preTile, light->posVS, light->radius))
                        {
                            survivors[survivorCount] = lightIndex;
                            ++survivorCount;
                        }
                    }

                    // Copy the culled light list. AddTileData() resets the count to 0 when the buffer is full.
                    int tileOffset = 0;
                    if (survivorCount > 0)
                        tileOffset = AddTileData(survivors, ref survivorCount);

                    int headerOffset = GetTileHeaderOffset(col, row);
                    headerPtr[headerOffset] = (uint)tileOffset;
                    headerPtr[headerOffset + 1] = (uint)survivorCount;
                }
            }
        }

        // Appends 'size' ushort values read from lightData to the shared tile data buffer.
        // Returns the offset (in ushort) at which the values were stored.
        // When the buffer is too small, nothing is copied: 'size' is reset to 0, 0 is returned,
        // and counter [2] is raised to the size that would have been needed.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        unsafe int AddTileData(ushort* lightData, ref int size)
        {
            int* _Counters = (int*)m_Counters.GetUnsafePtr();

            // Atomically reserve 'size' entries; counter [1] holds the total amount requested so far.
            int tileDataSize = System.Threading.Interlocked.Add(ref _Counters[1], size);
            int offset = tileDataSize - size;

            if (tileDataSize <= m_TileData.Length)
            {
                ushort* _TileData = (ushort*)m_TileData.GetUnsafePtr();
                UnsafeUtility.MemCpy(_TileData + offset, lightData, size * sizeof(ushort));
                return offset;
            }

            // Buffer overflow. Ignore data to add.
            // Gracefully increasing the buffer size here is possible, but it would require a critical section
            // around the reallocation and copy, which costs extra CPU time.

            // Record the required capacity with a compare-and-swap loop: counter [1] above is already updated
            // atomically, and a plain read-modify-write here could overwrite a larger value stored concurrently.
            int observedCapacity = _Counters[2];
            while (tileDataSize > observedCapacity)
            {
                int previous = System.Threading.Interlocked.CompareExchange(ref _Counters[2], tileDataSize, observedCapacity);
                if (previous == observedCapacity)
                    break;
                observedCapacity = previous;
            }

            size = 0;
            return 0;
        }

        // Computes the parametric intersections between a line and a sphere.
        // When the line crosses the sphere (strictly positive discriminant), returns true and the two parameters
        // t0 <= t1 such that the intersection points are:
        //   P0 = raySource + rayDirection * t0
        //   P1 = raySource + rayDirection * t1
        // Otherwise returns false and both parameters are set to 0 (invalid).
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        unsafe static bool IntersectionLineSphere(float3 centre, float radius, float3 raySource, float3 rayDirection, out float t0, out float t1)
        {
            // Coefficients of the quadratic A*t^2 + 2*B*t + C = 0.
            float A = dot(rayDirection, rayDirection); // always >= 0
            float B = dot(raySource - centre, rayDirection);
            float C = dot(raySource, raySource)
                + dot(centre, centre)
                - (radius * radius)
                - 2 * dot(raySource, centre);

            float discriminant = (B * B) - A * C;
            if (discriminant > 0)
            {
                float root = sqrt(discriminant);
                float invA = 1.0f / A;
                t0 = (-B - root) * invA;
                t1 = (-B + root) * invA;
                return true;
            }

            // No intersection (or a single tangent point): outputs are not meaningful.
            t0 = 0.0f;
            t1 = 0.0f;
            return false;
        }

        // Tests a sphere against the 4 side planes of a tile. Near and far planes are ignored (already tested):
        // presumably the light is already known to be visible to the camera frustum.
        // Returns true when the sphere is considered to overlap the tile.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static bool Clip(ref PreTile tile, float3 posVS, float radius)
        {
            float radiusSq = radius * radius;
            int insideCount = 0;

            // Test the planes one after the other, stopping at the first one that gives a definite answer.
            ClipResult outcome = ClipPartial(tile.planeLeft, tile.planeBottom, tile.planeTop, posVS, radius, radiusSq, ref insideCount);
            if (outcome == ClipResult.Unknown)
                outcome = ClipPartial(tile.planeRight, tile.planeBottom, tile.planeTop, posVS, radius, radiusSq, ref insideCount);
            if (outcome == ClipResult.Unknown)
                outcome = ClipPartial(tile.planeTop, tile.planeLeft, tile.planeRight, posVS, radius, radiusSq, ref insideCount);
            if (outcome == ClipResult.Unknown)
                outcome = ClipPartial(tile.planeBottom, tile.planeLeft, tile.planeRight, posVS, radius, radiusSq, ref insideCount);

            // No plane was conclusive: accept only if the centre is on the inner side of all 4 planes.
            if (outcome == ClipResult.Unknown)
                return insideCount == 4;

            return outcome == ClipResult.In;
        }

        // Tests a sphere against one plane of a tile, using the 2 adjacent side planes to reject false positives
        // (normally 4 planes, but near and far planes are ignored).
        // insideCount is incremented when the sphere centre is on the inner side of the plane.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static ClipResult ClipPartial(float4 plane, float4 sidePlaneA, float4 sidePlaneB, float3 posVS, float radius, float radiusSq, ref int insideCount)
        {
            float dist = DistanceToPlane(plane, posVS);

            // Completely outside.
            if (dist + radius <= 0.0f)
                return ClipResult.Out;

            // Centre on the inner side: consider as good as completely inside for this plane.
            if (!(dist < 0.0f))
            {
                ++insideCount;
                return ClipResult.Unknown;
            }

            // Intersection where more than half the sphere is outside: further check against the side planes.
            // Project the centre onto the plane; the sphere cuts the plane in a circle of squared radius sliceRadiusSq.
            float3 projected = posVS - plane.xyz * dist;
            float sliceRadiusSq = radiusSq - dist * dist;

            bool accepted = SignedSq(DistanceToPlane(sidePlaneA, projected)) >= -sliceRadiusSq
                && SignedSq(DistanceToPlane(sidePlaneB, projected)) >= -sliceRadiusSq;

            return accepted ? ClipResult.In : ClipResult.Unknown;
        }

        // Builds the plane passing by the origin and the two given points.
        // The result stores the unit normal in xyz and a zero distance in w.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static float4 MakePlane(float3 pb, float3 pc)
        {
            float3 normal = normalize(cross(pb, pc));

            // The planes pass all by the origin, hence w = 0.
            return new float4(normal, 0.0f);
        }

        // Builds the plane passing by the three given points.
        // The result stores the unit normal in xyz and, in w, the offset such that DistanceToPlane(plane, pa) is 0.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static float4 MakePlane(float3 pa, float3 pb, float3 pc)
        {
            float3 normal = normalize(cross(pb - pa, pc - pa));

            return new float4(normal, -dot(normal, pa));
        }

        // Signed distance from point p to the plane (xyz = normal, w = offset).
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static float DistanceToPlane(float4 plane, float3 p)
            => plane.x * p.x + plane.y * p.y + plane.z * p.z + plane.w;

        // Returns f squared, carrying the sign of f (negative inputs give a negative result).
        // Written with a comparison because Mathf.Sign(f) * (f * f) was measured to be slower.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static float SignedSq(float f)
        {
            float squared = f * f;
            return f < 0.0f ? -squared : squared;
        }

        // Minimum of two floats without any NaN handling.
        // Unity.Mathematics.min() function calls Single_IsNan() which significantly slows down the code (up to 20% of CullFinalLights())!
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static float min2(float a, float b)
        {
            if (a < b)
                return a;
            return b;
        }

        // Maximum of two floats without any NaN handling.
        // Unity.Mathematics.max() function calls Single_IsNan() which significantly slows down the code (up to 20% of CullFinalLights())!
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static float max2(float a, float b)
        {
            if (a > b)
                return a;
            return b;
        }

        // Maximum of three floats, using plain comparisons only (no NaN handling).
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static float max3(float a, float b, float c)
        {
            float largestOfAB = a > b ? a : b;
            return largestOfAB > c ? largestOfAB : c;
        }

        // Converts a 32-bit float to the bit pattern of a 16-bit half float, returned in the low 16 bits.
        // This is copy-pasted from Unity.Mathematics.math.f32tof16(), but uses the min2() function that does not check
        // for NaN (which would consume 10% of the execution time of CullFinalLights()).
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static uint _f32tof16(float x)
        {
            const int infinity_32 = 255 << 23;
            const uint msk = 0x7FFFF000u;

            uint bits = asuint(x);
            // Exponent and upper mantissa bits, without the sign.
            uint magnitude = bits & msk;

            // Rescale, clamp to signed infinity if overflowed, then round and drop the extra mantissa bits.
            uint rounded = asuint(min2(asfloat(magnitude) * 1.92592994e-34f, 260042752.0f)) + 0x1000u;
            uint h = rounded >> 13;

            // NaN->qNaN and Inf->Inf
            bool isInfOrNaN = (int)magnitude >= infinity_32;
            bool isNaN = (int)magnitude > infinity_32;
            uint special = select(0x7c00u, 0x7e00u, isNaN);
            h = select(h, special, isInfOrNaN);

            // Move the sign bit from bit 31 to bit 15 (the other masked-in bits are shifted out).
            uint sign = (bits & ~msk) >> 16;
            return h | sign;
        }

        // Rounds s up to the next multiple of alignment (s is returned unchanged when it already is a multiple).
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static int Align(int s, int alignment)
        {
            int blockCount = (s + alignment - 1) / alignment;
            return blockCount * alignment;
        }
    }
}