diff --git a/modules/meshlet/CMakeLists.txt b/modules/meshlet/CMakeLists.txt
index 385115de4ab3f6dc7c540fd77e10ddec0ee458b8..d576466d3d125d3a19640088a9b5725ac7a46b97 100644
--- a/modules/meshlet/CMakeLists.txt
+++ b/modules/meshlet/CMakeLists.txt
@@ -12,9 +12,12 @@ set(vkcv_meshlet_include ${PROJECT_SOURCE_DIR}/include)
 set(vkcv_meshlet_sources
 		${vkcv_meshlet_include}/vkcv/meshlet/Meshlet.hpp
 		${vkcv_meshlet_source}/vkcv/meshlet/Meshlet.cpp
+
 		${vkcv_meshlet_include}/vkcv/meshlet/Tipsify.hpp
 		${vkcv_meshlet_source}/vkcv/meshlet/Tipsify.cpp
-)
+
+		${vkcv_meshlet_include}/vkcv/meshlet/Forsyth.hpp
+		${vkcv_meshlet_source}/vkcv/meshlet/Forsyth.cpp)
 
 # adding source files to the module
 add_library(vkcv_meshlet STATIC ${vkcv_meshlet_sources})
diff --git a/modules/meshlet/include/vkcv/meshlet/Forsyth.hpp b/modules/meshlet/include/vkcv/meshlet/Forsyth.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f781b81ee551d5528b1b75b71b8f2e9518c013d1
--- /dev/null
+++ b/modules/meshlet/include/vkcv/meshlet/Forsyth.hpp
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "Meshlet.hpp"
+
+namespace vkcv::meshlet
+{
+    std::vector<uint32_t> forsythReorder(const std::vector<uint32_t> &idxBuf, const size_t vertexCount);
+}
diff --git a/modules/meshlet/src/vkcv/meshlet/Forsyth.cpp b/modules/meshlet/src/vkcv/meshlet/Forsyth.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f0c8e3502a76d71b920efbc45a3a4ce676f68ea1
--- /dev/null
+++ b/modules/meshlet/src/vkcv/meshlet/Forsyth.cpp
@@ -0,0 +1,283 @@
+#include "vkcv/meshlet/Forsyth.hpp"
+#include <vkcv/Logger.hpp>
+#include <array>
+#include <cmath>
+#include <iostream>
+
+namespace vkcv::meshlet
+{
+    // set these to adjust performance and result quality
+    const size_t VERTEX_CACHE_SIZE = 8;
+    const size_t CACHE_FUNCTION_LENGTH = 32;
+
+    // score function constants
+    const float CACHE_DECAY_POWER = 1.5f;
+    const float LAST_TRI_SCORE = 0.75f;
+
+    const float VALENCE_BOOST_SCALE = 2.0f;
+    const float VALENCE_BOOST_POWER = 0.5f;
+
+    // sizes for precalculated tables
+    // make sure that cache score is always >= vertex_cache_size
+    const size_t CACHE_SCORE_TABLE_SIZE = 32;
+    const size_t VALENCE_SCORE_TABLE_SIZE = 32;
+
+    // precalculated tables
+    std::array<float, CACHE_SCORE_TABLE_SIZE> cachePositionScore = {};
+    std::array<float, VALENCE_SCORE_TABLE_SIZE> valenceScore = {};
+
+    void initScoreTables()
+    {
+        for(size_t i = 0; i < CACHE_SCORE_TABLE_SIZE; i++)
+        {
+            float score = 0.0f;
+            if (i < 3)
+            {
+                score = LAST_TRI_SCORE;
+            }
+            else
+            {
+                const float scaler = 1.0f / static_cast<float>(CACHE_FUNCTION_LENGTH - 3);
+                score = 1.0f - (i - 3) * scaler;
+                score = std::powf(score, CACHE_DECAY_POWER);
+            }
+            cachePositionScore[i] = score;
+        }
+
+        for(size_t i = 0; i < VALENCE_SCORE_TABLE_SIZE; i++)
+        {
+            const float valenceBoost = std::powf(i, -VALENCE_BOOST_POWER);
+            const float score = VALENCE_BOOST_SCALE * valenceBoost;
+
+            valenceScore[i] = score;
+        }
+    }
+
+    float findVertexScore(uint32_t numActiveTris, int32_t cachePos)
+    {
+        if(numActiveTris == 0)
+            return 0.0f;
+
+        float score = 0.0f;
+
+        if (cachePos >= 0)
+            score = cachePositionScore[cachePos];
+
+        if (numActiveTris < VALENCE_SCORE_TABLE_SIZE)
+            score += valenceScore[numActiveTris];
+
+        return score;
+    }
+
+    std::vector<uint32_t> forsythReorder(const std::vector<uint32_t> &idxBuf, const size_t vertexCount)
+    {
+        initScoreTables();
+        /**
+        std::cout << "CACHE POSITION SCORES:" << std::endl;
+        for(const auto element : cachePositionScore)
+            std::cout << element << std::endl;
+
+        std::cout << "VALENCE SCORES:" << std::endl;
+        for(const auto element : valenceScore)
+            std::cout << element << std::endl;
+        **/
+        const size_t triangleCount = idxBuf.size() / 3;
+
+        // per-vertex active triangle count
+        std::vector<uint8_t> numActiveTris(vertexCount, 0);
+        // iterate over indices, count total occurrences of each vertex
+        for(const auto index : idxBuf)
+        {
+            if(numActiveTris[index] == UINT8_MAX)
+            {
+                vkcv_log(LogLevel::ERROR, "Unsupported mesh.");
+                vkcv_log(LogLevel::ERROR, "Vertex shared by too many triangles.");
+                return {};
+            }
+
+            numActiveTris[index]++;
+        }
+
+
+        // allocate remaining vectors
+        std::vector<uint32_t> offsets(vertexCount, 0);
+        std::vector<float> lastScore(vertexCount, 0.0f);
+        std::vector<int8_t> cacheTag(vertexCount, -1);
+
+        std::vector<bool> triangleAdded(triangleCount, false);
+        std::vector<float> triangleScore(triangleCount, 0.0f);
+
+        std::vector<int32_t> triangleIndices(idxBuf.size(), 0);
+
+
+        // count the triangle array offset for each vertex, initialize the rest of the data.
+        // ??????????????????????????
+        uint32_t sum = 0;
+        for(size_t i = 0; i < vertexCount; i++)
+        {
+            offsets[i] = sum;
+            sum += numActiveTris[i];
+            numActiveTris[i] = 0;
+        }
+
+        // fill the vertex data structures with indices to the triangles using each vertex
+        // ??????????????????????????
+        for(size_t i = 0; i < triangleCount; i++)
+        {
+            for(size_t j = 0; j < 3; j++)
+            {
+                uint32_t v = idxBuf[3 * i + j];
+                triangleIndices[offsets[v] + numActiveTris[v]] = static_cast<int32_t>(i);
+                numActiveTris[v]++;
+            }
+        }
+
+        // init score for all vertices
+        // ??????????????????????????
+        for (size_t i = 0; i < vertexCount; i++)
+        {
+            lastScore[i] = findVertexScore(numActiveTris[i], static_cast<int32_t>(cacheTag[i]));
+
+            for(size_t j = 0; j < numActiveTris[i]; j++)
+            {
+                triangleScore[triangleIndices[offsets[i] + j]] += lastScore[i];
+            }
+        }
+
+        // find best triangle
+        int32_t bestTriangle = -1;
+        float   bestScore    = -1.0f;
+        for(size_t i = 0; i < triangleCount; i++)
+        {
+            if(triangleScore[i] > bestScore)
+            {
+                bestScore = triangleScore[i];
+                bestTriangle = static_cast<int32_t>(i);
+            }
+        }
+
+        // allocate output triangles
+        std::vector<int32_t> outTriangles(triangleCount, 0);
+        uint32_t outPos = 0;
+
+        // init cache (with -1)
+        std::array<int32_t, VERTEX_CACHE_SIZE + 3> cache = {};
+        for(auto &element : cache)
+        {
+            element = -1;
+        }
+
+        uint32_t scanPos = 0;
+
+        while(bestTriangle >= 0)
+        {
+            // mark triangle as added
+            triangleAdded[bestTriangle] = true;
+            // output triangle
+            outTriangles[outPos++] = bestTriangle;
+
+            for(size_t i = 0; i < 3; i++)
+            {
+                uint32_t v = idxBuf[3 * bestTriangle + i];
+
+                int8_t endPos = cacheTag[v];
+                if(endPos < 0)
+                    endPos = static_cast<int8_t>(VERTEX_CACHE_SIZE + i);
+
+                for(int8_t j = endPos; j > i; j--)
+                {
+                    cache[j] = cache[j - 1];
+
+                    if (cache[j] >= 0)
+                        cacheTag[cache[j]]++;
+                }
+
+                cache[i] = static_cast<int32_t>(v);
+                cacheTag[v] = static_cast<int8_t>(i);
+
+
+                for (size_t j = 0; j < numActiveTris[v]; j++)
+                {
+                    if(triangleIndices[offsets[v] + j] == bestTriangle)
+                    {
+                        triangleIndices[offsets[v] + j] = triangleIndices[offsets[v] + numActiveTris[v] - 1];
+                        break;
+                    }
+                }
+                numActiveTris[v]--;
+            }
+
+            // update scores of all triangles in cache
+            for (size_t i = 0; i < cache.size(); i++)
+            {
+                int32_t v = cache[i];
+                if (v < 0)
+                    break;
+
+                if(i >= VERTEX_CACHE_SIZE)
+                {
+                    cacheTag[v] = -1;
+                    cache[i] = -1;
+                }
+
+                float newScore = findVertexScore(numActiveTris[v], cacheTag[v]);
+                float diff = newScore - lastScore[v];
+
+                for(size_t j = 0; j < numActiveTris[v]; j++)
+                {
+                    triangleScore[triangleIndices[offsets[v] + j]] += diff;
+                }
+                lastScore[v] = newScore;
+            }
+
+            // find best triangle reference by vertices in cache
+            bestTriangle = -1;
+            bestScore = -1.0f;
+            for(size_t i = 0; i < VERTEX_CACHE_SIZE; i++)
+            {
+                if (cache[i] < 0)
+                    break;
+
+                int32_t v = cache[i];
+                for(size_t j = 0; j < numActiveTris[v]; j++)
+                {
+                    int32_t t = triangleIndices[offsets[v] + j];
+                    if(triangleScore[t] > bestScore)
+                    {
+                        bestTriangle = t;
+                        bestScore = triangleScore[t];
+                    }
+                }
+            }
+
+            // if no triangle was found at all, continue scanning whole list of triangles
+            if (bestTriangle < 0)
+            {
+                for(; scanPos < triangleCount; scanPos++)
+                {
+                    if(!triangleAdded[scanPos])
+                    {
+                        bestTriangle = scanPos;
+                        break;
+                    }
+                }
+            }
+        }
+
+
+        // convert triangle index array into full triangle list
+        std::vector<uint32_t> outIndices(idxBuf.size(), 0);
+        outPos = 0;
+        for(size_t i = 0; i < triangleCount; i++)
+        {
+            int32_t t = outTriangles[i];
+            for(size_t j = 0; j < 3; j++)
+            {
+                int32_t v = idxBuf[3 * t + j];
+                outIndices[outPos++] = static_cast<uint32_t>(v);
+            }
+        }
+
+        return outIndices;
+    }
+}
\ No newline at end of file
diff --git a/projects/mesh_shader/src/main.cpp b/projects/mesh_shader/src/main.cpp
index 99529b5305b6979ced4f113ab18c0cc79a53a4e4..7d3c785e1ace0c4bf866818744ab9417d0ab8cdd 100644
--- a/projects/mesh_shader/src/main.cpp
+++ b/projects/mesh_shader/src/main.cpp
@@ -9,6 +9,7 @@
 #include <vkcv/asset/asset_loader.hpp>
 #include <vkcv/meshlet/Meshlet.hpp>
 #include <vkcv/meshlet/Tipsify.hpp>
+#include <vkcv/meshlet/Forsyth.hpp>
 
 struct Plane {
 	glm::vec3 pointOnPlane;
@@ -139,7 +140,8 @@ int main(int argc, const char** argv) {
 	// mesh shader buffers
 	const auto& assetLoaderIndexBuffer              = mesh.vertexGroups[0].indexBuffer;
 	std::vector<uint32_t> indexBuffer32Bit          = vkcv::meshlet::assetLoaderIndicesTo32BitIndices(assetLoaderIndexBuffer.data, assetLoaderIndexBuffer.type);
-    std::vector<uint32_t> reorderedIndexBuffer32Bit = vkcv::meshlet::tipsifyMesh(indexBuffer32Bit, interleavedVertices.size());
+    //std::vector<uint32_t> reorderedIndexBuffer32Bit = vkcv::meshlet::tipsifyMesh(indexBuffer32Bit, interleavedVertices.size());
+    std::vector<uint32_t> reorderedIndexBuffer32Bit = vkcv::meshlet::forsythReorder(indexBuffer32Bit, interleavedVertices.size());
 
     const auto meshShaderModelData = createMeshShaderModelData(interleavedVertices, reorderedIndexBuffer32Bit);