First Vulkan Release

This commit is contained in:
2026-05-01 01:33:53 +07:00
parent e214ce10ed
commit 05875d8aa2
8 changed files with 300 additions and 352 deletions
+208 -172
View File
@@ -2,56 +2,55 @@
#include <cmath>
#include <cstdlib>
#include <omp.h>
#include <vulkan/vulkan.h>
#include <vulkan/vulkan.hpp>
#include <iostream>
#include <vector>
#include <chrono>
#include <fstream>
#include <string.h>
// --- КОНСТРУКТОР ---
NeuralNetwork::NeuralNetwork(LayerStructure_t layers[], int count, bool useVulkanParam) {
this->numLayers = count;
this->useVulkan = useVulkanParam;
this->vulkanResourcesInitialized = false;
NeuralNetwork::NeuralNetwork(LayerStructure_t layers[], int count, bool useVulkan) : numLayers(count) {
if (useVulkan) {
vk::ApplicationInfo appInfo{"Xenith", 1, nullptr, 0, VK_API_VERSION_1_1};
instance = vk::createInstance({{}, &appInfo});
// 2. Выбор видеокарты
auto physicalDevices = instance.enumeratePhysicalDevices();
if (physicalDevices.empty()) throw std::runtime_error("GPU с поддержкой Vulkan не найдены!");
physDev = physicalDevices[0];
std::cout << "Используем GPU: " << physDev.getProperties().deviceName << std::endl;
// 3. Поиск очереди для вычислений (Compute)
auto queueProps = physDev.getQueueFamilyProperties();
int computeFamily = -1;
for (int i = 0; i < queueProps.size(); i++) {
if (queueProps[i].queueFlags & vk::QueueFlagBits::eCompute) {
computeFamily = i;
break;
if (this->useVulkan) {
try {
vk::ApplicationInfo appInfo{"Xenith", 1, nullptr, 0, VK_API_VERSION_1_1};
instance = vk::createInstance({{}, &appInfo});
auto physicalDevices = instance.enumeratePhysicalDevices();
if (physicalDevices.empty()) throw std::runtime_error("GPU не найдены");
physDev = physicalDevices[0];
auto queueProps = physDev.getQueueFamilyProperties();
int computeFamily = -1;
for (int i = 0; i < (int)queueProps.size(); i++) {
if (queueProps[i].queueFlags & vk::QueueFlagBits::eCompute) {
computeFamily = i; break;
}
}
if (computeFamily == -1) throw std::runtime_error("Compute не поддерживается");
this->computeQueueFamilyIndex = (uint32_t)computeFamily;
float priority = 1.0f;
vk::DeviceQueueCreateInfo queueInfo({}, computeQueueFamilyIndex, 1, &priority);
vk::DeviceCreateInfo deviceCreateInfo({}, 1, &queueInfo);
device = physDev.createDevice(deviceCreateInfo);
queue = device.getQueue(computeQueueFamilyIndex, 0);
vk::CommandPoolCreateInfo poolInfo({}, computeQueueFamilyIndex);
cmdPool = device.createCommandPool(poolInfo);
std::cout << "Vulkan инициализирован на: " << physDev.getProperties().deviceName << std::endl;
} catch (const std::exception& e) {
std::cerr << "Ошибка Vulkan: " << e.what() << ". Переключение на CPU." << std::endl;
this->useVulkan = false;
}
if (computeFamily == -1) throw std::runtime_error("GPU не поддерживает вычисления (Compute)");
// ВАЖНО: Сохраняем индекс в переменную класса, чтобы использовать её везде
this->computeQueueFamilyIndex = (uint32_t)computeFamily;
// 4. Создание логического устройства
float priority = 1.0f;
vk::DeviceQueueCreateInfo queueInfo({}, computeQueueFamilyIndex, 1, &priority);
vk::DeviceCreateInfo deviceCreateInfo({}, 1, &queueInfo);
device = physDev.createDevice(deviceCreateInfo);
// 5. Получаем саму очередь
queue = device.getQueue(computeQueueFamilyIndex, 0);
// 6. Создаем пул команд (теперь используем правильный индекс)
vk::CommandPoolCreateInfo poolInfo({}, computeQueueFamilyIndex);
cmdPool = device.createCommandPool(poolInfo);
}
// Инициализация CPU данных
for (int i = 0; i < count; i++) sizes.push_back(layers[i].size);
for (int i = 0; i < count - 1; i++) {
std::vector<std::vector<double>> layerW;
@@ -65,14 +64,173 @@ NeuralNetwork::NeuralNetwork(LayerStructure_t layers[], int count, bool useVulka
weights.push_back(layerW);
biases.push_back(std::vector<double>(sizes[i+1], 0.0));
}
if (this->useVulkan) {
initVulkanResources();
}
}
// --- DESTRUCTOR ---
// Releases every Vulkan object this network may own.
//
// Cleanup is driven by handle validity rather than by the `useVulkan` /
// `vulkanResourcesInitialized` flags: the constructor resets `useVulkan` to
// false when initialisation fails half-way, and initVulkanResources() may throw
// before it sets its flag. In both cases the instance/device/command pool (and
// possibly some buffers) already exist and would otherwise leak.
// Vulkan-Hpp handles default to VK_NULL_HANDLE and the vkDestroy*/vkFreeMemory
// entry points ignore null handles, so destroying never-created objects is safe.
NeuralNetwork::~NeuralNetwork() {
    if (device) {
        // A destructor must not throw; a failed wait (e.g. device lost) must
        // not prevent the remaining objects from being released.
        try {
            device.waitIdle();
        } catch (const std::exception& e) {
            std::cerr << "Ошибка Vulkan при завершении: " << e.what() << std::endl;
        }

        device.destroyPipeline(pipeline);
        device.destroyPipelineLayout(pipeLayout);
        device.destroyShaderModule(shaderModule);
        // Destroying the pool also releases the descriptor set allocated from it.
        device.destroyDescriptorPool(descriptorPool);
        device.destroyDescriptorSetLayout(dsLayout);

        device.destroyBuffer(gpuW); device.freeMemory(memW);
        device.destroyBuffer(gpuB); device.freeMemory(memB);
        device.destroyBuffer(gpuO); device.freeMemory(memO);
        device.destroyBuffer(gpuE); device.freeMemory(memE);

        device.destroyCommandPool(cmdPool);
        device.destroy();
    }
    if (instance) {
        instance.destroy();
    }
}
// --- GPU RESOURCE INITIALISATION ---
// One-time creation of everything trainVulkan() needs: four host-visible
// storage buffers (weights, biases, layer outputs, layer errors), the
// descriptor set that exposes them at bindings 0..3, and the compute pipeline
// built from "Xenith/shader.comp.spv".
//
// Does nothing when Vulkan is disabled or the resources already exist.
// Throws std::runtime_error when the shader file cannot be read or is not a
// plausible SPIR-V binary, or when no suitable memory type is found; Vulkan-Hpp
// calls may throw their own exceptions on API failure.
void NeuralNetwork::initVulkanResources() {
    if (!useVulkan || vulkanResourcesInitialized) return;

    // Element counts (in floats) of the flattened network state.
    size_t wSize = 0, bSize = 0, oSize = 0;
    for (int i = 0; i < numLayers - 1; i++) {
        wSize += (size_t)sizes[i] * sizes[i+1];
        bSize += (size_t)sizes[i+1];
    }
    for (int s : sizes) oSize += s;

    // Creates a storage buffer of `size` floats backed by host-visible,
    // host-coherent memory so the CPU can map it directly.
    auto createBuf = [&](size_t size, vk::Buffer& buf, vk::DeviceMemory& mem) {
        buf = device.createBuffer({{}, size * sizeof(float), vk::BufferUsageFlagBits::eStorageBuffer});
        vk::MemoryRequirements req = device.getBufferMemoryRequirements(buf);
        mem = device.allocateMemory({req.size, findMemoryType(req.memoryTypeBits, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent)});
        device.bindBufferMemory(buf, mem, 0);
    };
    createBuf(wSize, gpuW, memW);
    createBuf(bSize, gpuB, memB);
    createBuf(oSize, gpuO, memO);
    createBuf(oSize, gpuE, memE);  // errors use the same per-layer layout as outputs

    // Bindings 0..3 = weights, biases, outputs, errors (must match the shader).
    std::vector<vk::DescriptorSetLayoutBinding> bindings = {
        {0, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
        {1, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
        {2, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
        {3, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}
    };
    dsLayout = device.createDescriptorSetLayout({{}, (uint32_t)bindings.size(), bindings.data()});

    vk::DescriptorPoolSize poolSize(vk::DescriptorType::eStorageBuffer, 4);
    descriptorPool = device.createDescriptorPool({{}, 1, 1, &poolSize});
    descriptorSet = device.allocateDescriptorSets({descriptorPool, 1, &dsLayout})[0];

    vk::DescriptorBufferInfo bW(gpuW, 0, VK_WHOLE_SIZE), bB(gpuB, 0, VK_WHOLE_SIZE), bO(gpuO, 0, VK_WHOLE_SIZE), bE(gpuE, 0, VK_WHOLE_SIZE);
    device.updateDescriptorSets({{descriptorSet, 0, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &bW},
                                 {descriptorSet, 1, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &bB},
                                 {descriptorSet, 2, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &bO},
                                 {descriptorSet, 3, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &bE}}, {});

    auto shaderCode = readFile("Xenith/shader.comp.spv");
    // vkCreateShaderModule requires a non-zero code size that is a multiple of
    // 4 bytes; reject a truncated or empty file here instead of handing the
    // driver an invalid create-info.
    if (shaderCode.empty() || shaderCode.size() % sizeof(uint32_t) != 0) {
        throw std::runtime_error("Некорректный SPIR-V файл шейдера: размер должен быть ненулевым и кратным 4");
    }
    shaderModule = device.createShaderModule({{}, shaderCode.size(), reinterpret_cast<const uint32_t*>(shaderCode.data())});

    // Per-layer parameters are passed to the shader as push constants.
    vk::PushConstantRange pushRange(vk::ShaderStageFlagBits::eCompute, 0, sizeof(TrainParams));
    pipeLayout = device.createPipelineLayout({{}, 1, &dsLayout, 1, &pushRange});
    vk::PipelineShaderStageCreateInfo stageInfo({}, vk::ShaderStageFlagBits::eCompute, shaderModule, "main");
    pipeline = device.createComputePipeline(nullptr, {{}, stageInfo, pipeLayout}).value;

    vulkanResourcesInitialized = true;
}
// --- VULKAN TRAINING ---
// Performs one training step. The forward pass and the back-propagation of the
// error terms run on the CPU; the weight/bias update runs on the GPU (one
// compute dispatch per layer), after which the updated parameters are copied
// back into `weights` / `biases`.
//
// Falls back to train() when Vulkan is disabled.
//
// @param input   network input (fed to feedForward)
// @param target  expected output, indexed 0 .. sizes[numLayers-1]-1
// @param lr      learning rate
// @return        sum of squared output errors before the update
double NeuralNetwork::trainVulkan(const std::vector<double>& input, const std::vector<double>& target, double lr) {
    if (!useVulkan) return train(input, target, lr);
    if (!vulkanResourcesInitialized) initVulkanResources();

    // ---- CPU: forward pass and error terms (same maths as train()) ----
    std::vector<double> pred = feedForward(input);
    std::vector<std::vector<double>> errors(numLayers);
    errors[numLayers - 1].resize(sizes[numLayers - 1]);
    double totalErr = 0;
    for (int i = 0; i < sizes[numLayers - 1]; i++) {
        double e = target[i] - pred[i];
        errors[numLayers - 1][i] = e * pred[i] * (1.0 - pred[i]);
        totalErr += e * e;
    }
    for (int i = numLayers - 2; i > 0; i--) {
        errors[i].resize(sizes[i]);
        for (int j = 0; j < sizes[i]; j++) {
            double e = 0;
            for (int k = 0; k < sizes[i + 1]; k++) e += errors[i + 1][k] * weights[i][k][j];
            errors[i][j] = e * outputs[i][j] * (1.0 - outputs[i][j]);
        }
    }

    // ---- Flatten the network state into float arrays for the GPU ----
    std::vector<float> fW, fB, fO, fE;
    std::vector<uint32_t> wOff, bOff, oOff;
    for (int i = 0; i < numLayers - 1; i++) {
        wOff.push_back(static_cast<uint32_t>(fW.size()));
        for (auto& row : weights[i]) for (double v : row) fW.push_back((float)v);
        bOff.push_back(static_cast<uint32_t>(fB.size()));
        for (double v : biases[i]) fB.push_back((float)v);
    }
    for (int i = 0; i < numLayers; i++) {
        oOff.push_back(static_cast<uint32_t>(fO.size()));
        // The shader addresses the error buffer with the *output* offsets
        // (errOffset = oOff[i+1]), so layer i's errors must start at oOff[i].
        // The input layer has no error terms (errors[0] is empty); without this
        // zero padding every later layer would be shifted by sizes[0] and the
        // shader would read the wrong (or uninitialised) deltas.
        fE.resize(oOff.back(), 0.0f);
        for (double v : outputs[i]) fO.push_back((float)v);
        for (double v : errors[i]) fE.push_back((float)v);
    }

    // ---- Upload ----
    auto upload = [&](vk::DeviceMemory mem, void* data, size_t size) {
        if (size == 0) return;
        void* mapped = device.mapMemory(mem, 0, size);
        memcpy(mapped, data, size);
        device.unmapMemory(mem);
    };
    upload(memW, fW.data(), fW.size() * sizeof(float));
    upload(memB, fB.data(), fB.size() * sizeof(float));
    upload(memO, fO.data(), fO.size() * sizeof(float));
    upload(memE, fE.data(), fE.size() * sizeof(float));

    // ---- Record and run one dispatch per layer ----
    vk::CommandBufferAllocateInfo allocInfo(cmdPool, vk::CommandBufferLevel::ePrimary, 1);
    vk::CommandBuffer cmd = device.allocateCommandBuffers(allocInfo)[0];
    cmd.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
    cmd.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
    cmd.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeLayout, 0, {descriptorSet}, {});
    for (int i = 0; i < numLayers - 1; i++) {
        TrainParams p = {static_cast<uint32_t>(sizes[i]), static_cast<uint32_t>(sizes[i+1]),
                         wOff[i], bOff[i], oOff[i], oOff[i+1], static_cast<float>(lr)};
        cmd.pushConstants(pipeLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(TrainParams), &p);
        // 256 invocations per workgroup (shader local_size_x), one per neuron.
        cmd.dispatch(static_cast<uint32_t>((sizes[i+1] + 255) / 256), 1, 1);
    }
    // Waiting on the queue only orders device-side accesses; shader writes must
    // additionally be made available to the host domain before they are read
    // through a mapped pointer.
    vk::MemoryBarrier toHost(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eHostRead);
    cmd.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eHost,
                        {}, toHost, nullptr, nullptr);
    cmd.end();
    queue.submit(vk::SubmitInfo(0, nullptr, nullptr, 1, &cmd), nullptr);
    queue.waitIdle();
    device.freeCommandBuffers(cmdPool, cmd);

    // ---- Read the updated parameters back ----
    auto download = [&](vk::DeviceMemory mem, void* data, size_t size) {
        if (size == 0) return;  // mapping a zero-sized range is invalid
        void* mapped = device.mapMemory(mem, 0, size);
        memcpy(data, mapped, size);
        device.unmapMemory(mem);
    };
    download(memW, fW.data(), fW.size() * sizeof(float));
    download(memB, fB.data(), fB.size() * sizeof(float));

    size_t wi = 0, bi = 0;
    for (int i = 0; i < numLayers - 1; i++) {
        for (int j = 0; j < sizes[i+1]; j++) {
            for (int k = 0; k < sizes[i]; k++) weights[i][j][k] = fW[wi++];
            biases[i][j] = fB[bi++];
        }
    }
    return totalErr;
}
// --- ВСПОМОГАТЕЛЬНЫЕ ФУНКЦИИ ---
std::vector<double> NeuralNetwork::feedForward(const std::vector<double>& input) {
outputs.clear();
outputs.push_back(input);
std::vector<double> curr = input;
for (int i = 0; i < numLayers - 1; i++) {
std::vector<double> next;
for (int j = 0; j < sizes[i+1]; j++) {
@@ -86,170 +244,48 @@ std::vector<double> NeuralNetwork::feedForward(const std::vector<double>& input)
return curr;
}
// Performs one CPU training step (forward pass, back-propagation, gradient
// update), parallelised over neurons with OpenMP using `cpu_count` threads.
//
// The listing contained both a multi-line and a one-line form of the same
// inner loops back to back, which accumulated every hidden-layer delta twice
// and applied every weight/bias update twice. Each step is now done once.
//
// @param input   network input (fed to feedForward)
// @param target  expected output, indexed 0 .. sizes[numLayers-1]-1
// @param lr      learning rate
// @return        sum of squared output errors before the update
double NeuralNetwork::train(const std::vector<double>& input, const std::vector<double>& target, double lr) {
    omp_set_num_threads(cpu_count);

    std::vector<double> pred = feedForward(input);
    std::vector<std::vector<double>> errors(numLayers);

    // Output layer: delta = (target - pred) * sigmoid'(pred).
    errors[numLayers - 1].resize(sizes[numLayers - 1]);
    double totalErr = 0;
    for (int i = 0; i < sizes[numLayers - 1]; i++) {
        double e = target[i] - pred[i];
        errors[numLayers - 1][i] = e * pred[i] * (1.0 - pred[i]);
        totalErr += e * e;
    }

    // Hidden layers, back to front. Each j writes only errors[i][j], so the
    // iterations are independent.
    for (int i = numLayers - 2; i > 0; i--) {
        errors[i].resize(sizes[i]);
        #pragma omp parallel for
        for (int j = 0; j < sizes[i]; j++) {
            double e = 0;
            for (int k = 0; k < sizes[i + 1]; k++) e += errors[i + 1][k] * weights[i][k][j];
            errors[i][j] = e * outputs[i][j] * (1.0 - outputs[i][j]);
        }
    }

    // Gradient step. Each j touches only row j of weights[i] and biases[i][j].
    for (int i = 0; i < numLayers - 1; i++) {
        #pragma omp parallel for
        for (int j = 0; j < sizes[i + 1]; j++) {
            const double errT = lr * errors[i + 1][j];
            for (int k = 0; k < sizes[i]; k++) weights[i][j][k] += errT * outputs[i][k];
            biases[i][j] += errT;
        }
    }
    return totalErr;
}
uint32_t NeuralNetwork::findMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties) {
vk::PhysicalDeviceMemoryProperties memProperties = physDev.getMemoryProperties();
for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
if ((typeFilter & (1 << i)) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) {
return i;
}
if ((typeFilter & (1 << i)) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) return i;
}
throw std::runtime_error("Не удалось найти подходящий тип памяти!");
throw std::runtime_error("Память не найдена");
}
// Внутри класса NeuralNetwork в секции private:
// Reads the whole file `filename` in binary mode and returns its bytes.
//
// Throws std::runtime_error when the file cannot be opened, its size cannot be
// determined, or fewer bytes than expected could be read.
std::vector<char> NeuralNetwork::readFile(const std::string& filename) {
    // Open positioned at the end so tellg() yields the file size.
    std::ifstream file(filename, std::ios::ate | std::ios::binary);
    if (!file.is_open()) {
        throw std::runtime_error("Не удалось открыть файл шейдера: " + filename);
    }

    // tellg() reports failure as -1; casting that straight to size_t would
    // request an enormous allocation.
    const std::streamoff endPos = file.tellg();
    if (endPos < 0) {
        throw std::runtime_error("Не удалось определить размер файла: " + filename);
    }

    std::vector<char> buffer(static_cast<size_t>(endPos));
    file.seekg(0);
    if (!buffer.empty() && !file.read(buffer.data(), static_cast<std::streamsize>(buffer.size()))) {
        throw std::runtime_error("Не удалось прочитать файл: " + filename);
    }
    return buffer;  // the stream is closed by its destructor
}
// Stand-alone Vulkan smoke test: uploads two hard-coded floats, runs the
// compute shader "Xenith/shader.comp.spv" with a single workgroup, and returns
// the single float found in the output buffer.
//
// All Vulkan objects are created locally and destroyed before returning; the
// locals named dsLayout/pipeLayout/pipeline/shaderModule intentionally shadow
// the class members of the same name.
// NOTE(review): this overload binds only two storage buffers and no push
// constants — confirm the shader it is run against still has that interface.
double NeuralNetwork::trainVulkan() {
    // 1. Create the buffers.
    vk::Buffer inputBuffer = device.createBuffer({{}, sizeof(float) * 2, vk::BufferUsageFlagBits::eStorageBuffer});
    vk::Buffer outputBuffer = device.createBuffer({{}, sizeof(float), vk::BufferUsageFlagBits::eStorageBuffer});

    // 2. Allocate and bind memory for the INPUT buffer.
    vk::MemoryRequirements inReq = device.getBufferMemoryRequirements(inputBuffer);
    vk::DeviceMemory inputMemory = device.allocateMemory({
        inReq.size,
        findMemoryType(inReq.memoryTypeBits, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent)
    });
    device.bindBufferMemory(inputBuffer, inputMemory, 0); // required: a buffer is unusable until memory is bound

    // 3. Copy the data into the input buffer.
    float inputData[2] = {2.51f, 2.32f};
    void* pIn = device.mapMemory(inputMemory, 0, sizeof(float) * 2);
    memcpy(pIn, inputData, sizeof(float) * 2);
    device.unmapMemory(inputMemory);

    // 4. Allocate and bind memory for the OUTPUT buffer.
    vk::MemoryRequirements outReq = device.getBufferMemoryRequirements(outputBuffer);
    vk::DeviceMemory outputMemory = device.allocateMemory({
        outReq.size,
        findMemoryType(outReq.memoryTypeBits, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent)
    });
    device.bindBufferMemory(outputBuffer, outputMemory, 0);

    // 5. DESCRIPTORS (C++ -> shader link): two slots, binding 0 and binding 1.
    std::vector<vk::DescriptorSetLayoutBinding> bindings = {
        {0, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
        {1, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}
    };
    vk::DescriptorSetLayout dsLayout = device.createDescriptorSetLayout({{}, (uint32_t)bindings.size(), bindings.data()});

    // Create the pool and allocate the descriptor set.
    vk::DescriptorPoolSize poolSize{vk::DescriptorType::eStorageBuffer, 2};
    vk::DescriptorPool pool = device.createDescriptorPool({{}, 1, 1, &poolSize});
    vk::DescriptorSet ds = device.allocateDescriptorSets({pool, 1, &dsLayout})[0];

    // Say which buffer goes into which slot.
    vk::DescriptorBufferInfo bInInfo{inputBuffer, 0, VK_WHOLE_SIZE};
    vk::DescriptorBufferInfo bOutInfo{outputBuffer, 0, VK_WHOLE_SIZE};
    device.updateDescriptorSets({
        {ds, 0, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &bInInfo},
        {ds, 1, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &bOutInfo}
    }, {});

    // 6. PIPELINE (load the shader).
    auto shaderCode = readFile("Xenith/shader.comp.spv");
    vk::ShaderModule shaderModule = device.createShaderModule({{}, shaderCode.size(), (uint32_t*)shaderCode.data()});
    vk::PipelineLayout pipeLayout = device.createPipelineLayout({{}, 1, &dsLayout});
    vk::ComputePipelineCreateInfo pipeInfo{{}, {{}, vk::ShaderStageFlagBits::eCompute, shaderModule, "main"}, pipeLayout};
    vk::Pipeline pipeline = device.createComputePipeline(nullptr, pipeInfo).value;

    // 7. COMMANDS AND LAUNCH (cmdPool and queue are created in the constructor).
    vk::CommandBufferAllocateInfo cmdAllocInfo(cmdPool, vk::CommandBufferLevel::ePrimary, 1);
    vk::CommandBuffer cmd = device.allocateCommandBuffers(cmdAllocInfo)[0];
    cmd.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
    cmd.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
    cmd.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeLayout, 0, {ds}, {});
    cmd.dispatch(1, 1, 1); // a single workgroup
    cmd.end();
    queue.submit(vk::SubmitInfo(0, nullptr, nullptr, 1, &cmd), nullptr);
    queue.waitIdle();
    // Return the command buffer to the pool; previously one command buffer
    // leaked on every call because it was never freed.
    device.freeCommandBuffers(cmdPool, cmd);

    // 8. FETCH THE RESULT.
    float result = 0;
    void* pOut = device.mapMemory(outputMemory, 0, sizeof(float));
    memcpy(&result, pOut, sizeof(float));
    device.unmapMemory(outputMemory);

    // Cleanup of the objects created above.
    device.destroyPipeline(pipeline);
    device.destroyPipelineLayout(pipeLayout);
    device.destroyShaderModule(shaderModule);
    device.destroyDescriptorPool(pool);
    device.destroyDescriptorSetLayout(dsLayout);
    device.destroyBuffer(inputBuffer); device.freeMemory(inputMemory);
    device.destroyBuffer(outputBuffer); device.freeMemory(outputMemory);
    return (double)result;
}
// Empty destructor: releases nothing.
// NOTE(review): another definition of ~NeuralNetwork() with real cleanup
// appears earlier in this listing; presumably this empty one is the
// pre-change version — confirm only one definition remains in the file.
NeuralNetwork::~NeuralNetwork() {
// Destruction of vkInstance, vkDevice and the other Vulkan objects is to be
// added here later, so that no GPU memory is leaked.
}
}
+25 -2
View File
@@ -36,6 +36,30 @@ private:
// Logistic activation: maps any real x into the open interval (0, 1).
double sigmoid(double x) {
    const double denominator = 1.0 + exp(-x);
    return 1.0 / denominator;
}
// Derivative of the logistic function expressed through its *output*:
// x is expected to be an already-activated value, i.e. sigmoid(z).
double sigmoidDeriv(double x) {
    const double complement = 1.0 - x;
    return x * complement;
}
// Per-layer parameters handed to the compute shader as push constants.
// Field order and types must mirror the shader's `Params` push_constant block
// exactly: six uints followed by one float.
struct TrainParams {
    uint32_t prevLayerSize;  // neuron count of the layer feeding this one
    uint32_t nextLayerSize;  // neuron count of the layer being updated
    uint32_t weightOffset;   // start of this layer's weights in the weight buffer (in floats)
    uint32_t biasOffset;     // start of this layer's biases in the bias buffer (in floats)
    uint32_t outOffset;      // start of the previous layer's outputs in the output buffer
    uint32_t errOffset;      // start of this layer's error terms in the error buffer
    float lr;                // learning rate
};
// The struct is copied byte-for-byte into the push-constant range, so any
// padding or field-size change would silently desynchronise host and shader.
static_assert(sizeof(TrainParams) == 6 * sizeof(uint32_t) + sizeof(float),
              "TrainParams must be tightly packed to match the shader push-constant block");
vk::Buffer gpuW, gpuB, gpuO, gpuE;
vk::DeviceMemory memW, memB, memO, memE;
vk::DescriptorPool descriptorPool;
vk::DescriptorSet descriptorSet;
vk::DescriptorSetLayout dsLayout;
vk::PipelineLayout pipeLayout;
vk::Pipeline pipeline;
vk::ShaderModule shaderModule;
bool vulkanResourcesInitialized = false;
void initVulkanResources(); // Метод для разовой инициализации
void cleanupVulkanResources();
public:
int cpu_count = 1;
@@ -49,8 +73,7 @@ public:
std::vector<double> feedForward(const std::vector<double>& input);
double train(const std::vector<double>& input, const std::vector<double>& target, double lr);
// Наш тест Vulkan
double trainVulkan();
double trainVulkan(const std::vector<double>& input, const std::vector<double>& target, double lr);
};
#endif
+30 -10
View File
@@ -1,16 +1,36 @@
#version 450

// Weight/bias update kernel: one invocation per neuron of the layer being
// updated, 256 invocations per workgroup.
layout(local_size_x = 256) in;

// Flattened network state; per-layer start offsets arrive as push constants.
layout(std430, binding = 0) buffer WeightBuffer { float weights[]; };
layout(std430, binding = 1) buffer BiasBuffer { float biases[]; };
layout(std430, binding = 2) buffer OutputBuffer { float outputs[]; };
layout(std430, binding = 3) buffer ErrorBuffer { float errors[]; };

// Must stay in sync with the host-side TrainParams struct.
layout(push_constant) uniform Params {
    uint prevLayerSize;
    uint nextLayerSize;
    uint weightOffset;
    uint biasOffset;
    uint outOffset;
    uint errOffset;
    float lr;
} p;

void main() {
    uint j = gl_GlobalInvocationID.x; // index of the neuron in the next layer

    // The dispatch is rounded up to whole workgroups; surplus invocations idle.
    if (j < p.nextLayerSize) {
        float errorTerm = p.lr * errors[p.errOffset + j];

        // Update the weights feeding into this neuron (row j of the layer).
        for (uint k = 0; k < p.prevLayerSize; k++) {
            uint wIdx = p.weightOffset + (j * p.prevLayerSize + k);
            uint outIdx = p.outOffset + k;
            weights[wIdx] += errorTerm * outputs[outIdx];
        }

        // Update this neuron's bias.
        biases[p.biasOffset + j] += errorTerm;
    }
}
Binary file not shown.