Files
BiPy/Xenith/core.cpp
T
2026-05-03 21:02:34 +07:00

251 lines
13 KiB
C++

#include "core.hpp"
#include "token/token.hpp"
#include <string.h>
#include <chrono>
#include <cmath>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <omp.h>
/**
 * Builds the host-side network state from a list of layer descriptors.
 *
 * Every layer contributes its size to `sizes` and a start offset into the
 * flat activation array (`oOff`). Every pair of adjacent layers contributes
 * a weight block and a bias block, with their start offsets stored in
 * `wOff` / `bOff`. Weights are drawn from rand() in [-1, 1] and scaled by
 * sqrt(2 / size-of-the-source-layer); biases start at zero.
 *
 * @param layers          Array of layer descriptors; only `.size` is read here.
 * @param count           Number of entries in `layers`.
 * @param useVulkanParam  Request the Vulkan backend. initVulkan() clears
 *                        `useVulkan` on failure, in which case no GPU
 *                        resources are created.
 */
NeuralNetwork::NeuralNetwork(LayerStructure_t layers[], int count, bool useVulkanParam) {
    numLayers = count;
    useVulkan = useVulkanParam;

    uint32_t weightCursor = 0;
    uint32_t biasCursor = 0;
    uint32_t outputCursor = 0;

    // Pass 1: record layer widths and where each layer's activations begin.
    for (int layer = 0; layer < count; ++layer) {
        sizes.push_back(layers[layer].size);
        oOff.push_back(outputCursor);
        outputCursor += layers[layer].size;
    }

    // Pass 2: lay out and initialise the parameters between consecutive layers.
    for (int layer = 0; layer < count - 1; ++layer) {
        wOff.push_back(weightCursor);
        bOff.push_back(biasCursor);

        int wCount = sizes[layer] * sizes[layer + 1];
        float scale = sqrt(2.0f / sizes[layer]);

        int filled = 0;
        while (filled < wCount) {
            h_weights.push_back(((float)rand() / RAND_MAX * 2.0f - 1.0f) * scale);
            ++filled;
        }
        for (int b = 0; b < sizes[layer + 1]; ++b) {
            h_biases.push_back(0.0f);
        }

        weightCursor += wCount;
        biasCursor += sizes[layer + 1];
    }

    // One activation slot and one error slot per neuron across all layers.
    h_outputs.resize(outputCursor, 0.0f);
    h_errors.resize(outputCursor, 0.0f);

    if (!useVulkan) {
        return;
    }
    initVulkan();
    if (!useVulkan) {
        // initVulkan() reported failure by clearing the flag.
        return;
    }
    initVulkanResources();
    syncToGPU();
}
/**
 * Creates the Vulkan instance, selects the first physical device, and opens
 * a logical device with one compute-capable queue plus a command pool.
 *
 * Never throws: on any failure the partially created device/instance are
 * destroyed and `useVulkan` is set to false so the caller can detect it.
 */
void NeuralNetwork::initVulkan() {
    try {
        vk::ApplicationInfo app{"Xenith", 1, nullptr, 0, VK_API_VERSION_1_1};
        instance = vk::createInstance({{}, &app});

        auto pdevs = instance.enumeratePhysicalDevices();
        if (pdevs.empty()) throw std::runtime_error("GPU not found");
        physDev = pdevs[0];

        // Pick the first queue family that supports compute work.
        auto props = physDev.getQueueFamilyProperties();
        bool foundCompute = false;
        computeQueueFamilyIndex = -1;
        for (uint32_t i = 0; i < props.size(); i++) {
            if (props[i].queueFlags & vk::QueueFlagBits::eCompute) {
                computeQueueFamilyIndex = i;
                foundCompute = true;
                break;
            }
        }
        // Without this check the -1 sentinel would be cast to a huge family index.
        if (!foundCompute) throw std::runtime_error("No compute queue family found");

        // The priority array only has to outlive createDevice(), so a local
        // is enough (the previous heap allocation was never freed).
        const float queuePriority = 1.0f;
        vk::DeviceQueueCreateInfo qinfo({}, static_cast<uint32_t>(computeQueueFamilyIndex), 1, &queuePriority);
        device = physDev.createDevice({{}, 1, &qinfo});
        queue = device.getQueue(computeQueueFamilyIndex, 0);
        cmdPool = device.createCommandPool({{}, static_cast<uint32_t>(computeQueueFamilyIndex)});
    } catch (...) {
        // Release whatever was created before the failure; the destructor
        // skips all Vulkan cleanup once useVulkan is false.
        if (device) {
            device.destroy();
            device = nullptr;
        }
        if (instance) {
            instance.destroy();
            instance = nullptr;
        }
        useVulkan = false;
    }
}
void NeuralNetwork::initVulkanResources() {
auto createBuf = [&](size_t sz, vk::Buffer& b, vk::DeviceMemory& m, void** ptr) {
b = device.createBuffer({{}, sz * sizeof(float), vk::BufferUsageFlagBits::eStorageBuffer});
auto req = device.getBufferMemoryRequirements(b);
m = device.allocateMemory({req.size, findMemoryType(req.memoryTypeBits, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent)});
device.bindBufferMemory(b, m, 0);
*ptr = device.mapMemory(m, 0, sz * sizeof(float));
};
createBuf(h_weights.size(), gpuW, memW, &pW);
createBuf(h_biases.size(), gpuB, memB, &pB);
createBuf(h_outputs.size(), gpuO, memO, &pO);
createBuf(h_errors.size(), gpuE, memE, &pE);
createBuf(sizes.back(), gpuT, memT, &pT);
std::vector<vk::DescriptorSetLayoutBinding> binds = {
{0, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
{1, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
{2, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
{3, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
{4, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}
};
dsLayout = device.createDescriptorSetLayout({{}, (uint32_t)binds.size(), binds.data()});
vk::DescriptorPoolSize ps(vk::DescriptorType::eStorageBuffer, 5);
descriptorPool = device.createDescriptorPool({{}, 1, 1, &ps});
descriptorSet = device.allocateDescriptorSets({descriptorPool, 1, &dsLayout})[0];
vk::DescriptorBufferInfo bW(gpuW, 0, VK_WHOLE_SIZE), bB(gpuB, 0, VK_WHOLE_SIZE), bO(gpuO, 0, VK_WHOLE_SIZE), bE(gpuE, 0, VK_WHOLE_SIZE), bT(gpuT, 0, VK_WHOLE_SIZE);
device.updateDescriptorSets({
{descriptorSet, 0, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &bW},
{descriptorSet, 1, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &bB},
{descriptorSet, 2, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &bO},
{descriptorSet, 3, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &bE},
{descriptorSet, 4, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &bT}
}, {});
auto code = readFile("Xenith/shader.comp.spv");
shaderModule = device.createShaderModule({{}, code.size(), (uint32_t*)code.data()});
vk::PushConstantRange pr(vk::ShaderStageFlagBits::eCompute, 0, sizeof(TrainParams));
pipeLayout = device.createPipelineLayout({{}, 1, &dsLayout, 1, &pr});
pipeline = device.createComputePipeline(nullptr, {{}, {{}, vk::ShaderStageFlagBits::eCompute, shaderModule, "main"}, pipeLayout}).value;
}
/**
 * Runs one training step on the GPU and returns the mean squared error of
 * the output layer against `target`.
 *
 * The input is written to the start of the mapped output buffer (the first
 * layer's slots) and the target to the mapped target buffer. Both copies are
 * bounded by the layer size: surplus elements are ignored and missing ones
 * are treated as 0, so a mis-sized vector can neither overrun the mapped
 * memory nor leave stale values from a previous call.
 *
 * A single command buffer then records four groups of dispatches, selected
 * by the first TrainParams field:
 *   0 - one dispatch per layer pair, front to back (the same pass
 *       feedForward() uses to produce outputs);
 *   1 - one dispatch over the output layer;
 *   2 - one dispatch per hidden layer, back to front;
 *   3 - one dispatch per layer pair.
 * NOTE(review): modes 1-3 are presumably output error, hidden-error
 * back-propagation and parameter update; the shader source is not visible
 * here - confirm against shader.comp.
 *
 * @param input   Input activations (read as doubles, stored as floats).
 * @param target  Desired output activations.
 * @param lr      Learning rate passed to every dispatch.
 * @return Sum of squared differences over the output layer divided by the
 *         output layer size. When Vulkan is disabled, whatever
 *         runTrainCPU() returns.
 */
double NeuralNetwork::train(const std::vector<double>& input, const std::vector<double>& target, double lr) {
    if (!useVulkan) return runTrainCPU(input, target, lr);

    const size_t inCount = static_cast<size_t>(sizes.front());
    const size_t outCount = static_cast<size_t>(sizes.back());

    // Bounded uploads: never write past the layer's slots, zero-pad short vectors.
    float* fIn = static_cast<float*>(pO);
    for (size_t i = 0; i < inCount; i++) {
        fIn[i] = i < input.size() ? static_cast<float>(input[i]) : 0.0f;
    }
    float* fTar = static_cast<float*>(pT);
    for (size_t i = 0; i < outCount; i++) {
        fTar[i] = i < target.size() ? static_cast<float>(target[i]) : 0.0f;
    }

    vk::CommandBufferAllocateInfo ai(cmdPool, vk::CommandBufferLevel::ePrimary, 1);
    vk::CommandBuffer cmd = device.allocateCommandBuffers(ai)[0];
    cmd.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
    cmd.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
    cmd.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeLayout, 0, {descriptorSet}, {});

    // Each stage reads what the previous one wrote, so dispatches are
    // separated by a compute-to-compute memory barrier.
    vk::MemoryBarrier barrier(vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eShaderRead);

    // Mode 0: layer by layer, front to back; one invocation per destination neuron.
    for (int i = 0; i < numLayers - 1; i++) {
        TrainParams p = {0, static_cast<uint32_t>(sizes[i]), static_cast<uint32_t>(sizes[i+1]), wOff[i], bOff[i], oOff[i], oOff[i+1], static_cast<float>(lr)};
        cmd.pushConstants(pipeLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(TrainParams), &p);
        cmd.dispatch((sizes[i+1] + 255) / 256, 1, 1);
        cmd.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, {}, {barrier}, {}, {});
    }

    // Mode 1: single dispatch over the output layer.
    {
        TrainParams p = {1, 0, static_cast<uint32_t>(sizes.back()), 0, 0, 0, oOff.back(), static_cast<float>(lr)};
        cmd.pushConstants(pipeLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(TrainParams), &p);
        cmd.dispatch((sizes.back() + 255) / 256, 1, 1);
        cmd.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, {}, {barrier}, {}, {});
    }

    // Mode 2: hidden layers, back to front; one invocation per neuron of layer i.
    for (int i = numLayers - 2; i > 0; i--) {
        TrainParams p = {2, static_cast<uint32_t>(sizes[i]), static_cast<uint32_t>(sizes[i+1]), wOff[i], bOff[i], oOff[i], oOff[i+1], static_cast<float>(lr)};
        cmd.pushConstants(pipeLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(TrainParams), &p);
        cmd.dispatch((sizes[i] + 255) / 256, 1, 1);
        cmd.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, {}, {barrier}, {}, {});
    }

    // Mode 3: one dispatch per layer pair; no barrier is recorded between these.
    for (int i = 0; i < numLayers - 1; i++) {
        TrainParams p = {3, static_cast<uint32_t>(sizes[i]), static_cast<uint32_t>(sizes[i+1]), wOff[i], bOff[i], oOff[i], oOff[i+1], static_cast<float>(lr)};
        cmd.pushConstants(pipeLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(TrainParams), &p);
        cmd.dispatch((sizes[i+1] + 255) / 256, 1, 1);
    }

    cmd.end();
    queue.submit(vk::SubmitInfo(0, nullptr, nullptr, 1, &cmd), nullptr);
    queue.waitIdle();  // block until the GPU is done so the mapped outputs can be read
    device.freeCommandBuffers(cmdPool, cmd);

    // MSE over the output layer, read straight from the mapped output buffer.
    double mse = 0;
    const float* out = static_cast<const float*>(pO) + oOff.back();
    for (size_t i = 0; i < outCount; i++) {
        const double expected = i < target.size() ? target[i] : 0.0;
        const double d = expected - static_cast<double>(out[i]);
        mse += d * d;
    }
    return mse / sizes.back();
}
/**
 * Trains the network to predict each token of `dataset` from the tokens
 * that precede it.
 *
 * The text is tokenised once. For every epoch a sliding context of at most
 * MAX_CONTEXT tokens is maintained; for each token that has a non-empty
 * context, `buildInput(context, emb)` produces the input vector and the
 * target is a one-hot vector of length MAX_VOCAB. The token produced by
 * tokenising "[CLR]" (if any) resets the context and is never trained on.
 * Tokens whose id falls outside [0, MAX_VOCAB) cannot be one-hot encoded and
 * are skipped as targets, but still enter the context.
 *
 * @param tok         Tokenizer used for `dataset` and for the "[CLR]" marker.
 * @param emb         Embedder forwarded untouched to `buildInput`.
 * @param dataset     Training text; fewer than two tokens is a no-op.
 * @param epochs      Number of passes over the token stream.
 * @param lr          Learning rate forwarded to train().
 * @param buildInput  Maps the current context to a network input vector.
 * @param onProgress  Optional callback, invoked every 10th training step.
 */
void NeuralNetwork::trainOnSequence(Tokenizer& tok, Embedder& emb, const std::string& dataset,
                                    int epochs, double lr,
                                    std::function<std::vector<double>(const std::vector<int>&, Embedder&)> buildInput,
                                    std::function<void(const TrainStatus&)> onProgress) {
    std::vector<int> tokens = tok.textToTokens(dataset);
    if (tokens.size() < 2) return;

    int clrId = -1;
    auto search = tok.textToTokens("[CLR]");
    if (!search.empty()) clrId = search[0];

    auto startTime = std::chrono::high_resolution_clock::now();
    // Upper bound used for progress/ETA; steps skipped at [CLR] are not subtracted.
    long long totalSteps = (long long)epochs * (tokens.size() - 1);
    long long currentGlobalStep = 0;
    double lastEpochLoss = 0;

    long long totalParamsCount = 0;
    for (int i = 0; i < numLayers - 1; i++) {
        totalParamsCount += (long long)sizes[i] * sizes[i+1]; // weights
        totalParamsCount += (long long)sizes[i+1];            // biases
    }

    // One-hot target buffer, allocated once; only the active slot is set
    // before each step and cleared again afterwards.
    std::vector<double> target(MAX_VOCAB, 0.0);

    for (int e = 1; e <= epochs; e++) {
        double currentEpochLoss = 0;
        std::vector<int> slidingContext;
        for (size_t i = 0; i < tokens.size(); i++) {
            if (tokens[i] == clrId) { slidingContext.clear(); continue; }

            // Guard the one-hot write against ids outside the target vector.
            const bool inVocab = tokens[i] >= 0 && static_cast<size_t>(tokens[i]) < target.size();
            if (!slidingContext.empty() && inVocab) {
                target[tokens[i]] = 1.0;
                double loss = this->train(buildInput(slidingContext, emb), target, lr);
                target[tokens[i]] = 0.0;
                currentEpochLoss += loss;
                currentGlobalStep++;
                if (onProgress && currentGlobalStep % 10 == 0) {
                    auto now = std::chrono::high_resolution_clock::now();
                    double elapsed = std::chrono::duration<double>(now - startTime).count();
                    double speed = currentGlobalStep / elapsed;
                    TrainStatus status = {e, epochs, static_cast<int>(i), static_cast<int>(tokens.size()), loss, currentEpochLoss, lastEpochLoss, speed, (totalSteps - currentGlobalStep) / speed, static_cast<float>(currentGlobalStep) / totalSteps * 100.0f, totalParamsCount};
                    onProgress(status);
                }
            }

            // Append the token and keep only the most recent MAX_CONTEXT entries.
            slidingContext.push_back(tokens[i]);
            if (slidingContext.size() > MAX_CONTEXT) slidingContext.erase(slidingContext.begin());
        }
        lastEpochLoss = currentEpochLoss;
    }

    // Pull the trained parameters back into the host-side vectors.
    if (useVulkan) syncToCPU();
}
/**
 * Evaluates the network on `input` and returns the output-layer activations.
 *
 * With Vulkan enabled, the input is copied into the first layer's slots of
 * the mapped output buffer, one mode-0 dispatch per layer pair is recorded
 * and submitted, and the last layer's slots are read back. The copy is
 * bounded by the input layer size: surplus elements are ignored and missing
 * ones are treated as 0, so a mis-sized vector cannot overrun the buffer or
 * leave stale values from a previous call.
 *
 * With Vulkan disabled there is no CPU implementation in this function: it
 * returns a zero vector of the output layer's size.
 *
 * @param input  Input activations (read as doubles, stored as floats).
 * @return One value per output neuron.
 */
std::vector<double> NeuralNetwork::feedForward(const std::vector<double>& input) {
    if (!useVulkan) {
        return std::vector<double>(sizes.back(), 0.0);
    }

    const size_t inCount = static_cast<size_t>(sizes.front());
    float* fIn = static_cast<float*>(pO);
    for (size_t i = 0; i < inCount; i++) {
        fIn[i] = i < input.size() ? static_cast<float>(input[i]) : 0.0f;
    }

    vk::CommandBufferAllocateInfo ai(cmdPool, vk::CommandBufferLevel::ePrimary, 1);
    vk::CommandBuffer cmd = device.allocateCommandBuffers(ai)[0];
    cmd.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
    cmd.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
    cmd.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeLayout, 0, {descriptorSet}, {});

    // Each layer reads the activations written by the previous dispatch.
    vk::MemoryBarrier b(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
    for (int i = 0; i < numLayers - 1; i++) {
        TrainParams p = {0, static_cast<uint32_t>(sizes[i]), static_cast<uint32_t>(sizes[i+1]), wOff[i], bOff[i], oOff[i], oOff[i+1], 0.0f};
        cmd.pushConstants(pipeLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(TrainParams), &p);
        cmd.dispatch((sizes[i+1] + 255) / 256, 1, 1);
        cmd.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, {}, {b}, {}, {});
    }

    cmd.end();
    queue.submit(vk::SubmitInfo(0, nullptr, nullptr, 1, &cmd), nullptr);
    queue.waitIdle();  // block until the GPU is done so the mapped outputs can be read
    device.freeCommandBuffers(cmdPool, cmd);

    std::vector<double> res(sizes.back());
    const float* out = static_cast<const float*>(pO) + oOff.back();
    for (int i = 0; i < sizes.back(); i++) res[i] = static_cast<double>(out[i]);
    return res;
}
/**
 * Copies weights and biases from the mapped GPU buffers into the host-side
 * vectors. Does nothing when Vulkan is disabled.
 */
void NeuralNetwork::syncToCPU() {
    if (!useVulkan) return;
    // Byte counts use sizeof(float), matching how the buffers were sized.
    // Empty vectors are skipped so memcpy never receives a null data() pointer.
    if (!h_weights.empty()) memcpy(h_weights.data(), pW, h_weights.size() * sizeof(float));
    if (!h_biases.empty()) memcpy(h_biases.data(), pB, h_biases.size() * sizeof(float));
}
/**
 * Copies weights and biases from the host-side vectors into the mapped GPU
 * buffers. Does nothing when Vulkan is disabled.
 */
void NeuralNetwork::syncToGPU() {
    if (!useVulkan) return;
    // Byte counts use sizeof(float), matching how the buffers were sized.
    // Empty vectors are skipped so memcpy never receives a null data() pointer.
    if (!h_weights.empty()) memcpy(pW, h_weights.data(), h_weights.size() * sizeof(float));
    if (!h_biases.empty()) memcpy(pB, h_biases.data(), h_biases.size() * sizeof(float));
}
/**
 * Returns the index of the first memory type of the physical device that is
 * permitted by the bit mask `f` and has every property flag in `p`.
 *
 * @param f  Bit mask of acceptable memory type indices (bit i = type i).
 * @param p  Property flags the memory type must have.
 * @return Index of the first matching memory type.
 * @throws std::runtime_error if no memory type satisfies both conditions
 *         (previously index 0 was returned silently, matching or not).
 */
uint32_t NeuralNetwork::findMemoryType(uint32_t f, vk::MemoryPropertyFlags p) {
    const auto m = physDev.getMemoryProperties();
    for (uint32_t i = 0; i < m.memoryTypeCount; i++) {
        // Unsigned shift: a signed `1 << 31` would overflow.
        const bool allowed = (f & (uint32_t{1} << i)) != 0;
        const bool hasProps = (m.memoryTypes[i].propertyFlags & p) == p;
        if (allowed && hasProps) return i;
    }
    throw std::runtime_error("No suitable Vulkan memory type found");
}
/**
 * Reads an entire file into memory in binary mode.
 *
 * @param n  Path of the file to read.
 * @return The file's bytes; empty for an empty file.
 * @throws std::runtime_error if the file cannot be opened, its size cannot
 *         be determined, or fewer bytes than expected could be read.
 */
std::vector<char> NeuralNetwork::readFile(const std::string& n) {
    // Open positioned at the end so tellg() yields the file size.
    std::ifstream f(n, std::ios::ate | std::ios::binary);
    if (!f.is_open()) {
        throw std::runtime_error("Failed to open file: " + n);
    }

    // tellg() reports -1 on failure; converting that to size_t would request
    // an enormous allocation.
    const std::streamoff end = f.tellg();
    if (end < 0) {
        throw std::runtime_error("Failed to determine size of file: " + n);
    }

    std::vector<char> b(static_cast<size_t>(end));
    f.seekg(0);
    f.read(b.data(), static_cast<std::streamsize>(b.size()));
    if (!f) {
        throw std::runtime_error("Failed to read file: " + n);
    }
    return b;
}
/**
 * Releases every Vulkan object owned by the network. When `useVulkan` is
 * false nothing is destroyed here.
 */
NeuralNetwork::~NeuralNetwork() {
    if (!useVulkan) {
        return;
    }

    // Wait for outstanding GPU work before tearing anything down.
    device.waitIdle();

    // Pipeline objects.
    device.destroyPipeline(pipeline);
    device.destroyPipelineLayout(pipeLayout);
    device.destroyShaderModule(shaderModule);

    // Storage buffers, each followed by its backing memory.
    device.destroyBuffer(gpuW);
    device.freeMemory(memW);
    device.destroyBuffer(gpuB);
    device.freeMemory(memB);
    device.destroyBuffer(gpuO);
    device.freeMemory(memO);
    device.destroyBuffer(gpuE);
    device.freeMemory(memE);
    device.destroyBuffer(gpuT);
    device.freeMemory(memT);

    // Descriptor and command infrastructure.
    device.destroyDescriptorPool(descriptorPool);
    device.destroyDescriptorSetLayout(dsLayout);
    device.destroyCommandPool(cmdPool);

    // The device goes before the instance that created it.
    device.destroy();
    instance.destroy();
}
/**
 * CPU training path. Currently a stub: it ignores all arguments, performs no
 * training and always reports a loss of 0.0.
 *
 * @param i  Input vector (unused).
 * @param t  Target vector (unused).
 * @param l  Learning rate (unused).
 * @return Always 0.0.
 */
double NeuralNetwork::runTrainCPU(const std::vector<double>& i, const std::vector<double>& t, double l) {
    (void)i;
    (void)t;
    (void)l;
    const double noLoss = 0.0;
    return noLoss;
}