#include "token.h"

#include <cstddef>
#include <random>
#include <string>
#include <utility>
#include <vector>

/// Registers `word` in the vocabulary under the next free id.
///
/// Ids are handed out sequentially, starting from the current vocabulary size.
/// A word that is already registered is ignored: re-adding it would compute an
/// id equal to the (unchanged) map size, and the next new word would then be
/// given that same id, leaving two words sharing one id.
///
/// @param word Vocabulary entry to register.
void Tokenizer::add(std::string word) {
    if (wordToId.find(word) != wordToId.end()) {
        return;
    }

    const int id = static_cast<int>(wordToId.size());
    wordToId[word] = id;
    idToWord[id] = std::move(word);
}

/// Looks up the word registered under `id`.
///
/// @param id Token id to resolve.
/// @return The registered word, or an empty string when `id` is unknown.
std::string Tokenizer::getWord(int id) {
    const auto entry = idToWord.find(id);
    if (entry == idToWord.end()) {
        return std::string{};
    }
    return entry->second;
}

/// Converts `text` into token ids using greedy longest-match.
///
/// At every position the longest vocabulary word that matches the text at that
/// position is emitted and the cursor jumps past it. When nothing matches, the
/// cursor advances by one character and no token is emitted for it. Among
/// equally long matches the first one met while iterating `wordToId` wins.
///
/// @param text Input to tokenize.
/// @return Ids of the matched words, in order of appearance.
std::vector<int> Tokenizer::textToTokens(const std::string& text) {
    std::vector<int> tokens;
    std::size_t pos = 0;

    while (pos < text.length()) {
        int longestId = -1;
        std::size_t longestLen = 0;

        for (const auto& [word, id] : wordToId) {
            // Only a strictly longer word can replace the current best, so the
            // comparison is skipped for anything else. This also means an
            // empty vocabulary entry can never be selected.
            if (word.length() > longestLen &&
                text.compare(pos, word.length(), word) == 0) {
                longestLen = word.length();
                longestId = id;
            }
        }

        if (longestId != -1) {
            tokens.push_back(longestId);
            pos += longestLen;
        } else {
            ++pos;
        }
    }

    return tokens;
}

/// Builds a `vSize` x `dim` embedding table filled with values drawn from a
/// uniform distribution over [-1, 1).
///
/// The generator uses the fixed seed 42, so the same generator sequence is used
/// on every construction.
///
/// NOTE(review): the original text of this constructor was damaged (the loop
/// header and body were lost). The row-major fill below is a reconstruction
/// based on the otherwise unused `gen` and `dist` - confirm against the
/// original file.
///
/// @param vSize Number of rows (one per token id).
/// @param dim   Number of components per row.
Embedder::Embedder(int vSize, int dim) {
    // The element type is taken from the `matrix` member itself so this file
    // does not have to repeat (and cannot disagree with) the class definition.
    using Row = decltype(matrix)::value_type;
    using Scalar = Row::value_type;

    std::mt19937 gen(42);
    std::uniform_real_distribution<Scalar> dist(Scalar(-1), Scalar(1));

    matrix.resize(vSize, Row(dim));
    for (auto& row : matrix) {
        for (auto& weight : row) {
            weight = dist(gen);
        }
    }
}

/// Returns a copy of the embedding row for token `id`.
///
/// The return type is spelled through the `matrix` member (its row type) so it
/// always matches the element type used by the table.
///
/// @param id Token id, used directly as the row index.
/// @return The row for `id`. For an id outside the table, a zero-filled vector
///         with the same width as the table's rows; empty if the table has no
///         rows at all.
auto Embedder::get(int id) -> decltype(matrix)::value_type {
    using Row = decltype(matrix)::value_type;

    if (id >= 0 && static_cast<std::size_t>(id) < matrix.size()) {
        return matrix[id];
    }

    // An empty table has no row to read the width from; indexing matrix[0]
    // there would be out of bounds.
    const std::size_t width = matrix.empty() ? 0 : matrix.front().size();
    return Row(width);  // value-initialised, i.e. all zeros
}