edit chatbot and core
This commit is contained in:
@@ -0,0 +1,46 @@
|
||||
#include "token.h"
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
|
||||
/// Registers `word` in the vocabulary under the next free id.
///
/// Ids are assigned sequentially: a new word receives the current number of
/// entries in `wordToId`, and both lookup tables are updated together.
/// Adding a word that is already registered is a no-op, so its existing id
/// stays valid.
///
/// @param word  Text of the token to register.
void Tokenizer::add(std::string word) {
    // Without this guard a repeated word would be rebound to id == size()
    // while the map size stays unchanged, so the next new word would be
    // handed the very same id and the two tables would disagree.
    if (wordToId.find(word) != wordToId.end()) {
        return;
    }

    const int id = static_cast<int>(wordToId.size());
    wordToId[word] = id;
    idToWord[id] = word;
}
|
||||
|
||||
std::string Tokenizer::getWord(int id) {
|
||||
return idToWord.count(id) ? idToWord[id] : "";
|
||||
}
|
||||
|
||||
std::vector<int> Tokenizer::textToTokens(const std::string& text) {
|
||||
std::vector<int> tokens;
|
||||
size_t pos = 0;
|
||||
while (pos < text.length()) {
|
||||
int longestId = -1; size_t longestLen = 0;
|
||||
for (auto const& [word, id] : wordToId) {
|
||||
if (text.compare(pos, word.length(), word) == 0) {
|
||||
if (word.length() > longestLen) {
|
||||
longestLen = word.length(); longestId = id;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (longestId != -1) {
|
||||
tokens.push_back(longestId);
|
||||
pos += longestLen;
|
||||
} else pos++;
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
/// Builds a `vSize` x `dim` embedding table filled with pseudo-random values.
///
/// The generator is seeded with the fixed value 42, so the table contents are
/// the same on every construction. Values are drawn from a uniform real
/// distribution over [-1.0, 1.0), row by row.
///
/// @param vSize  Number of rows (one per token id).
/// @param dim    Number of components in each row.
Embedder::Embedder(int vSize, int dim) {
    std::mt19937 rng(42);
    std::uniform_real_distribution<double> uniform(-1.0, 1.0);

    matrix.resize(vSize, std::vector<double>(dim));

    // Row-major fill keeps the draw order, and therefore the values, stable.
    for (std::vector<double>& row : matrix) {
        for (double& cell : row) {
            cell = uniform(rng);
        }
    }
}
|
||||
|
||||
std::vector<double> Embedder::get(int id) {
|
||||
if (id >= 0 && id < (int)matrix.size()) return matrix[id];
|
||||
return std::vector<double>(matrix[0].size(), 0.0);
|
||||
}
|
||||
@@ -0,0 +1,42 @@
|
||||
#ifndef TOKEN_H
|
||||
#define TOKEN_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
/// Word-level tokenizer backed by two lookup tables that map words to ids and
/// ids back to words.
///
/// The constructor seeds the vocabulary with a fixed list of entries; more can
/// be registered with add().
class Tokenizer {
public:
    std::map<std::string, int> wordToId;  // word -> id
    std::map<int, std::string> idToWord;  // id -> word

    /// Registers the built-in vocabulary in a fixed order.
    Tokenizer() {
        // Order matters: each entry is passed to add() in sequence, and the
        // trailing number is the id that sequence yields.
        static const char* const kBaseVocabulary[] = {
            "<EOS>",   // 0
            "[SYS]",   // 1
            "[USER]",  // 2
            "[AI]",    // 3
            " ",       // 4
            "\n",      // 5
            "привет",  // 6
            "как",     // 7
            "дела",    // 8
            "?",       // 9
            "я",       // 10
            "робот",   // 11
            "хорошо",  // 12
        };
        for (const char* entry : kBaseVocabulary) {
            add(entry);
        }
    }

    /// Registers `word` in both lookup tables.
    void add(std::string word);

    // NOTE(review): no definition of getID is visible alongside the other
    // member definitions in this view — confirm it exists before calling it.
    int getID(std::string word);

    /// Returns the word stored for `id`, or an empty string if there is none.
    std::string getWord(int id);

    /// Converts `text` into a sequence of token ids.
    std::vector<int> textToTokens(const std::string& text);
};
|
||||
|
||||
/// Table of embedding vectors, one row of doubles per token id.
class Embedder {
public:
    /// Creates a table with `vSize` rows of `dim` components each.
    Embedder(int vSize, int dim);

    /// Returns a copy of the row for `id`.
    std::vector<double> get(int id);

    /// Embedding rows, indexed by token id.
    std::vector<std::vector<double>> matrix;
};
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user