From 361891f25ca735fd85db64a14823cc55b8a0619a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roman=20Smr=C5=BE?= Date: Sat, 26 Oct 2019 22:19:56 +0200 Subject: Basic object encoding and storage --- CMakeLists.txt | 12 ++ Makefile | 11 ++ include/erebos/storage.h | 172 ++++++++++++++++ src/CMakeLists.txt | 7 + src/base64.h | 107 ++++++++++ src/storage.cpp | 501 +++++++++++++++++++++++++++++++++++++++++++++++ src/storage.h | 34 ++++ 7 files changed, 844 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 Makefile create mode 100644 include/erebos/storage.h create mode 100644 src/CMakeLists.txt create mode 100644 src/base64.h create mode 100644 src/storage.cpp create mode 100644 src/storage.h diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..66406b1 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.10) +project(Erebos) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +find_package(Threads REQUIRED) +find_package(ZLIB REQUIRED) +find_library(B2_LIBRARY b2 REQUIRED) + +add_subdirectory(src) diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..acf2c19 --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ +all: build/Makefile + +make -C build +.PHONY: all + +build/Makefile: + mkdir -p build + (cd build; cmake ..) + +clean: + rm -rf build +.PHONY: clean diff --git a/include/erebos/storage.h b/include/erebos/storage.h new file mode 100644 index 0000000..ae899f5 --- /dev/null +++ b/include/erebos/storage.h @@ -0,0 +1,172 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace erebos { + +class Storage; +class Digest; +class Ref; +class Object; + +class Storage +{ +public: + Storage(const Storage &) = default; + Storage & operator=(const Storage &) = delete; + + static std::optional open(std::filesystem::path path); + std::optional ref(const Digest &) const; + std::optional load(const Digest &) const; + Ref store(const Object &) const; + +private: + friend class Ref; + struct Priv; + const std::shared_ptr p; + Storage(const std::shared_ptr p): p(p) {} +}; + +class Digest +{ +public: + static constexpr size_t size = 32; + + Digest(const Digest &) = default; + Digest & operator=(const Digest &) = delete; + + explicit Digest(std::array value): value(value) {} + explicit Digest(const std::string &); + explicit operator std::string() const; + + bool operator==(const Digest & other) const { return value == other.value; } + bool operator!=(const Digest & other) const { return value != other.value; } + bool operator<(const Digest & other) const { return value < other.value; } + bool operator<=(const Digest & other) const { return value <= other.value; } + bool operator>(const Digest & other) const { return value > other.value; } + bool operator>=(const Digest & other) const { return value >= other.value; } + +private: + std::array value; +}; + +class Ref +{ +public: + Ref(const Ref &) = default; + Ref & operator=(const Ref &) = delete; + + static std::optional create(Storage, const Digest &); + + const Digest & digest() const; + const Object & operator*() const; + const Object * operator->() const; + +private: + friend class Storage; + struct Priv; + const std::shared_ptr p; + Ref(const std::shared_ptr p): p(p) {} +}; + +class Record +{ +public: + class Item { + public: + typedef std::variant< + int, + std::string, + std::vector, + Ref> Variant; + + Item(const std::string & name, Variant value): + name(name), value(value) {} + Item(const Item &) = default; + Item & operator=(const Item &) = delete; + + std::optional asInteger() const; + std::optional asText() const; + std::optional> asBinary() const; + std::optional asRef() const; + + private: + friend class Record; + std::string name; + Variant value; + }; + +private: + Record(const std::shared_ptr> & ptr): + ptr(ptr) {} + +public: + Record(const std::vector &); + std::vector encode() const; + + const std::vector & items() const; + std::optional item(const std::string & name) const; + std::optional operator[](const std::string & name) const; + std::vector items(const std::string & name) const; + +private: + friend class Object; + std::vector encodeInner() const; + static Record decode(Storage, + std::vector::const_iterator, + std::vector::const_iterator); + + const std::shared_ptr> ptr; +}; + +class Blob +{ +public: + Blob(const std::vector &); + + const std::vector & data() const { return *ptr; } + std::vector encode() const; + +private: + friend class Object; + std::vector encodeInner() const; + static Blob decode(Storage, + std::vector::const_iterator, + std::vector::const_iterator); + + Blob(std::shared_ptr> ptr): ptr(ptr) {} + + const std::shared_ptr> ptr; +}; + +class Object +{ +public: + typedef std::variant< + Record, + Blob> Variants; + + Object(const Object &) = default; + Object(Variants content): content(content) {} + Object & operator=(const Object &) = delete; + + static std::optional decode(Storage, const std::vector &); + std::vector encode() const; + + std::optional asRecord() const; + std::optional asBlob() const; + +private: + friend class Record; + friend class Blob; + + Variants content; +}; + +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..00a2cdc --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( + ../include +) + +add_library(erebos + storage +) diff --git a/src/base64.h b/src/base64.h new file mode 100644 index 0000000..324a5dd --- /dev/null +++ b/src/base64.h @@ -0,0 +1,107 @@ +#pragma once + +#include +#include +#include +#include + +namespace { namespace base64 { + + const static char encodeLookup[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + const static char padCharacter = '='; + + std::string encode(const std::vector & input) + { + std::string encoded; + encoded.reserve(((input.size()/3) + (input.size() % 3 > 0)) * 4); + uint32_t temp; + auto cursor = input.begin(); + for (size_t i = 0; i < input.size() / 3; i++) + { + temp = (*cursor++) << 16; // Convert to big endian + temp += (*cursor++) << 8; + temp += (*cursor++); + encoded.append(1, encodeLookup[(temp & 0x00FC0000) >> 18]); + encoded.append(1, encodeLookup[(temp & 0x0003F000) >> 12]); + encoded.append(1, encodeLookup[(temp & 0x00000FC0) >> 6 ]); + encoded.append(1, encodeLookup[(temp & 0x0000003F) ]); + } + switch (input.size() % 3) + { + case 1: + temp = (*cursor++) << 16; // Convert to big endian + encoded.append(1, encodeLookup[(temp & 0x00FC0000) >> 18]); + encoded.append(1, encodeLookup[(temp & 0x0003F000) >> 12]); + encoded.append(2, padCharacter); + break; + case 2: + temp = (*cursor++) << 16; // Convert to big endian + temp += (*cursor++) << 8; + encoded.append(1, encodeLookup[(temp & 0x00FC0000) >> 18]); + encoded.append(1, encodeLookup[(temp & 0x0003F000) >> 12]); + encoded.append(1, encodeLookup[(temp & 0x00000FC0) >> 6 ]); + encoded.append(1, padCharacter); + break; + } + return encoded; + } + + std::vector decode(const std::string & input) + { + if (input.length() % 4) // Sanity check + throw std::runtime_error("Non-Valid base64!"); + + size_t padding = 0; + if (input.length()) { + if (input[input.length() - 1] == padCharacter) + padding++; + if (input[input.length() - 2] == padCharacter) + padding++; + } + + // Setup a vector to hold the result + std::vector decoded; + decoded.reserve(((input.length()/4)*3) - padding); + uint32_t temp = 0; // Holds decoded quanta + auto cursor = input.begin(); + while (cursor < input.end()) + { + for (size_t quantumPosition = 0; quantumPosition < 4; quantumPosition++) + { + temp <<= 6; + if (*cursor >= 0x41 && *cursor <= 0x5A) // This area will need tweaking if + temp |= *cursor - 0x41; // you are using an alternate alphabet + else if (*cursor >= 0x61 && *cursor <= 0x7A) + temp |= *cursor - 0x47; + else if (*cursor >= 0x30 && *cursor <= 0x39) + temp |= *cursor + 0x04; + else if (*cursor == 0x2B) + temp |= 0x3E; // change to 0x2D for URL alphabet + else if (*cursor == 0x2F) + temp |= 0x3F; // change to 0x5F for URL alphabet + else if (*cursor == padCharacter) // pad + { + switch (input.end() - cursor) + { + case 1: //One pad character + decoded.push_back((temp >> 16) & 0x000000FF); + decoded.push_back((temp >> 8 ) & 0x000000FF); + return decoded; + case 2: //Two pad characters + decoded.push_back((temp >> 10) & 0x000000FF); + return decoded; + default: + throw std::runtime_error("Invalid Padding in Base 64!"); + } + } else + throw std::runtime_error("Non-Valid Character in Base 64!"); + cursor++; + } + decoded.push_back((temp >> 16) & 0x000000FF); + decoded.push_back((temp >> 8 ) & 0x000000FF); + decoded.push_back((temp ) & 0x000000FF); + } + return decoded; + } + +} } diff --git a/src/storage.cpp b/src/storage.cpp new file mode 100644 index 0000000..2e7feb7 --- /dev/null +++ b/src/storage.cpp @@ -0,0 +1,501 @@ +#include "storage.h" +#include "base64.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +using namespace erebos; + +using std::array; +using std::copy; +using std::holds_alternative; +using std::ifstream; +using std::make_shared; +using std::nullopt; +using std::runtime_error; +using std::shared_ptr; +using std::string; +using std::to_string; + +optional Storage::open(fs::path path) +{ + if (!fs::is_directory(path)) + fs::create_directory(path); + + if (!fs::is_directory(path/"objects")) + fs::create_directory(path/"objects"); + + if (!fs::is_directory(path/"heads")) + fs::create_directory(path/"heads"); + + return Storage(shared_ptr(new Priv { path })); +} + +fs::path Storage::Priv::objectPath(const Digest & digest) const +{ + string name(digest); + return root/"objects"/ + fs::path(name.begin(), name.begin() + 2)/ + fs::path(name.begin() + 2, name.end()); +} + +optional Storage::ref(const Digest & digest) const +{ + return Ref::create(*this, digest); +} + +optional> Storage::Priv::loadBytes(const Digest & digest) const +{ + vector in(Priv::CHUNK); + vector out; + size_t decoded = 0; + + z_stream strm; + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + strm.avail_in = 0; + strm.next_in = Z_NULL; + int ret = inflateInit(&strm); + if (ret != Z_OK) + throw runtime_error("zlib initialization failed"); + + ifstream fin(objectPath(digest), std::ios::binary); + if (!fin.is_open()) + return nullopt; + + while (!fin.eof() && ret != Z_STREAM_END) { + fin.read((char*) in.data(), in.size()); + if (fin.bad()) { + inflateEnd(&strm); + throw runtime_error("failed to read stored file"); + } + strm.avail_in = fin.gcount(); + if (strm.avail_in == 0) + break; + strm.next_in = in.data(); + + do { + if (out.size() < decoded + in.size()) + out.resize(decoded + in.size()); + + strm.avail_out = out.size() - decoded; + strm.next_out = out.data() + decoded; + ret = inflate(&strm, Z_NO_FLUSH); + switch (ret) { + case Z_STREAM_ERROR: + case Z_NEED_DICT: + case Z_DATA_ERROR: + case Z_MEM_ERROR: + inflateEnd(&strm); + throw runtime_error("zlib decoding failed"); + } + decoded = out.size() - strm.avail_out; + } while (strm.avail_out == 0); + } + + + inflateEnd(&strm); + if (ret != Z_STREAM_END) + throw runtime_error("zlib decoding failed"); + + out.resize(decoded); + return out; +} + +optional Storage::load(const Digest & digest) const +{ + auto ocontent = p->loadBytes(digest); + if (!ocontent.has_value()) + return nullopt; + auto content = ocontent.value(); + + array arr; + int ret = blake2b(arr.data(), content.data(), nullptr, + Digest::size, content.size(), 0); + if (ret != 0 || digest != Digest(arr)) + throw runtime_error("digest verification failed"); + + return Object::decode(*this, content); +} + +void Storage::Priv::storeBytes(const Digest & digest, const vector & in) const +{ + vector out(Priv::CHUNK); + + z_stream strm; + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + int ret = deflateInit(&strm, Z_DEFAULT_COMPRESSION); + if (ret != Z_OK) + throw runtime_error("zlib initialization failed"); + + auto path = objectPath(digest); + auto lock = path; + lock += ".lock"; + + fs::create_directories(path.parent_path()); + + // No way to use open exclusively in c++ stdlib + FILE *f = nullptr; + for (int i = 0; i < 10; i++) { + f = fopen(lock.c_str(), "wbxe"); + if (f || errno != EEXIST) + break; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + if (fs::exists(path)) { + if (f) { + fclose(f); + fs::remove(lock); + } + return; + } + if (!f) + throw runtime_error("failed to open storage file"); + + strm.avail_in = in.size(); + strm.next_in = const_cast(in.data()); + do { + strm.avail_out = out.size(); + strm.next_out = out.data(); + ret = deflate(&strm, Z_FINISH); + if (ret == Z_STREAM_ERROR) + break; + size_t have = out.size() - strm.avail_out; + if (fwrite(out.data(), 1, have, f) != have || ferror(f)) { + ret = Z_ERRNO; + break; + } + } while (strm.avail_out == 0); + + fclose(f); + deflateEnd(&strm); + + if (strm.avail_in != 0 || ret != Z_STREAM_END) { + fs::remove(lock); + throw runtime_error("failed to deflate object"); + } + + fs::rename(lock, path); +} + +Ref Storage::store(const Object & object) const +{ + // TODO: ensure storage transitively + auto content = object.encode(); + + array arr; + int ret = blake2b(arr.data(), content.data(), nullptr, + Digest::size, content.size(), 0); + if (ret != 0) + throw runtime_error("failed to compute digest"); + + Digest digest(arr); + p->storeBytes(digest, content); + return Ref::create(*this, digest).value(); +} + + +Digest::Digest(const string & str) +{ + if (str.size() != 2 * size) + throw runtime_error("invalid ref digest"); + + for (int i = 0; i < size; i++) + std::from_chars(str.data() + 2 * i, + str.data() + 2 * i + 2, + value[i], 16); +} + +Digest::operator string() const +{ + string res(size * 2, '0'); + for (int i = 0; i < size; i++) + std::to_chars(res.data() + 2 * i + (value[i] < 0x10), + res.data() + 2 * i + 2, + value[i], 16); + return res; +} + + +optional Ref::create(Storage st, const Digest & digest) +{ + if (!fs::exists(st.p->objectPath(digest))) + return nullopt; + + auto p = new Priv { + .storage = st, + .digest = digest, + }; + + p->object = std::async(std::launch::deferred, [p] { + auto obj = p->storage.load(p->digest); + if (!obj.has_value()) + throw runtime_error("failed to decode bytes"); + + return obj.value(); + }); + + return Ref(shared_ptr(p)); +} + +const Digest & Ref::digest() const +{ + return p->digest; +} + +const Object & Ref::operator*() const +{ + return p->object.get(); +} + +const Object * Ref::operator->() const +{ + return &p->object.get(); +} + + +optional Record::Item::asInteger() const +{ + if (holds_alternative(value)) + return std::get(value); + return nullopt; +} + +optional Record::Item::asText() const +{ + if (holds_alternative(value)) + return std::get(value); + return nullopt; +} + +optional> Record::Item::asBinary() const +{ + if (holds_alternative>(value)) + return std::get>(value); + return nullopt; +} + +optional Record::Item::asRef() const +{ + if (holds_alternative(value)) + return std::get(value); + return nullopt; +} + + +Record::Record(const vector & from): + ptr(new vector(from)) +{} + +Record Record::decode(Storage st, + vector::const_iterator begin, + vector::const_iterator end) +{ + auto items = make_shared>(); + + while (begin != end) { + const auto newline = std::find(begin, end, '\n'); + if (newline == end) + throw runtime_error("invalid record"); + + const auto colon = std::find(begin, newline, ':'); + if (colon == newline) + throw runtime_error("invalid record"); + + const auto space = std::find(colon, newline, ' '); + if (space == newline) + throw runtime_error("invalid record"); + + const auto name = string(begin, colon); + const auto type = string(colon + 1, space); + const auto value = string(space + 1, newline); + + if (type == "i") + items->emplace_back(name, std::stoi(value)); + else if (type == "t") + items->emplace_back(name, value); + else if (type == "b") + items->emplace_back(name, base64::decode(value)); + else if (type == "r.b2") + items->emplace_back(name, Ref::create(st, Digest(value)).value()); + else + throw runtime_error("unknown record item type"); + + begin = newline + 1; + } + + return Record(items); +} + +vector Record::encode() const +{ + return Object(*this).encode(); +} + +const vector & Record::items() const +{ + return *ptr; +} + +optional Record::item(const string & name) const +{ + for (auto item : *ptr) { + if (item.name == name) + return item; + } + return nullopt; +} + +optional Record::operator[](const string & name) const +{ + return item(name); +} + +vector Record::items(const string & name) const +{ + vector res; + for (auto item : *ptr) { + if (item.name == name) + res.push_back(item); + } + return res; +} + + +vector Record::encodeInner() const +{ + vector res; + auto inserter = std::back_inserter(res); + for (const auto & item : *ptr) { + copy(item.name.begin(), item.name.end(), inserter); + inserter = ':'; + + string type; + string value; + + if (auto x = item.asInteger()) { + type = "i"; + value = to_string(*x); + } else if (auto x = item.asText()) { + type = "t"; + value = *x; + } else if (auto x = item.asBinary()) { + type = "b"; + value = base64::encode(*x); + } else if (auto x = item.asRef()) { + type = "r.b2"; + value = string(x->digest()); + } else { + throw runtime_error("unhandeled record item type"); + } + + copy(type.begin(), type.end(), inserter); + inserter = ' '; + copy(value.begin(), value.end(), inserter); + inserter = '\n'; + } + return res; +} + + +Blob::Blob(const vector & vec): + ptr(make_shared>(vec)) +{} + +vector Blob::encode() const +{ + return Object(*this).encode(); +} + +vector Blob::encodeInner() const +{ + return *ptr; +} + +Blob Blob::decode(Storage, + vector::const_iterator begin, + vector::const_iterator end) +{ + return Blob(make_shared>(begin, end)); +} + + +optional Object::decode(Storage st, const vector & data) +{ + auto newline = std::find(data.begin(), data.end(), '\n'); + if (newline == data.end()) + return nullopt; + + auto space = std::find(data.begin(), newline, ' '); + if (space == newline) + return nullopt; + + size_t size = std::stoi(string(space + 1, newline)); + if (data.end() - newline - 1 != size) + return nullopt; + + string type(data.begin(), space); + if (type == "rec") + return Object(Record::decode(st, newline + 1, data.end())); + else if (type == "blob") + return Object(Blob::decode(st, newline + 1, data.end())); + else + throw runtime_error("unknown object type '" + type + "'"); + + return nullopt; +} + +vector Object::encode() const +{ + vector res, inner; + string type; + + if (auto rec = asRecord()) { + type = "rec"; + inner = rec->encodeInner(); + } else if (auto blob = asBlob()) { + type = "blob"; + inner = blob->encodeInner(); + } else { + throw runtime_error("unhandeled object type"); + } + + auto inserter = std::back_inserter(res); + copy(type.begin(), type.end(), inserter); + inserter = ' '; + + auto slen = to_string(inner.size()); + copy(slen.begin(), slen.end(), inserter); + inserter = '\n'; + + copy(inner.begin(), inner.end(), inserter); + return res; +} + +optional Object::asRecord() const +{ + if (holds_alternative(content)) + return std::get(content); + return nullopt; +} + +optional Object::asBlob() const +{ + if (holds_alternative(content)) + return std::get(content); + return nullopt; +} diff --git a/src/storage.h b/src/storage.h new file mode 100644 index 0000000..2a0ad7e --- /dev/null +++ b/src/storage.h @@ -0,0 +1,34 @@ +#pragma once + +#include "erebos/storage.h" + +#include + +namespace fs = std::filesystem; + +using std::optional; +using std::shared_future; +using std::vector; + +namespace erebos { + +struct Storage::Priv +{ + static constexpr size_t CHUNK = 16384; + + fs::path root; + + fs::path objectPath(const Digest &) const; + optional> loadBytes(const Digest &) const; + void storeBytes(const Digest &, const vector &) const; +}; + +struct Ref::Priv +{ + Storage storage; + Digest digest; + + shared_future object; +}; + +} -- cgit v1.2.3