Basic object encoding and storage

author: Roman Smrž <roman.smrz@seznam.cz> 2019-10-26 22:19:56 +0200
committer: Roman Smrž <roman.smrz@seznam.cz> 2019-12-10 21:29:28 +0100
commit: 361891f25ca735fd85db64a14823cc55b8a0619a (patch)
tree: d45734fbc123d8dcb26128b314eb4453ecdd154e /src
parent: 1a1b36dea942cd7b18067f3f1220c9ab4f9b4448 (diff)
4 files changed, 649 insertions, 0 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..00a2cdc
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories(
+	../include
+)
+
+add_library(erebos
+	storage
+)
diff --git a/src/base64.h b/src/base64.h
new file mode 100644
index 0000000..324a5dd
--- /dev/null
+++ b/src/base64.h
@@ -0,0 +1,107 @@
+#pragma once
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace { namespace base64 {
+
+	const static char encodeLookup[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+	const static char padCharacter = '=';
+
+	std::string encode(const std::vector<uint8_t> & input)
+	{
+		std::string encoded;
+		encoded.reserve(((input.size()/3) + (input.size() % 3 > 0)) * 4);
+		uint32_t temp;
+		auto cursor = input.begin();
+		for (size_t i = 0; i < input.size() / 3; i++)
+		{
+			temp  = (*cursor++) << 16; // Convert to big endian
+			temp += (*cursor++) << 8;
+			temp += (*cursor++);
+			encoded.append(1, encodeLookup[(temp & 0x00FC0000) >> 18]);
+			encoded.append(1, encodeLookup[(temp & 0x0003F000) >> 12]);
+			encoded.append(1, encodeLookup[(temp & 0x00000FC0) >> 6 ]);
+			encoded.append(1, encodeLookup[(temp & 0x0000003F)      ]);
+		}
+		switch (input.size() % 3)
+		{
+		case 1:
+			temp  = (*cursor++) << 16; // Convert to big endian
+			encoded.append(1, encodeLookup[(temp & 0x00FC0000) >> 18]);
+			encoded.append(1, encodeLookup[(temp & 0x0003F000) >> 12]);
+			encoded.append(2, padCharacter);
+			break;
+		case 2:
+			temp  = (*cursor++) << 16; // Convert to big endian
+			temp += (*cursor++) << 8;
+			encoded.append(1, encodeLookup[(temp & 0x00FC0000) >> 18]);
+			encoded.append(1, encodeLookup[(temp & 0x0003F000) >> 12]);
+			encoded.append(1, encodeLookup[(temp & 0x00000FC0) >> 6 ]);
+			encoded.append(1, padCharacter);
+			break;
+		}
+		return encoded;
+	}
+
+	std::vector<uint8_t> decode(const std::string & input)
+	{
+		if (input.length() % 4) // Sanity check
+			throw std::runtime_error("Non-Valid base64!");
+
+		size_t padding = 0;
+		if (input.length()) {
+			if (input[input.length() - 1] == padCharacter)
+				padding++;
+			if (input[input.length() - 2] == padCharacter)
+				padding++;
+		}
+
+		// Setup a vector to hold the result
+		std::vector<uint8_t> decoded;
+		decoded.reserve(((input.length()/4)*3) - padding);
+		uint32_t temp = 0; // Holds decoded quanta
+		auto cursor = input.begin();
+		while (cursor < input.end())
+		{
+			for (size_t quantumPosition = 0; quantumPosition < 4; quantumPosition++)
+			{
+				temp <<= 6;
+				if      (*cursor >= 0x41 && *cursor <= 0x5A)   // This area will need tweaking if
+					temp |= *cursor - 0x41;                // you are using an alternate alphabet
+				else if (*cursor >= 0x61 && *cursor <= 0x7A)
+					temp |= *cursor - 0x47;
+				else if (*cursor >= 0x30 && *cursor <= 0x39)
+					temp |= *cursor + 0x04;
+				else if (*cursor == 0x2B)
+					temp |= 0x3E; // change to 0x2D for URL alphabet
+				else if (*cursor == 0x2F)
+					temp |= 0x3F; // change to 0x5F for URL alphabet
+				else if (*cursor == padCharacter) // pad
+				{
+					switch (input.end() - cursor)
+					{
+					case 1: //One pad character
+						decoded.push_back((temp >> 16) & 0x000000FF);
+						decoded.push_back((temp >> 8 ) & 0x000000FF);
+						return decoded;
+					case 2: //Two pad characters
+						decoded.push_back((temp >> 10) & 0x000000FF);
+						return decoded;
+					default:
+						throw std::runtime_error("Invalid Padding in Base 64!");
+					}
+				}  else
+					throw std::runtime_error("Non-Valid Character in Base 64!");
+				cursor++;
+			}
+			decoded.push_back((temp >> 16) & 0x000000FF);
+			decoded.push_back((temp >> 8 ) & 0x000000FF);
+			decoded.push_back((temp      ) & 0x000000FF);
+		}
+		return decoded;
+	}
+
+} }
diff --git a/src/storage.cpp b/src/storage.cpp
new file mode 100644
index 0000000..2e7feb7
--- /dev/null
+++ b/src/storage.cpp
@@ -0,0 +1,501 @@
+#include "storage.h"
+#include "base64.h"
+
+#include <algorithm>
+#include <charconv>
+#include <chrono>
+#include <fstream>
+#include <iomanip>
+#include <iterator>
+#include <stdexcept>
+#include <thread>
+
+#include <stdio.h>
+
+#include <blake2.h>
+#include <zlib.h>
+
+using namespace erebos;
+
+using std::array;
+using std::copy;
+using std::holds_alternative;
+using std::ifstream;
+using std::make_shared;
+using std::nullopt;
+using std::runtime_error;
+using std::shared_ptr;
+using std::string;
+using std::to_string;
+
+optional<Storage> Storage::open(fs::path path)
+{
+	if (!fs::is_directory(path))
+		fs::create_directory(path);
+
+	if (!fs::is_directory(path/"objects"))
+		fs::create_directory(path/"objects");
+
+	if (!fs::is_directory(path/"heads"))
+		fs::create_directory(path/"heads");
+
+	return Storage(shared_ptr<const Priv>(new Priv { path }));
+}
+
+fs::path Storage::Priv::objectPath(const Digest & digest) const
+{
+	string name(digest);
+	return root/"objects"/
+		fs::path(name.begin(), name.begin() + 2)/
+		fs::path(name.begin() + 2, name.end());
+}
+
+optional<Ref> Storage::ref(const Digest & digest) const
+{
+	return Ref::create(*this, digest);
+}
+
+optional<vector<uint8_t>> Storage::Priv::loadBytes(const Digest & digest) const
+{
+	vector<uint8_t> in(Priv::CHUNK);
+	vector<uint8_t> out;
+	size_t decoded = 0;
+
+	z_stream strm;
+	strm.zalloc = Z_NULL;
+	strm.zfree = Z_NULL;
+	strm.opaque = Z_NULL;
+	strm.avail_in = 0;
+	strm.next_in = Z_NULL;
+	int ret = inflateInit(&strm);
+	if (ret != Z_OK)
+		throw runtime_error("zlib initialization failed");
+
+	ifstream fin(objectPath(digest), std::ios::binary);
+	if (!fin.is_open())
+		return nullopt;
+
+	while (!fin.eof() && ret != Z_STREAM_END) {
+		fin.read((char*) in.data(), in.size());
+		if (fin.bad()) {
+			inflateEnd(&strm);
+			throw runtime_error("failed to read stored file");
+		}
+		strm.avail_in = fin.gcount();
+		if (strm.avail_in == 0)
+			break;
+		strm.next_in = in.data();
+
+		do {
+			if (out.size() < decoded + in.size())
+				out.resize(decoded + in.size());
+
+			strm.avail_out = out.size() - decoded;
+			strm.next_out = out.data() + decoded;
+			ret = inflate(&strm, Z_NO_FLUSH);
+			switch (ret) {
+			case Z_STREAM_ERROR:
+			case Z_NEED_DICT:
+			case Z_DATA_ERROR:
+			case Z_MEM_ERROR:
+				inflateEnd(&strm);
+				throw runtime_error("zlib decoding failed");
+			}
+			decoded = out.size() - strm.avail_out;
+		} while (strm.avail_out == 0);
+	}
+
+
+	inflateEnd(&strm);
+	if (ret != Z_STREAM_END)
+		throw runtime_error("zlib decoding failed");
+
+	out.resize(decoded);
+	return out;
+}
+
+optional<Object> Storage::load(const Digest & digest) const
+{
+	auto ocontent = p->loadBytes(digest);
+	if (!ocontent.has_value())
+		return nullopt;
+	auto content = ocontent.value();
+
+	array<uint8_t, Digest::size> arr;
+	int ret = blake2b(arr.data(), content.data(), nullptr,
+			Digest::size, content.size(), 0);
+	if (ret != 0 || digest != Digest(arr))
+		throw runtime_error("digest verification failed");
+
+	return Object::decode(*this, content);
+}
+
+void Storage::Priv::storeBytes(const Digest & digest, const vector<uint8_t> & in) const
+{
+	vector<uint8_t> out(Priv::CHUNK);
+
+	z_stream strm;
+	strm.zalloc = Z_NULL;
+	strm.zfree = Z_NULL;
+	strm.opaque = Z_NULL;
+	int ret = deflateInit(&strm, Z_DEFAULT_COMPRESSION);
+	if (ret != Z_OK)
+		throw runtime_error("zlib initialization failed");
+
+	auto path = objectPath(digest);
+	auto lock = path;
+	lock += ".lock";
+
+	fs::create_directories(path.parent_path());
+
+	// No way to use open exclusively in c++ stdlib
+	FILE *f = nullptr;
+	for (int i = 0; i < 10; i++) {
+		f = fopen(lock.c_str(), "wbxe");
+		if (f || errno != EEXIST)
+			break;
+		std::this_thread::sleep_for(std::chrono::milliseconds(100));
+	}
+	if (fs::exists(path)) {
+		if (f) {
+			fclose(f);
+			fs::remove(lock);
+		}
+		return;
+	}
+	if (!f)
+		throw runtime_error("failed to open storage file");
+
+	strm.avail_in = in.size();
+	strm.next_in = const_cast<uint8_t*>(in.data());
+	do {
+		strm.avail_out = out.size();
+		strm.next_out = out.data();
+		ret = deflate(&strm, Z_FINISH);
+		if (ret == Z_STREAM_ERROR)
+			break;
+		size_t have = out.size() - strm.avail_out;
+		if (fwrite(out.data(), 1, have, f) != have || ferror(f)) {
+			ret = Z_ERRNO;
+			break;
+		}
+	} while (strm.avail_out == 0);
+
+	fclose(f);
+	deflateEnd(&strm);
+
+	if (strm.avail_in != 0 || ret != Z_STREAM_END) {
+		fs::remove(lock);
+		throw runtime_error("failed to deflate object");
+	}
+
+	fs::rename(lock, path);
+}
+
+Ref Storage::store(const Object & object) const
+{
+	// TODO: ensure storage transitively
+	auto content = object.encode();
+
+	array<uint8_t, Digest::size> arr;
+	int ret = blake2b(arr.data(), content.data(), nullptr,
+			Digest::size, content.size(), 0);
+	if (ret != 0)
+		throw runtime_error("failed to compute digest");
+
+	Digest digest(arr);
+	p->storeBytes(digest, content);
+	return Ref::create(*this, digest).value();
+}
+
+
+Digest::Digest(const string & str)
+{
+	if (str.size() != 2 * size)
+		throw runtime_error("invalid ref digest");
+
+	for (int i = 0; i < size; i++)
+		std::from_chars(str.data() + 2 * i,
+				str.data() + 2 * i + 2,
+				value[i], 16);
+}
+
+Digest::operator string() const
+{
+	string res(size * 2, '0');
+	for (int i = 0; i < size; i++)
+		std::to_chars(res.data() + 2 * i + (value[i] < 0x10),
+				res.data() + 2 * i + 2,
+				value[i], 16);
+	return res;
+}
+
+
+optional<Ref> Ref::create(Storage st, const Digest & digest)
+{
+	if (!fs::exists(st.p->objectPath(digest)))
+		return nullopt;
+
+	auto p = new Priv {
+		.storage = st,
+		.digest = digest,
+	};
+
+	p->object = std::async(std::launch::deferred, [p] {
+		auto obj = p->storage.load(p->digest);
+		if (!obj.has_value())
+			throw runtime_error("failed to decode bytes");
+
+		return obj.value();
+	});
+
+	return Ref(shared_ptr<Priv>(p));
+}
+
+const Digest & Ref::digest() const
+{
+	return p->digest;
+}
+
+const Object & Ref::operator*() const
+{
+	return p->object.get();
+}
+
+const Object * Ref::operator->() const
+{
+	return &p->object.get();
+}
+
+
+optional<int> Record::Item::asInteger() const
+{
+	if (holds_alternative<int>(value))
+		return std::get<int>(value);
+	return nullopt;
+}
+
+optional<string> Record::Item::asText() const
+{
+	if (holds_alternative<string>(value))
+		return std::get<string>(value);
+	return nullopt;
+}
+
+optional<vector<uint8_t>> Record::Item::asBinary() const
+{
+	if (holds_alternative<vector<uint8_t>>(value))
+		return std::get<vector<uint8_t>>(value);
+	return nullopt;
+}
+
+optional<Ref> Record::Item::asRef() const
+{
+	if (holds_alternative<Ref>(value))
+		return std::get<Ref>(value);
+	return nullopt;
+}
+
+
+Record::Record(const vector<Item> & from):
+	ptr(new vector<Item>(from))
+{}
+
+Record Record::decode(Storage st,
+		vector<uint8_t>::const_iterator begin,
+		vector<uint8_t>::const_iterator end)
+{
+	auto items = make_shared<vector<Item>>();
+
+	while (begin != end) {
+		const auto newline = std::find(begin, end, '\n');
+		if (newline == end)
+			throw runtime_error("invalid record");
+
+		const auto colon = std::find(begin, newline, ':');
+		if (colon == newline)
+			throw runtime_error("invalid record");
+
+		const auto space = std::find(colon, newline, ' ');
+		if (space == newline)
+			throw runtime_error("invalid record");
+
+		const auto name = string(begin, colon);
+		const auto type = string(colon + 1, space);
+		const auto value = string(space + 1, newline);
+
+		if (type == "i")
+			items->emplace_back(name, std::stoi(value));
+		else if (type == "t")
+			items->emplace_back(name, value);
+		else if (type == "b")
+			items->emplace_back(name, base64::decode(value));
+		else if (type == "r.b2")
+			items->emplace_back(name, Ref::create(st, Digest(value)).value());
+		else
+			throw runtime_error("unknown record item type");
+
+		begin = newline + 1;
+	}
+
+	return Record(items);
+}
+
+vector<uint8_t> Record::encode() const
+{
+	return Object(*this).encode();
+}
+
+const vector<Record::Item> & Record::items() const
+{
+	return *ptr;
+}
+
+optional<Record::Item> Record::item(const string & name) const
+{
+	for (auto item : *ptr) {
+		if (item.name == name)
+			return item;
+	}
+	return nullopt;
+}
+
+optional<Record::Item> Record::operator[](const string & name) const
+{
+	return item(name);
+}
+
+vector<Record::Item> Record::items(const string & name) const
+{
+	vector<Item> res;
+	for (auto item : *ptr) {
+		if (item.name == name)
+			res.push_back(item);
+	}
+	return res;
+}
+
+
+vector<uint8_t> Record::encodeInner() const
+{
+	vector<uint8_t> res;
+	auto inserter = std::back_inserter(res);
+	for (const auto & item : *ptr) {
+		copy(item.name.begin(), item.name.end(), inserter);
+		inserter = ':';
+
+		string type;
+		string value;
+
+		if (auto x = item.asInteger()) {
+			type = "i";
+			value = to_string(*x);
+		} else if (auto x = item.asText()) {
+			type = "t";
+			value = *x;
+		} else if (auto x = item.asBinary()) {
+			type = "b";
+			value = base64::encode(*x);
+		} else if (auto x = item.asRef()) {
+			type = "r.b2";
+			value = string(x->digest());
+		} else {
+			throw runtime_error("unhandeled record item type");
+		}
+
+		copy(type.begin(), type.end(), inserter);
+		inserter = ' ';
+		copy(value.begin(), value.end(), inserter);
+		inserter = '\n';
+	}
+	return res;
+}
+
+
+Blob::Blob(const vector<uint8_t> & vec):
+	ptr(make_shared<vector<uint8_t>>(vec))
+{}
+
+vector<uint8_t> Blob::encode() const
+{
+	return Object(*this).encode();
+}
+
+vector<uint8_t> Blob::encodeInner() const
+{
+	return *ptr;
+}
+
+Blob Blob::decode(Storage,
+		vector<uint8_t>::const_iterator begin,
+		vector<uint8_t>::const_iterator end)
+{
+	return Blob(make_shared<vector<uint8_t>>(begin, end));
+}
+
+
+optional<Object> Object::decode(Storage st, const vector<uint8_t> & data)
+{
+	auto newline = std::find(data.begin(), data.end(), '\n');
+	if (newline == data.end())
+		return nullopt;
+
+	auto space = std::find(data.begin(), newline, ' ');
+	if (space == newline)
+		return nullopt;
+
+	size_t size = std::stoi(string(space + 1, newline));
+	if (data.end() - newline - 1 != size)
+		return nullopt;
+
+	string type(data.begin(), space);
+	if (type == "rec")
+		return Object(Record::decode(st, newline + 1, data.end()));
+	else if (type == "blob")
+		return Object(Blob::decode(st, newline + 1, data.end()));
+	else
+		throw runtime_error("unknown object type '" + type + "'");
+
+	return nullopt;
+}
+
+vector<uint8_t> Object::encode() const
+{
+	vector<uint8_t> res, inner;
+	string type;
+
+	if (auto rec = asRecord()) {
+		type = "rec";
+		inner = rec->encodeInner();
+	} else if (auto blob = asBlob()) {
+		type = "blob";
+		inner = blob->encodeInner();
+	} else {
+		throw runtime_error("unhandeled object type");
+	}
+
+	auto inserter = std::back_inserter(res);
+	copy(type.begin(), type.end(), inserter);
+	inserter = ' ';
+
+	auto slen = to_string(inner.size());
+	copy(slen.begin(), slen.end(), inserter);
+	inserter = '\n';
+
+	copy(inner.begin(), inner.end(), inserter);
+	return res;
+}
+
+optional<Record> Object::asRecord() const
+{
+	if (holds_alternative<Record>(content))
+		return std::get<Record>(content);
+	return nullopt;
+}
+
+optional<Blob> Object::asBlob() const
+{
+	if (holds_alternative<Blob>(content))
+		return std::get<Blob>(content);
+	return nullopt;
+}
diff --git a/src/storage.h b/src/storage.h
new file mode 100644
index 0000000..2a0ad7e
--- /dev/null
+++ b/src/storage.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "erebos/storage.h"
+
+#include <future>
+
+namespace fs = std::filesystem;
+
+using std::optional;
+using std::shared_future;
+using std::vector;
+
+namespace erebos {
+
+struct Storage::Priv
+{
+	static constexpr size_t CHUNK = 16384;
+
+	fs::path root;
+
+	fs::path objectPath(const Digest &) const;
+	optional<vector<uint8_t>> loadBytes(const Digest &) const;
+	void storeBytes(const Digest &, const vector<uint8_t> &) const;
+};
+
+struct Ref::Priv
+{
+	Storage storage;
+	Digest digest;
+
+	shared_future<Object> object;
+};
+
+}
author	Roman Smrž <roman.smrz@seznam.cz>	2019-10-26 22:19:56 +0200
committer	Roman Smrž <roman.smrz@seznam.cz>	2019-12-10 21:29:28 +0100
commit	361891f25ca735fd85db64a14823cc55b8a0619a (patch)
tree	d45734fbc123d8dcb26128b314eb4453ecdd154e /src
parent	1a1b36dea942cd7b18067f3f1220c9ab4f9b4448 (diff)