Refactoring of tokenizer and stream classes for better efficiency and maintainability.

This commit is contained in:
2017-06-21 02:56:27 +02:00
parent 97c6e58355
commit 39c0e27cb2
38 changed files with 466 additions and 512 deletions

View File

@@ -1,6 +1,8 @@
#ifndef __TOKENIZE__LOCATION_H
#define __TOKENIZE__LOCATION_H
#include <string>
#include <tokenize/StreamPosition.h>
namespace tokenize
@@ -16,37 +18,19 @@ class Stream;
////////////////////////////////////////////////////////////////////////////////////////////////////
class Location
struct Location
{
public:
Location(Stream &stream);
Location(Stream &stream, StreamPosition position);
StreamPosition position{InvalidStreamPosition};
const char *sectionStart() const;
const char *sectionEnd() const;
// TODO: think about avoiding copying strings
std::string sectionStart;
std::string sectionEnd;
StreamPosition rowStart() const;
StreamPosition rowEnd() const;
StreamPosition rowStart{InvalidStreamPosition};
StreamPosition rowEnd{InvalidStreamPosition};
StreamPosition columnStart() const;
StreamPosition columnEnd() const;
private:
void initializeLazily() const;
Stream &m_stream;
const StreamPosition m_position;
mutable bool m_isInitialized{false};
mutable const char *m_sectionStart{nullptr};
mutable const char *m_sectionEnd{nullptr};
mutable StreamPosition m_rowStart{InvalidStreamPosition};
mutable StreamPosition m_rowEnd{InvalidStreamPosition};
mutable StreamPosition m_columnStart{InvalidStreamPosition};
mutable StreamPosition m_columnEnd{InvalidStreamPosition};
StreamPosition columnStart{InvalidStreamPosition};
StreamPosition columnEnd{InvalidStreamPosition};
};
////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -1,8 +1,10 @@
#ifndef __TOKENIZE__STREAM_H
#define __TOKENIZE__STREAM_H
#include <algorithm>
#include <cassert>
#include <experimental/filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
@@ -24,38 +26,123 @@ namespace tokenize
class Stream
{
public:
struct Delimiter
struct Section
{
StreamPosition position;
std::string sectionName;
std::string name;
std::vector<StreamPosition> newlines;
};
public:
Stream();
explicit Stream(std::string streamName, std::istream &istream);
// Constructs a stream that holds no content and no sections yet.
// Sets the LC_NUMERIC category of the global C locale to "C"
// (presumably to keep number handling independent of the user's locale — confirm).
Stream()
{
std::setlocale(LC_NUMERIC, "C");
}
// Constructs a stream and immediately reads the given input stream into it,
// using streamName as the name of the resulting section.
// NOTE(review): unlike the default constructor, this constructor does not call
// std::setlocale(LC_NUMERIC, "C") — confirm that this difference is intended.
explicit Stream(std::string streamName, std::istream &istream)
{
read(streamName, istream);
}
~Stream() = default;
Stream(const Stream &other) = delete;
Stream &operator=(const Stream &other) = delete;
Stream(Stream &&other) = delete;
Stream &operator=(Stream &&other) = delete;
Stream(Stream &&other) = default;
Stream &operator=(Stream &&other) = default;
void read(std::string streamName, std::istream &istream);
void read(const std::experimental::filesystem::path &path);
void reset();
void seek(StreamPosition position);
StreamPosition position() const;
const std::vector<Delimiter> &delimiters() const
// Appends the entire content of the given input stream to the stored content and registers it
// as a new section named streamName. The indices of all newline characters appended by this call
// are recorded (as absolute indices into the stored content) in the new section's newline list.
void read(std::string streamName, std::istream &istream)
{
// NOTE(review): the following return statement looks like a leftover of the removed
// delimiters() accessor in this diff view and does not fit this void function — confirm
return m_delimiters;
// Store position of new section
m_sections.push_back({m_content.size(), streamName, {}});
// Index of the first character that this call appends
const auto contentStartIndex = m_content.size();
// Try to determine the size of the input by seeking to its end, in order to reserve
// the required capacity before copying
try
{
istream.seekg(0, std::ios::end);
const auto streamSize = istream.tellg();
istream.seekg(0, std::ios::beg);
m_content.reserve(m_content.size() + streamSize);
}
catch (const std::exception &exception)
{
// A failure here is not treated as fatal: only the stream's error state flags are reset
istream.clear();
}
// Copy all remaining characters of the input to the end of the stored content
std::copy(std::istreambuf_iterator<char>(istream), std::istreambuf_iterator<char>(), std::back_inserter(m_content));
// Record the index of every newline character within the newly appended range
for (auto i = contentStartIndex; i < m_content.size(); i++)
if (m_content[i] == '\n')
m_sections.back().newlines.emplace_back(i);
}
// Reads the file at the given path by forwarding an input file stream to
// read(std::string, std::istream &), using the path as the stream name.
// Throws std::runtime_error if the path does not refer to a regular file or if the file
// cannot be opened for reading.
void read(const std::experimental::filesystem::path &path)
{
	if (!std::experimental::filesystem::is_regular_file(path))
		throw std::runtime_error("File does not exist: “" + path.string() + "”");

	std::ifstream fileStream(path.string(), std::ios::in);

	// An existing file may still be unreadable (for example, due to missing permissions);
	// report this instead of silently appending an empty section
	if (!fileStream.is_open())
		throw std::runtime_error("Could not open file: “" + path.string() + "”");

	read(path.string(), fileStream);
}
// Moves the current position back to the very beginning of the content (index 0).
void reset()
{
m_position = 0;
}
// Sets the current position to the given position.
// No range check against the content size is performed here.
void seek(StreamPosition position)
{
m_position = position;
}
// Returns the current position, that is, the index into the content used by the next access.
StreamPosition position() const
{
return m_position;
}
// Computes the location (section name, row, and column) of the current position.
// Rows and columns are 1-based. If no section starts at or before the current position
// (for instance, because nothing has been read yet), the returned location carries only the
// position and leaves all other fields at their default values.
Location location() const
{
	// Find the first section starting after the current position; its predecessor is the
	// section containing the current position
	const auto nextSection = std::upper_bound(m_sections.cbegin(), m_sections.cend(), m_position,
		[](const auto &streamPosition, const auto &candidate)
		{
			return streamPosition < candidate.position;
		});

	// Without a preceding section, stepping back from the begin iterator would be undefined
	// behavior as soon as assertions are disabled, so this case is handled explicitly
	if (nextSection == m_sections.cbegin())
	{
		Location unknownLocation;
		unknownLocation.position = m_position;
		return unknownLocation;
	}

	const auto &section = *(nextSection - 1);
	const auto &newlines = section.newlines;

	// Find the first newline at or after the current position; the number of newlines in
	// front of it determines the row
	const auto nextNewline = std::lower_bound(newlines.cbegin(), newlines.cend(), m_position);
	const auto row = static_cast<StreamPosition>(nextNewline - newlines.cbegin() + 1);

	// In the first row, columns are counted from the start of the section, in all other rows
	// from the newline that precedes the current position
	const auto column = (nextNewline == newlines.cbegin())
		? static_cast<StreamPosition>(m_position - section.position + 1)
		: static_cast<StreamPosition>(m_position - *(nextNewline - 1));

	return {m_position, section.name, section.name, row, row, column, column};
}
// Returns read-only access to the list of sections registered by the read functions.
const std::vector<Section> &sections() const
{
return m_sections;
}
char currentCharacter()
{
check();
return m_stream[m_position];
return m_content[m_position];
}
void advance()
@@ -64,22 +151,42 @@ class Stream
m_position++;
}
// Moves the current position forward by one character.
// This function itself performs no end-of-content check.
void advanceUnchecked()
{
m_position++;
}
bool atEnd() const
{
return m_position >= m_stream.size();
return m_position >= m_content.size();
}
void check()
{
if (atEnd())
throw TokenizerException(*this, "reading past end of file");
throw TokenizerException(location(), "reading past end of file");
}
// Returns the number of characters of the stored content.
StreamPosition size() const
{
return m_content.size();
}
// Returns mutable access to the stored content.
// NOTE(review): changes made through this reference are not reflected in the recorded
// section positions and newline indices — confirm that callers keep them consistent.
std::string &content()
{
return m_content;
}
// Returns read-only access to the stored content.
const std::string &content() const
{
return m_content;
}
protected:
std::string m_stream;
std::string m_content;
mutable StreamPosition m_position{0};
std::vector<Delimiter> m_delimiters;
std::vector<Section> m_sections;
};
////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -27,20 +27,19 @@ struct Tag
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class TokenizerPolicy = CaseSensitiveTokenizerPolicy>
class Tokenizer: public Stream, public TokenizerPolicy
class Tokenizer : public Stream, public TokenizerPolicy
{
template<class OtherTokenizerPolicy>
friend class Tokenizer;
public:
explicit Tokenizer();
explicit Tokenizer() noexcept;
explicit Tokenizer(std::string streamName, std::istream &istream);
template<class OtherTokenizer>
Tokenizer(OtherTokenizer &&otherTokenizer)
Tokenizer(OtherTokenizer &&other) noexcept
: Stream(std::forward<OtherTokenizer>(other))
{
m_stream = std::move(otherTokenizer.m_stream);
m_delimiters = std::move(otherTokenizer.m_delimiters);
}
void removeComments(const std::string &startSequence, const std::string &endSequence, bool removeEnd);
@@ -94,8 +93,7 @@ class Tokenizer: public Stream, public TokenizerPolicy
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class TokenizerPolicy>
Tokenizer<TokenizerPolicy>::Tokenizer()
: Stream()
Tokenizer<TokenizerPolicy>::Tokenizer() noexcept
{
}
@@ -189,7 +187,7 @@ void Tokenizer<TokenizerPolicy>::expect(const Type &expectedValue)
std::stringstream message;
message << "unexpected value, expected “" << expectedValue << "";
throw TokenizerException(*this, message.str());
throw TokenizerException(location(), message.str());
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -208,7 +206,7 @@ std::string Tokenizer<TokenizerPolicy>::getIdentifier()
if (!TokenizerPolicy::isIdentifierCharacter(character))
{
if (value.empty())
throw TokenizerException(*this, "could not parse identifier");
throw TokenizerException(location(), "could not parse identifier");
return value;
}
@@ -289,31 +287,31 @@ template<class TokenizerPolicy>
void Tokenizer<TokenizerPolicy>::removeComments(const std::string &startSequence, const std::string &endSequence, bool removeEnd)
{
// TODO: move to appropriate place
for (auto &character : m_stream)
for (auto &character : m_content)
character = TokenizerPolicy::transformCharacter(character);
const auto removeRange =
[&](const auto &start, const auto &end)
{
const auto previousPosition = m_position;
const auto previousPosition = position();
assert(start < m_stream.size());
assert(start < m_content.size());
m_position = start;
seek(start);
while (m_position < end)
while (position() < end)
{
if (atEnd())
return;
m_stream[m_position] = ' ';
m_position++;
m_content[position()] = ' ';
advanceUnchecked();
}
m_position = previousPosition;
seek(previousPosition);
};
m_position = 0;
seek(0);
// TODO: refactor
while (!atEnd())
@@ -325,13 +323,13 @@ void Tokenizer<TokenizerPolicy>::removeComments(const std::string &startSequence
if ((startSequenceFound = testAndSkip(startSequence)))
break;
advance();
advanceUnchecked();
}
if (!startSequenceFound && atEnd())
break;
const auto startPosition = m_position - startSequence.size();
const auto startPosition = position() - startSequence.size();
bool endSequenceFound = false;
@@ -340,21 +338,21 @@ void Tokenizer<TokenizerPolicy>::removeComments(const std::string &startSequence
if ((endSequenceFound = testAndSkip(endSequence)))
break;
advance();
advanceUnchecked();
}
// If the end sequence is to be removed or could not be found, remove entire range
const auto endPosition =
(removeEnd || !endSequenceFound)
? m_position
: m_position - endSequence.size();
? position()
: position() - endSequence.size();
removeRange(startPosition, endPosition);
m_position = endPosition + 1;
seek(endPosition + 1);
}
m_position = 0;
seek(0);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -406,7 +404,7 @@ uint64_t Tokenizer<TokenizerPolicy>::getIntegerBody()
check();
if (!std::isdigit(currentCharacter()))
throw TokenizerException(*this, "could not read integer value");
throw TokenizerException(location(), "could not read integer value");
uint64_t value = 0;
@@ -420,7 +418,7 @@ uint64_t Tokenizer<TokenizerPolicy>::getIntegerBody()
value *= 10;
value += character - '0';
advance();
advanceUnchecked();
}
return value;
@@ -448,7 +446,7 @@ uint64_t Tokenizer<TokenizerPolicy>::getImpl(Tag<uint64_t>)
skipWhiteSpace();
if (currentCharacter() == '-')
throw TokenizerException(*this, "expected unsigned integer, got signed one");
throw TokenizerException(location(), "expected unsigned integer, got signed one");
return getIntegerBody();
}
@@ -482,7 +480,7 @@ bool Tokenizer<TokenizerPolicy>::getImpl(Tag<bool>)
if (testAndSkip<char>('1'))
return true;
throw TokenizerException(*this, "could not read Boolean value");
throw TokenizerException(location(), "could not read Boolean value");
}
////////////////////////////////////////////////////////////////////////////////////////////////////