Scid  4.7.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
pgn_lexer.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2018 Fulvio Benini.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
20  * THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 /** @file
24  * Split input into PGN tokens and dispatch them to a "visiting" parser.
25  */
26 
27 #ifndef _PGN_LEXER_H
28 #define _PGN_LEXER_H
29 
30 #include <algorithm>
31 #include <cassert>
32 
33 namespace pgn_impl {
34 // "PGN character data is organized as tokens. A token is a contiguous
35 // sequence of characters that represents a basic semantic unit. Tokens
36 // may be separated from adjacent tokens by white space characters.
37 // (White space characters include space, newline, and tab characters.)
38 // Some tokens are self delimiting and do not require white space
39 // characters."
40 
/**
 * Returns one half of the 128-bit bitmap of the characters accepted inside a
 * PGN symbol token: char @e ch is accepted when bit (ch % 64) of half
 * (ch / 64) is set.
 *
 * "A symbol token starts with a letter or digit character and is immediately
 * followed by a sequence of zero or more symbol continuation characters.
 * These continuation characters are letter characters ("A-Za-z"), digit
 * characters ("0-9"), the underscore ("_"), the plus sign ("+"), the octothorpe
 * sign ("#"), the equal sign ("="), the colon (":"), and the hyphen ("-")."
 *
 * In addition to the standard set, the map also accepts '/' and '~' (draw
 * results and unclear evaluations) and ',' and '@' (chess variants).
 * @param elem: 0 for the lower 64 bits (chars 0-63); any other value selects
 *              the higher 64 bits (chars 64-127).
 * @returns the requested half of the bitmap.
 */
constexpr unsigned long long init_symbol_map(unsigned elem) {
    // The two halves are precomputed literals: building them with constexpr
    // loops requires gcc >= 6.2 or clang >= 3.5.
    //   lower half: '#' '+' ',' '-' '/' '0'-'9' ':' '='
    //   upper half: '@' 'A'-'Z' '_' 'a'-'z' '~'
    return (elem != 0) ? 0x47fffffe87ffffffULL : 0x27ffb80800000000ULL;
}
82 
83 /**
84  * Checks if the given character is a PGN symbol.
85  * @param ch: character to classify.
86  * @returns true if @e ch is a PGN symbol character, false otherwise.
87  */
88 inline bool is_PGNsymbol(unsigned char ch) {
89  constexpr unsigned long long tok_map[] = {init_symbol_map(0),
90  init_symbol_map(1)};
91  auto high = ch / 64;
92  auto low = ch % 64;
93  return high > 1 ? false : tok_map[high] & (1ULL << low);
94 }
95 
/**
 * Tells whether a character is one of the 10 ASCII decimal digits.
 * @param ch: character to classify.
 * @returns true if @e ch is in the range '0'-'9', false otherwise.
 */
inline bool is_PGNdigit(unsigned char ch) {
    return !(ch < '0' || ch > '9');
}
102 
/**
 * Tells whether a character is treated as white space by the lexer.
 *
 * The PGN standard says that "white space characters include space, newline,
 * and tab characters"; this function also accepts the carriage return and the
 * vertical tab.
 * @param ch: character to classify.
 * @returns true for ' ', '\\n', '\\r', '\\t' and '\\v', false otherwise.
 */
inline bool is_PGNwhitespace(unsigned char ch) {
    switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\r':
        return true;
    default:
        return false;
    }
}
112 
/**
 * Classifies a token as a game termination marker.
 *
 * "The game termination marker is a symbol that is one of the following four
 * values: "1-0" (White wins), "0-1" (Black wins), "1/2-1/2" (drawn game),
 * and "*" (game in progress, result unknown, or game abandoned)."
 *
 * Besides the standard spellings, the variants "1:0", "0:1", "1/2:1/2" and the
 * short form "1/2" are recognized. The "*" marker is not handled here.
 * @param tok: the token to classify; a range of chars [tok.first, tok.second).
 * @returns
 * - '1' for "White wins",
 * - '0' for "Black wins",
 * - '/' for "drawn game",
 * - 0 (false) if it's not one of the recognized markers.
 */
template <typename TView> char is_PGNtermination(TView tok) {
    const auto length = std::distance(tok.first, tok.second);

    // Only invoked with markers that have exactly "length" chars.
    const auto matches = [&tok](const char* marker) {
        return std::equal(tok.first, tok.second, marker);
    };

    if (length == 3) {
        if (matches("1-0") || matches("1:0"))
            return '1';
        if (matches("0-1") || matches("0:1"))
            return '0';
        if (matches("1/2"))
            return '/';
        return 0;
    }

    if (length == 7 && (matches("1/2-1/2") || matches("1/2:1/2")))
        return '/';

    return 0;
}
147 
148 /**
149  * Read a token and dispatch it to a PGN parser.
150  * The first char of the token is used to determine its termination.
151  * @param ch: the first char of the token.
152  * @param input: the input to get data from.
153  * @param parser: will receive the tokens via visitPGN_* functions.
154  * @param section: -1 pregame, 0 for tag pair section, 1 for movetext section.
155  * @returns the result of the invoked parser's function.
156  */
157 template <typename TInput, typename TVisitor>
158 bool parse_token(char ch, TInput& input, TVisitor& parser, int& section) {
159  switch (ch) {
160  case ' ': // self terminating
161  case '\t': // self terminating
162  case '\v': // self terminating
163  case '\r': // self terminating
164  return true;
165 
166  case '\n': // self terminating
167  return parser.visitPGN_EndOfLine();
168 
169  case '.': // self terminating
170  return true;
171 
172  case '<': // self terminating
173  return true;
174 
175  case '>': // self terminating
176  return true;
177 
178  case '*': // self terminating
179  return parser.visitPGN_ResultFinal('*');
180 
181  case '(': // self terminating
182  return parser.visitPGN_VariationStart();
183 
184  case ')': // self terminating
185  return parser.visitPGN_VariationEnd();
186 
187  case '[': // --> ']', can span multiple lines
188  if (section <= 0) {
189  section = 0;
190  auto skip_spaces = [&]() {
191  auto spaces = input.read_while(is_PGNwhitespace);
192  while (spaces.first != spaces.second) {
193  if (*spaces.first++ == '\n')
194  parser.visitPGN_EndOfLine();
195  }
196  };
197  skip_spaces();
198  auto tag = input.read_while(is_PGNsymbol);
199 
200  skip_spaces();
201  auto value = input.read_until(']');
202 
203  // Remove the " char at the start and deal with the special case of
204  // a ] char inside the string token.
205  if (value.first != value.second && *value.first == '"') {
206  auto is_terminated = [&]() {
207  for (auto it = value.first; it != value.second; ++it) {
208  if (*it == '"')
209  return true;
210  if (*it == '\\' && ++it == value.second)
211  break;
212  }
213  return false;
214  };
215  ++value.first;
216  while (!input.last_column() && !is_terminated()) {
217  value.second = input.read_until(']').second;
218  }
219  }
220  // trim right
221  while (value.first != value.second) {
222  auto last_ch = *--value.second;
223  if (last_ch == '"') {
224  break;
225  }
226  if (!is_PGNwhitespace(last_ch)) {
227  ++value.second;
228  break;
229  }
230  if (last_ch == '\n')
231  parser.visitPGN_EndOfLine();
232  }
233  return parser.visitPGN_TagPair(tag, value);
234  }
235  input.sungetc();
236  parser.visitPGN_inputUnexpectedPGNHeader();
237  return false;
238 
239  case '{': // --> '}', can span multiple lines
240  return parser.visitPGN_Comment(input.read_until('}'));
241 
242  case ';': // --> '\n'
243  return parser.visitPGN_Comment(input.read_line());
244 
245  case '%': // --> '\n', only if "appearing in the first column of a line"
246  if (input.first_column()) {
247  return parser.visitPGN_Escape(input.read_line());
248  }
249  return parser.visitPGN_Unknown(
250  input.read_token([](char c) { return c == '%'; }));
251 
252  case '$': // terminated just prior to the first non-digit character
253  return parser.visitPGN_NAG(input.read_token(is_PGNdigit));
254 
255  case '?': // Suffix annotations: "!", "?", "!!", "!?", "?!", and "??"
256  case '!': // "At most one such suffix annotation may appear per move"
257  return parser.visitPGN_Suffix(
258  input.read_token([](char c) { return c == '!' || c == '?'; }));
259  }
260 
261  // "A symbol token is terminated just prior to the first non-symbol
262  // character following the symbol character sequence."
263  auto tok = input.read_token(is_PGNsymbol);
264  bool epd = (section < 0 && std::count(tok.first, tok.second, '/') == 7);
265  section = 1;
266 
267  if (epd) {
268  tok.second = input.read_line().second;
269  parser.visitPGN_EPD(tok);
270  return false;
271  }
272 
273  auto notdigit = std::find_if_not(tok.first, tok.second, is_PGNdigit);
274  if (notdigit == tok.first)
275  return parser.visitPGN_SANMove(tok);
276 
277  if (notdigit == tok.second)
278  return parser.visitPGN_MoveNum(tok);
279 
280  if (auto result = is_PGNtermination(tok))
281  return parser.visitPGN_ResultFinal(result);
282 
283  return parser.visitPGN_Unknown(tok);
284 }
285 
/**
 * Sequential reader over a range of chars stored in memory.
 * The range [begin, end) is not copied nor owned: it must remain valid while
 * the object (and the ranges it returns) are in use.
 */
class InputMemory {
    const char* const begin_;
    const char* const end_;
    const char* it_; // The next char to be extracted.

public:
    InputMemory(const char* begin, const char* end)
        : begin_(begin), end_(end), it_(begin) {}

    /// Reads one character and advances the input sequence by one character.
    /// Must not be invoked when eof() is true.
    char sbumpc() {
        assert(it_ != end_);
        return *it_++;
    }

    /// Makes the most recently extracted character available again.
    /// It is valid whenever at least one char has been extracted, including
    /// when the extracted char was the last one of the input.
    void sungetc() {
        assert(it_ != begin_);
        --it_;
    }

    /// Returns true if there are no chars available.
    bool eof() const { return it_ == end_; }

    /// Returns the number of chars read.
    std::size_t n_read() const { return std::distance(begin_, it_); }

    /// Returns true if the most recently extracted character was the first
    /// character of the line (or of the input).
    bool first_column() const {
        return n_read() < 2 || *(it_ - 2) == '\n';
    }

    /// Returns true if the most recently extracted character was the last
    /// character of the line: the next char is a '\n' or a '\r', or there
    /// are no more chars.
    bool last_column() const { return eof() || *it_ == '\n' || *it_ == '\r'; }

    /// Returns the range of chars: [curr_char, '\n').
    /// The '\n' char is left as the next character to extract.
    std::pair<const char*, const char*> read_line() {
        auto first = it_;
        it_ = std::find(it_, end_, '\n');
        return {first, it_};
    }

    /// Returns the range of chars: [curr_char, delim).
    /// The delim char is skipped. If delim is not found the range extends to
    /// the end of the input.
    std::pair<const char*, const char*> read_until(char delim) {
        auto first = it_;
        it_ = std::find(it_, end_, delim);
        auto second = (it_ == end_) ? it_ : it_++;
        return {first, second};
    }

    /// Returns the range of chars: [curr_char, cond == true].
    /// The first char that does not satisfy cond is left as the next
    /// character to extract.
    template <typename Cond>
    std::pair<const char*, const char*> read_while(Cond cond) {
        auto first = it_;
        it_ = std::find_if_not(it_, end_, cond);
        return {first, it_};
    }

    /// Returns the range of chars: [last_extracted_char, cond == true].
    /// cond is not applied to last_extracted_char.
    /// At least one char must have been extracted.
    template <typename Cond>
    std::pair<const char*, const char*> read_token(Cond cond) {
        assert(it_ != begin_);
        auto first = it_ - 1;
        it_ = std::find_if_not(it_, end_, cond);
        return {first, it_};
    }
};
356 
357 } // namespace pgn_impl
358 
359 namespace pgn {
360 
361 /**
362  * Read a PGN game from memory, grouping characters in tokens and dispatching
363  * them to a PGN parser.
364  * @param input: the memory range containing the PGN game.
365  * @param parser: will receive the tokens via visitPGN_* functions.
366  * Parsing is aborted if it returns false.
367  * @returns a std::pair containing the number of chars parsed, and true if at
368  * least a tag-pair token or a symbol token was dispatched.
369  */
370 template <typename TVisitor>
371 std::pair<std::size_t, bool> parse_game(pgn_impl::InputMemory input,
372  TVisitor&& parser) {
373  int section = -1;
374  do {
375  if (input.eof()) {
376  if (section >= 0)
377  parser.visitPGN_inputEOF();
378  break;
379  }
380  } while (pgn_impl::parse_token(input.sbumpc(), input, parser, section));
381 
382  return {input.n_read(), section >= 0};
383 }
384 
385 /**
386  * Normalize white spaces and converts Latin-1 chars to UTF-8 sequences.
387  *
388  * The original PGN standard used a subset of ISO 8859/1 (Latin 1):
389  * "Code value from 0 to 126 are the standard ASCII character set."
390  * "Code value from 127 to 191 are not used for PGN data representation."
391  * "Code value from 192 to 255 are mostly alphabetic printing characters with
392  * various diacritical marks; their use is encouraged for those languages
393  * that require such characters."
394  * However this do not allow internationalization for comments and names
395  * (players, sites, etc...); the common UTF-8 is a superior alternative.
396  * @param unescape: if true converts \\ to \ and \" to ".
397  * @param str: the string to be normalized.
398  * @param pos: start of the substring of @e str that will be normalized.
399  * @returns the number of '\n' chars in @e str.
400  */
401 template <bool unescape = false, typename TString>
402 std::size_t normalize(TString& str, std::size_t pos) {
403  std::size_t n_newlines = 0;
404  for (std::size_t i = pos, n = str.size(); i < n; ++i) {
405  unsigned char ch = str[i];
406  // An invalid UTF-8 sequence is considered a Latin1 char and converted.
407  if (ch > 0xBF) {
408  unsigned char nxt = (i + 1 != n) ? str[i + 1] : 0;
409  if (nxt < 0x80 || nxt > 0xBF) {
410  str[i] = static_cast<unsigned char>(ch & 0xBF);
411  str.insert(str.begin() + i, static_cast<unsigned char>(0xC3));
412  ++i;
413  ++n;
414  }
415  } else if (ch == '\n' || ch == '\r' || ch == '\t' || ch == '\v') {
416  if (ch == '\n')
417  ++n_newlines;
418 
419  // Tab and new line characters are removed if there is an adjacent
420  // space, or converted to a normal space otherwise.
421  if (i == pos || // First char
422  (i + 1) == n || // Last char
423  str[i - 1] == ' ' || // Preceded by a space
424  pgn_impl::is_PGNwhitespace(str[i + 1])) // Followed by a space
425  {
426  str.erase(i, 1);
427  --i;
428  --n;
429  } else {
430  str[i] = ' ';
431  }
432  } else if (unescape && ch == '\\' && i + 1 != n) {
433  // "A quote inside a string is represented by the backslash
434  // immediately followed by a quote. A backslash inside a string is
435  // represented by two adjacent backslashes."
436  if (str[i + 1] == '\\' || str[i + 1] == '"') {
437  str.erase(i, 1);
438  --n;
439  }
440  }
441  }
442  return n_newlines;
443 }
444 
/**
 * Escapes the quote and backslash chars, as required by the PGN standard:
 * "A quote inside a string is represented by the backslash immediately followed
 * by a quote. A backslash inside a string is represented by two adjacent
 * backslashes."
 * @param str: the string containing the chars to be escaped (modified in
 *             place).
 * @param pos: start of the substring of @e str to be processed; it must not
 *             be greater than the size of @e str.
 */
template <typename TString> void escape_string(TString& str, std::size_t pos) {
    const auto needs_escape = [](char ch) { return ch == '\"' || ch == '\\'; };

    auto cursor = std::find_if(str.begin() + pos, str.end(), needs_escape);
    while (cursor != str.end()) {
        // insert() returns the position of the new backslash: resume the
        // search after the char that was just escaped.
        cursor = str.insert(cursor, '\\');
        cursor += 2;
        cursor = std::find_if(cursor, str.end(), needs_escape);
    }
}
464 
/**
 * Trims the leading and trailing white spaces (' ', '\\n', '\\r', '\\t', '\\v').
 * @param str: the range of chars [str.first, str.second) to trim; its two
 *             ends are moved inward past the white spaces.
 * @returns the number of '\n' chars that were trimmed.
 */
template <typename TView> std::size_t trim(TView& str) {
    std::size_t newlines = 0;

    // Returns true for the white spaces, counting the new lines.
    const auto consume = [&newlines](char ch) {
        switch (ch) {
        case '\n':
            ++newlines;
            return true;
        case ' ':
        case '\r':
        case '\t':
        case '\v':
            return true;
        default:
            return false;
        }
    };

    // Leading white spaces.
    while (str.first != str.second && consume(*str.first))
        ++str.first;

    // Trailing white spaces.
    while (str.second != str.first) {
        auto last = str.second;
        --last;
        if (!consume(*last))
            break;
        str.second = last;
    }

    return newlines;
}
488 
489 } // namespace pgn
490 
491 #endif // _PGN_LEXER_H
bool last_column() const
Returns true if the most recently extracted character was the last character of the line...
Definition: pgn_lexer.h:319
std::pair< std::size_t, bool > parse_game(pgn_impl::InputMemory input, TVisitor &&parser)
Read a PGN game from memory, grouping characters in tokens and dispatching them to a PGN parser...
Definition: pgn_lexer.h:371
section?sec?
Definition: tablebase.tcl:269
std::pair< const char *, const char * > read_until(char delim)
Returns the range of chars: [curr_char, delim).
Definition: pgn_lexer.h:331
std::pair< const char *, const char * > read_while(Cond cond)
Returns the range of chars: [curr_char, cond == true].
Definition: pgn_lexer.h:340
char is_PGNtermination(TView tok)
Checks if a token is the game termination marker.
Definition: pgn_lexer.h:127
InputMemory(const char *begin, const char *end)
Definition: pgn_lexer.h:292
char sbumpc()
Reads one character and advances the input sequence by one character.
Definition: pgn_lexer.h:296
std::size_t n_read() const
Returns the number of chars read.
Definition: pgn_lexer.h:311
int find(const char *filename)
find() - search for a database.
Definition: dbasepool.cpp:51
bool is_PGNdigit(unsigned char ch)
Checks if the given character is one of the 10 decimal digits: 0123456789.
Definition: pgn_lexer.h:101
void sungetc()
Makes the most recently extracted character available again.
Definition: pgn_lexer.h:302
bool is_PGNsymbol(unsigned char ch)
Checks if the given character is a PGN symbol.
Definition: pgn_lexer.h:88
bool eof() const
Returns true if there are no chars available.
Definition: pgn_lexer.h:308
std::size_t trim(TView &str)
Trim leading and trailing white spaces.
Definition: pgn_lexer.h:470
bool is_PGNwhitespace(unsigned char ch)
Checks if the given character is a white space ("white space characters include space, newline, and tab characters").
Definition: pgn_lexer.h:109
void escape_string(TString &str, std::size_t pos)
Escape quote and backslash chars according to the PGN standard: "A quote inside a string is represent...
Definition: pgn_lexer.h:453
constexpr unsigned long long init_symbol_map(unsigned elem)
Creates a 128 bits bitmap of PGN symbol characters.
Definition: pgn_lexer.h:52
std::size_t normalize(TString &str, std::size_t pos)
Normalize white spaces and converts Latin-1 chars to UTF-8 sequences.
Definition: pgn_lexer.h:402
std::pair< const char *, const char * > read_token(Cond cond)
Returns the range of chars: [last_extracted_char, cond == true].
Definition: pgn_lexer.h:349
Definition: pgn_lexer.h:359
bool first_column() const
Returns true if the most recently extracted character was the first character of the line...
Definition: pgn_lexer.h:315
std::pair< const char *, const char * > read_line()
Returns the range of chars: [curr_char, '\n').
Definition: pgn_lexer.h:323
bool parse_token(char ch, TInput &input, TVisitor &parser, int &section)
Read a token and dispatch it to a PGN parser.
Definition: pgn_lexer.h:158