Scid  4.6.5
spellchk.cpp
Go to the documentation of this file.
1 /*
2 * Copyright (C) 2015 Fulvio Benini
3 * Copyright (c) 2001-2003 Shane Hudson (2nd part of the file)
4 
5 * This file is part of Scid (Shane's Chess Information Database).
6 *
7 * Scid is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation.
10 *
11 * Scid is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Scid. If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #include "spellchk.h"
21 #include "date.h"
22 #include "filebuf.h"
23 #include "misc.h"
24 
25 
26 namespace {
27 
/// Kind of information carried by one line of a "spelling" file,
/// as classified by Parser::Parser().
enum InfoType {
    SPELL_SECTIONSTART,  // line starting with '@'
    SPELL_NEWNAME,       // line starting with any character not listed here
    SPELL_ALIAS,         // line starting with '='
    SPELL_PREFIX,        // line starting with "%Prefix "
    SPELL_INFIX,         // line starting with "%Infix "
    SPELL_SUFFIX,        // line starting with "%Suffix "
    SPELL_BIO,           // line starting with "%Bio "
    SPELL_ELO,           // line starting with "%Elo "
    SPELL_EMPTY,         // nothing left after removing comment and white-space
    SPELL_OLDBIO,        // line starting with '>'
    SPELL_UNKNOWN        // '%' line with an unrecognized directive
};
34 
35 struct Parser {
36  char* name;
37  char* extra;
38  enum InfoType type;
39 
40  Parser(char* line);
41 };
42 
43 /**
44  * Parser::Parser() - Parse a "spelling" line.
45  *
46  * Fill data members doing the following tasks:
47  * - separate the optional comment (a comment starts with '#' and
48  * extend to the end of the line) from the name data.
49  * - remove leading and trailing white-spaces.
50  * - identify the type of data
51  */
52 Parser::Parser(char* line) {
53  ASSERT(line != 0);
54 
55  extra = strchr(line, '#');
56  if (extra != NULL) {
57  // Make [line, extra) a null terminated string
58  *extra++ = 0;
59  }
60  name = (char*) strTrimLeft(line);
62 
63  type = SPELL_UNKNOWN;
64  switch (*name) {
65  case 0:
66  type = SPELL_EMPTY;
67  break;
68  case '>':
69  type = SPELL_OLDBIO;
70  break;
71  case '=':
72  type = SPELL_ALIAS;
73  // Skip over "=" and spaces:
74  name++;
75  while (*name == ' ') { name++; }
76  break;
77  case '%':
78  if (strIsPrefix("%Elo ", name)) {
79  type = SPELL_ELO;
80  name += 5; //Skip "%Elo "
81  } else if (strIsPrefix("%Bio ", name)) {
82  type = SPELL_BIO;
83  name += 5; //Skip "%Bio "
84  } else if (strIsPrefix("%Prefix ", name)) {
85  type = SPELL_PREFIX;
86  } else if (strIsPrefix("%Infix ", name)) {
87  type = SPELL_INFIX;
88  } else if (strIsPrefix("%Suffix ", name)) {
89  type = SPELL_SUFFIX;
90  }
91  break;
92  case '@':
93  type = SPELL_SECTIONSTART;
94  name++; //Skip '@'
95  // Now check if there is a list of characters to exclude from
96  // comparisons, e.g: @PLAYER ", .-"
97  // would indicate to exclude dots, commas, spaces and dashes.
98  extra = strchr(name, '"');
99  if (extra != NULL) {
100  char* end = strchr(++extra, '"');
101  if (end != NULL) {
102  *end = 0;
103  } else {
104  extra = NULL;
105  }
106  }
107  break;
108  default:
109  type = SPELL_NEWNAME;
110  if (extra != NULL) {
111  // Spelling files can provide player informations like titles/gender,
112  // countries, highest elo, date of birth, date of death. For example:
113  // Polgar, Judit #gm+w HUN [2735] 1976
114  strTrimRight(extra);
115  }
116  }
117 }
118 
119 } // End of anonymous namespace
120 
121 
122 /**
123  * class SpellChkLoader - load data into a SpellChecker object
124  *
125  * This class take parsed "spelling" data and store it into the right
126  * data members of the associated SpellChecker object.
127  * Reading from a "spelling" file is not stateless and the Parser object
128  * cannot contain all the necessary data: a SpellChkLoader object keep track
129  * of the current nameT section and the current correct name.
130  * The SpellChkValidate object is used to log ignored data, usually
131  * caused by typos like "@Eol" or "@Preffix".
132  */
134  SpellChecker& sp_;
135  SpellChecker::SpellChkValidate& validate_;
136  nameT nt_;
137  int32_t nameIdx_;
138 
139 public:
140  SpellChkLoader(SpellChecker& sp, SpellChecker::SpellChkValidate& v)
141  : sp_(sp), validate_(v), nt_(NAME_INVALID), nameIdx_(-1) {
142  }
143 
144  errorT load(const Parser& data, bool* keepBuffer) {
145  ASSERT(keepBuffer != 0);
146  *keepBuffer = false;
147 
148  switch (data.type) {
149  case SPELL_SECTIONSTART:
150  nt_ = NameBase::NameTypeFromString(data.name);
152  if (data.extra != NULL) {
153  sp_.excludeChars_[nt_] = data.extra;
154  } else {
155  sp_.excludeChars_[nt_].clear();
156  }
157  nameIdx_ = -1;
158  return OK;
159  case SPELL_NEWNAME:
160  case SPELL_ALIAS:
161  case SPELL_PREFIX:
162  case SPELL_INFIX:
163  case SPELL_SUFFIX:
164  return nameSection(data, keepBuffer);
165  case SPELL_BIO:
166  case SPELL_ELO:
167  return playerInfo(data, keepBuffer);
168  case SPELL_EMPTY:
169  return OK;
170  case SPELL_OLDBIO:
171  case SPELL_UNKNOWN:
172  validate_.ignoredLine(data.name);
173  return OK;
174  }
175 
176  ASSERT(0);
177  return ERROR_CorruptData;
178  }
179 
180 private:
181  errorT nameSection(const Parser& data, bool* keepBuffer) {
182  // Must be in a valid name section
184 
185  switch (data.type) {
186  case SPELL_NEWNAME:
187  *keepBuffer = true;
188  nameIdx_ = sp_.names_[nt_].size();
189  sp_.names_[nt_].push_back(data.name);
190  if (nt_ == NAME_PLAYER) {
191  sp_.pInfo_.push_back(data.extra);
192  }
193  // go in SPELL_ALIAS:
194  case SPELL_ALIAS:
195  if (nameIdx_ == -1) {
196  return ERROR_CorruptData;
197  } else {
198  sp_.idx_[nt_].push_back(SpellChecker::Idx(
199  sp_.normalizeAndTransform(nt_, data.name),
200  nameIdx_
201  ));
202  }
203  return OK;
204  case SPELL_PREFIX:
205  return sp_.general_[nt_].addPrefix(data.name);
206  case SPELL_INFIX:
207  return sp_.general_[nt_].addInfix(data.name);
208  case SPELL_SUFFIX:
209  return sp_.general_[nt_].addSuffix(data.name);
210  default:
211  ASSERT(0);
212  }
213 
214  return ERROR_CorruptData;
215  }
216 
217  errorT playerInfo(const Parser& data, bool* keepBuffer) {
218  // SPELL_BIO and SPELL_ELO are valid only for a PLAYER name
219  if (nt_ != NAME_PLAYER || nameIdx_ == -1) return ERROR_CorruptData;
220 
221  if (data.type == SPELL_BIO) {
222  *keepBuffer = true;
223  sp_.pInfo_[nameIdx_].bio_.push_back(data.name);
224  } else {
225  ASSERT(data.type == SPELL_ELO);
226  // if necessary, add empty PlayerElo objects
227  sp_.pElo_.resize(nameIdx_ + 1);
228  sp_.pElo_[nameIdx_].AddEloData(data.name);
229  }
230 
231  return OK;
232  }
233 };
234 
235 
236 /**
237  * SpellChecker::read() - Read a "spelling" file.
238  *
239  * This functions tries to open the @filename file and to load the data
240  * into the SpellChecker object.
241  * The object must be empty. In practice the requirement is to not call
242  * this function twice, because this is the only non-const member function.
243  * If the function fails (result != OK) the object state is undefined
244  * and the only valid operation is to destroy the object.
245  * If SPELLCHKVALIDATE is defined, it also creates a @filename.validate log.
246  */
247 errorT SpellChecker::read(const char* filename, const Progress& progress)
248 {
249  ASSERT(filename != NULL);
250  ASSERT(staticStrings_ == NULL);
251 
252  // Open the file and get the file size.
253  Filebuf file;
254  std::streamsize fileSize = -1;
255  if (file.open(filename, std::ios::in | std::ios::binary | std::ios::ate) != 0) {
256  fileSize = file.pubseekoff(0, std::ios::cur, std::ios::in);
257  file.pubseekoff(0, std::ios::beg, std::ios::in);
258  }
259  if (fileSize == -1) return ERROR_FileOpen;
260 
261  SpellChkValidate validate(filename, *this);
262 
263  // Parse the file lines
264  staticStrings_ = (char*) malloc(fileSize + 1);
265  char* bEnd = staticStrings_ + fileSize + 1;
266  char* line = staticStrings_;
267  size_t nRead;
268  uint report_i = 0;
269  std::streamsize report_done = 0;
270  SpellChkLoader loader(*this, validate);
271  while ((nRead = file.getline(line, std::distance(line, bEnd))) != 0) {
272  report_done += nRead;
273  if ((++report_i % 10000) == 0) {
274  if (!progress.report(report_done, fileSize))
275  return ERROR_UserCancel;
276  }
277 
278  bool keepBuffer;
279  errorT err = loader.load(Parser(line), &keepBuffer);
280  if (err != OK) return err;
281 
282  if (keepBuffer) line += nRead;
283  }
284  if (report_done != fileSize || file.sgetc() != EOF) return ERROR_FileRead;
285 
286  // Success:
287  if (pElo_.size() > 0) {
288  // if necessary, add empty PlayerElo objects
289  pElo_.resize(pInfo_.size());
290  validate.checkEloData();
291  }
292 
293  #if CPP11_SUPPORT
294  // Free unused memory
295  char* shrink = (char*) realloc(staticStrings_, 1 + std::distance(staticStrings_,line));
296  if (shrink != NULL && shrink != staticStrings_) {
297  // Unlikely, but realloc() moved the memory: update the pointers.
298  const char* oldAddr = staticStrings_;
299  staticStrings_ = shrink;
300  for (nameT i=0; i < NUM_NAME_TYPES; i++) {
301  for (auto& e : (names_[i]))
302  e = staticStrings_ + std::distance(oldAddr, e);
303  }
304  for (auto& e : pInfo_) {
305  e.comment_ = staticStrings_ + std::distance(oldAddr, e.comment_);
306  for (auto& bio : e.bio_) {
307  bio = staticStrings_ + std::distance(oldAddr, bio);
308  }
309  }
310  }
311  #endif
312 
313  // Sort the index
314  for (nameT i=0; i < NUM_NAME_TYPES; i++) {
315  std::sort(idx_[i].begin(), idx_[i].end());
316  validate.idxDuplicates(i);
317  }
318  return OK;
319 }
320 
321 
322 //////////////////////////////////////////////////////////////////////
323 //
324 // FILE: spellchk.cpp
325 // SpellChecker class methods
326 //
327 // Part of: Scid (Shane's Chess Information Database)
328 // Version: 3.5
329 //
330 // Notice: Copyright (c) 2001-2003 Shane Hudson. All rights reserved.
331 //
332 // Author: Shane Hudson (sgh@users.sourceforge.net)
333 //
334 //////////////////////////////////////////////////////////////////////
335 
336 // Retrieve the list of Rating figures for given player (aka node) from the given (ssp) string
337 // The string is formatted as:
338 // [%Elo ]<year>:<<rating>|?>,...,<<rating>|?> [<year>:<<rating>|?>,...,<<rating>|?>...]
339 //
340 // The ratings are stored in a rating array for this player, in the order of appearance
341 // and without any assumption on the period that the rating refers to.
342 // This is accomplished by assuming that for all years the same number of rating figures
343 // could be given (see ELO_RATINGS_PER_YEAR above).
344 //
345 // The (external) algorithm to map ratings to actual periods must be able to cope with
346 // the holes that - as a consequence - will appear in the rating graph constructed here!
347 //
348 void PlayerElo::AddEloData(const char * str)
349 {
350  while (1) {
351  // Get the year in which the rating figures to follow were published
352  //
353  str = strTrimLeft (str);
354  if (! isdigit(static_cast<unsigned char>(*str))) { break; }
355  uint16_t year = strGetUnsigned (str);
356  str += 4;
357  if (*str != ':') { break; }
358  str++;
359 
360  // Now read all the ratings for this year:
361  //
362  eloT elo = 0;
363  while (1) {
364  if (isdigit(static_cast<unsigned char>(*str))) {
365  elo = strGetUnsigned (str);
366  str += 4;
367  } else if (*str == '?') {
368  elo = 0;
369  str++;
370  } else if (*str == ' ') {
371  break;
372  } else {
373  // Invalid data seen:
374  return;
375  }
376 
377  elo_.push_back(std::make_pair(year, elo));
378 
379  if (*str == ',') { str++; }
380  }
381  }
382 }
383 
384 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~
385 // PlayerInfo::GetTitle:
386 // Extract the first title appearing in the player
387 // comment, and return it.
388 const char *
390 {
391  static const char * titles[] = {
392  "gm", "im", "fm",
393  "wgm", "wim", "wfm", "w",
394  "cgm", "cim", "hgm",
395  NULL
396  };
397  const char ** titlePtr = titles;
398 
399  const char* comment = GetComment();
400  if (*comment == 0) { return ""; }
401 
402  while (*titlePtr != NULL) {
403  if (strIsPrefix (*titlePtr, comment)) { return *titlePtr; }
404  titlePtr++;
405  }
406  return "";
407 }
408 
409 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~
410 // PlayerInfo::GetLastCountry:
411 // Scan the player comment string for the country field (which
412 // is the second field, after the title), then return the
413 // last three letters in the country field, or the empty string
414 // if the country field is less than 3 characters long.
415 const char *
417 {
418  static char country[4];
419  country[0] = 0;
420 
421  const char* start = GetComment();
422  if (*start == 0) { return ""; }
423 
424  // Skip over the title field:
425  while (*start != ' ' && *start != 0) { start++; }
426  while (*start == ' ') { start++; }
427 
428  const char * end = start;
429  int length = 0;
430  while (*end != ' ' && *end != 0) { end++; length++; }
431  // Return the final three characters of the country field:
432  if (length >= 3) {
433  for (int i=0; i < 3; i++) { country[i] = start[length-3 + i]; }
434  country[3] = 0;
435  }
436  return country;
437 }
438 
439 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~
440 // PlayerInfo::GetPeakRating:
441 // Scan the player comment string for the peak rating
442 // field (which is contained in brackets), convert it
443 // to an unsigned integer, and return it.
444 eloT
446 {
447  const char* s = GetComment();
448  if (*s == 0) { return 0; }
449 
450  while (*s != '[' && *s != 0) { s++; }
451  if (*s != '[') { return 0; }
452  s++;
453  return strGetUnsigned (s);
454 }
455 
456 
457 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~
458 // PlayerInfo::GetBirthdate:
459 // Scan the player comment string for the birthdate
460 // field, convert it to a date, and return it.
461 dateT
463 {
464  const char* s = GetComment();
465  if (*s == 0) { return ZERO_DATE; }
466 
467  // Find the end-bracket character after the rating:
468  while (*s != ']' && *s != 0) { s++; }
469  if (*s != ']') { return ZERO_DATE; }
470  s++;
471  // Now skip over any spaces:
472  while (*s == ' ') { s++; }
473  if (*s == 0) { return ZERO_DATE; }
474  return date_EncodeFromString (s);
475 }
476 
477 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~
478 // PlayerInfo::GetDeathdate:
479 // Scan the player comment string for the deathdate
480 // field, convert it to a date, and return it.
481 dateT
483 {
484  const char* s = GetComment();
485  if (*s == 0) { return ZERO_DATE; }
486 
487  // Find the end-bracket character after the rating:
488  while (*s != ']' && *s != 0) { s++; }
489  if (*s != ']') { return ZERO_DATE; }
490  s++;
491  // Now skip over any spaces:
492  while (*s == ' ') { s++; }
493  // Now skip over the birthdate and dashes:
494  while (*s != 0 && *s != '-') { s++; }
495  while (*s == '-') { s++; }
496  if (*s == 0) { return ZERO_DATE; }
497  return date_EncodeFromString (s);
498 }
499 
500 //////////////////////////////////////////////////////////////////////
501 // EOF: spellchk.cpp
502 //////////////////////////////////////////////////////////////////////
const char * getLastCountry() const
Definition: spellchk.cpp:416
bool report(size_t done, size_t total) const
Definition: misc.h:136
const char * strTrimLeft(const char *target, const char *trimChars)
Definition: misc.cpp:347
const errorT OK
Definition: error.h:23
bool strIsPrefix(const char *prefix, const char *longStr)
Definition: misc.h:412
playerInfo?player?
Definition: pinfo.tcl:347
static nameT NameTypeFromString(const char *str)
Definition: namebase.cpp:370
uint dateT
Definition: common.h:155
#define ASSERT(f)
Definition: common.h:67
errorT addPrefix(const char *s)
add*fix() - add a general correction
Definition: spellchk.h:111
Definition: misc.h:124
names
Definition: tablebase.tcl:260
SpellChkLoader(SpellChecker &sp, SpellChecker::SpellChkValidate &v)
Definition: spellchk.cpp:140
uint fileSize(const char *name, const char *suffix)
Definition: misc.cpp:764
class SpellChecker - name spelling
Definition: spellchk.h:257
dateT getBirthdate() const
Definition: spellchk.cpp:462
const dateT ZERO_DATE
Definition: date.h:34
errorT addInfix(const char *s)
Definition: spellchk.h:112
const char * getTitle() const
Definition: spellchk.cpp:389
const errorT ERROR_FileRead
Definition: error.h:33
uint nameT
Definition: namebase.h:29
const errorT ERROR_UserCancel
Definition: error.h:27
uint strTrimRight(char *target, const char *trimChars)
Definition: misc.h:508
static bool IsValidNameType(nameT nt)
Definition: namebase.h:79
sort?type?
Definition: analysis.tcl:321
uint32_t uint
Definition: common.h:99
eloT getPeakRating() const
Definition: spellchk.cpp:445
ushort eloT
Definition: common.h:160
const errorT ERROR_CorruptData
Definition: error.h:46
uint32_t strGetUnsigned(const char *str)
Definition: misc.h:276
unsigned short errorT
Definition: error.h:20
Adds some helper functions to std::filebuf:
Definition: filebuf.h:35
class SpellChkLoader - load data into a SpellChecker object
Definition: spellchk.cpp:133
Extends the std:filebuf class with performance improvements.
dateT getDeathdate() const
Definition: spellchk.cpp:482
errorT load(const Parser &data, bool *keepBuffer)
Definition: spellchk.cpp:144
errorT addSuffix(const char *s)
Definition: spellchk.h:113
InfoType
Definition: spellchk.cpp:28
const errorT ERROR_FileOpen
Definition: error.h:31
size_t getline(char *str, size_t count)
Equivalent to std::fstream::getline, but faster (no sentry [27.7.2.1.3]).
Definition: filebuf.h:86
void AddEloData(const char *str)
Definition: spellchk.cpp:348
dateT date_EncodeFromString(const char *str)
Definition: date.h:130