Scid  4.7.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
spellchk.h
Go to the documentation of this file.
1 /*
2 # Copyright (C) 2015 Fulvio Benini
3 
4 * This file is part of Scid (Shane's Chess Information Database).
5 *
6 * Scid is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation.
9 *
10 * Scid is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with Scid. If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #ifndef SPELLCHK_H
20 #define SPELLCHK_H
21 
22 #include "namebase.h"
23 #include "date.h"
24 #include <string>
25 #include <vector>
26 #include <utility>
27 #include <algorithm>
28 #ifdef SPELLCHKVALIDATE
29 #include <fstream>
30 #endif
31 
32 /*
33 * A "spelling" file contains the correct names for players, events, sites and rounds.
34 * Optionally it can provide further information about players, such as Elo ratings, birthdate, etc.
35 * See the header of spelling.ssp for a more detailed description of the format.
36 */
37 
38 
39 /**
40  * class NameNormalizer - apply general corrections to a name
41  *
42  * Spelling files can provide general corrections in the form:
43  * %Prefix "wrong prefix" "correct prefix"
44  * %Infix "wrong infix" "correct infix"
45  * %Suffix "wrong suffix" "correct suffix"
46  *
47  * Example:
48  * %Prefix "II " "2. "
49  * %Infix "3rd " "3. "
50  * %Suffix "(Italy)" "ITA"
51  * "II champ 3rd II 3rd (Italy) (Italy)" --> "2. champ 3. II 3. (Italy) ITA"
52  */
54  typedef std::vector< std::pair<std::string,std::string> > Cont;
55  Cont prefix_;
56  Cont infix_ ;
57  Cont suffix_;
58 
59 public:
60  /**
61  * normalize() - correct a name
62  * @name: the name to be corrected
63  *
64  * Return: count of corrections applied
65  */
66  size_t normalize(std::string* name) const {
67  size_t corrections = 0;
68  Cont::const_iterator it;
69 
70  for (it = prefix_.begin(); it != prefix_.end(); it++) {
71  const std::string& s = it->first;
72  if (name->compare(0, s.length(), s) == 0) {
73  corrections++;
74  name->replace(0, s.length(), it->second);
75  break;
76  }
77  }
78 
79  for (it = infix_.begin(); it != infix_.end(); it++) {
80  const std::string& s = it->first;
81  size_t pos = name->find(s);
82  while (pos != std::string::npos) {
83  corrections++;
84  name->replace(pos, s.length(), it->second);
85  pos = name->find(s, pos + it->second.length());
86  }
87  }
88 
89  for (it = suffix_.begin(); it != suffix_.end(); it++) {
90  const std::string& s = it->first;
91  if (name->length() < s.length()) continue;
92  size_t pos = name->length() - s.length();
93  if (name->compare(pos, s.length(), s) == 0) {
94  corrections++;
95  name->replace(pos, s.length(), it->second);
96  break;
97  }
98  }
99 
100  return corrections;
101  }
102 
103  /**
104  * add*fix() - add a general correction
105  *
106  * Adds a general prefix, infix or suffix correction.
107  * Syntax for @e s is:
108  * %Suffix "wrong suffix" "correct suffix"
109  * Return: OK if successful
110  */
111  errorT addPrefix(const char* s) { return add(prefix_, s); }
112  errorT addInfix (const char* s) { return add(infix_, s); }
113  errorT addSuffix(const char* s) { return add(suffix_,s); }
114 
115 private:
116  errorT add(Cont& v, const char* s) {
117  ASSERT(s != 0);
118  std::vector<size_t> parse;
119  for (size_t i=0; *(s+i) != 0; i++) {
120  if (*(s+i) == '"') parse.push_back(i);
121  }
122  if (parse.size() != 4) return ERROR_CorruptData;
123  parse[0] += 1; //skip "
124  parse[1] -= parse[0]; //n_chars
125  if (parse[1] == 0) return ERROR_CorruptData;
126  parse[2] += 1; //skip "
127  parse[3] -= parse[2]; //n_chars
128  v.push_back(std::make_pair(
129  std::string(s + parse[0], parse[1]),
130  std::string(s + parse[2], parse[3])
131  ));
132  return OK;
133  }
134 };
135 
136 
137 /**
138  * class PlayerElo - elo ratings of a player
139  *
140  * Spelling files can provide elo ratings of a player in the form:
141  * %Elo YEAR:ELO_1PERIOD,ELO_2PERIOD,ELO_3PERIOD,... YEAR:ELO_1PERIOD,...
142  */
143 class PlayerElo {
144  std::vector< std::pair<uint16_t, eloT> > elo_;
145 
146 public:
147  void AddEloData(const char* str);
148 
149  eloT getElo (dateT date) const {
150  uint year = date_GetYear (date);
151  auto itBegin = std::find_if(elo_.begin(), elo_.end(),
152  [&](const std::pair<uint16_t, eloT>& e) {
153  return e.first == year;
154  });
155  auto itEnd = std::find_if(itBegin, elo_.end(),
156  [&](const std::pair<uint16_t, eloT>& e) {
157  return e.first != year;
158  });
159 
160  size_t n = std::distance(itBegin, itEnd);
161  if (n == 0) return 0; // No data for that year
162 
163  uint month = date_GetMonth (date);
164  if (month == 0 || month > 12) month = 0;
165  else month -= 1;
166 
167  size_t idx;
168  if (year == 2009 && n == 5) {
169  //2 trimonthly + 3 bimonthly
170  idx = (month < 6) ? month / 3 : (month - 2)/2;
171 
172  } else if (year == 2012 && n == 9) {
173  //3 bimonthly + 6 monthly
174  idx = (month < 6) ? month / 2 : month - 3;
175 
176  } else if (year > 2012) {
177  // monthly
178  if (month >= n) return 0;
179  idx = month;
180 
181  } else {
182  idx = month * n / 12;
183  }
184 
185  return (itBegin + idx)->second;
186  }
187 
188 #ifdef SPELLCHKVALIDATE
189  std::string isValid() const {
190  for (size_t i=1, n=elo_.size(); i < n; i++) {
191  if (elo_[i].first < elo_[i -1].first) return "unsorted";
192  }
193 
194  auto count = [this](uint year) {
195  return std::count_if(this->elo_.begin(), this->elo_.end(),
196  [&](const std::pair<uint16_t, eloT>& e) { return e.first == year; });
197  };
198 
199  auto expected = [](uint year) {
200  if (year < 1990) return 1;
201  if (year < 2001) return 2;
202  if (year < 2009) return 4;
203  if (year < 2010) return 5;
204  if (year < 2012) return 6;
205  if (year < 2013) return 9;
206  return 12;
207  };
208 
209  for (uint y=1970; y<2015; y++) {
210  auto n = count(y);
211  if (n == 0) continue;
212  if (n != expected(y))
213  return std::to_string(y) + ": " + std::to_string(n) + "(" +
214  std::to_string(expected(y)) + ")";
215  }
216 
217  return std::string();
218  }
219 #endif
220 };
221 
222 
223 /**
224  * class PlayerInfo - player informations
225  *
226  * Spelling files can provide player informations like titles/gender,
227  * countries, highest elo, date of birth, date of death. For example:
228  * Polgar, Judit #gm+w HUN [2735] 1976
229  *
230  * Generic information can be provided in the form:
231  * %Bio This is a generic information
232  */
233 class PlayerInfo {
234  const char* comment_;
235  std::vector<const char*> bio_;
236 
237  friend class SpellChkLoader;
238  friend class SpellChecker;
239 
240 public:
241  PlayerInfo(const char* s) : comment_(s) {}
242  const char* getTitle() const;
243  const char* getLastCountry() const;
244  dateT getBirthdate() const;
245  dateT getDeathdate() const;
246  eloT getPeakRating() const;
247  const char* GetComment() const {
248  return (comment_ != 0) ? comment_ : "";
249  }
250 };
251 
252 
253 /**
254  * class SpellChecker - name spelling
255  *
256  * Reads a spelling file and allows retrieval of corrected names and player data.
257  * If SPELLCHKVALIDATE is defined, the spelling file is also checked for errors.
258  */
/**
 * struct Idx - associates an alias with the index of a name
 *
 * The objects are ordered by alias, and can also be compared directly
 * against a std::string (used for searching a sorted container of Idx
 * without building a temporary Idx).
 */
struct Idx {
    std::string alias;
    int32_t idx;

    // Always initialize idx: reading an indeterminate value would be
    // undefined behavior.
    Idx() : idx(0) {}
    Idx(const std::string& a, int32_t i) : alias(a), idx(i) {}

    bool operator<(const Idx& b) const { return alias < b.alias; }
    bool operator<(const std::string& b) const { return alias < b; }
};
269  typedef std::vector<Idx>::const_iterator IdxIt;
270 
271  NameNormalizer general_[NUM_NAME_TYPES];
272  std::string excludeChars_[NUM_NAME_TYPES];
273  std::vector<Idx> idx_[NUM_NAME_TYPES];
274  std::vector<const char*> names_[NUM_NAME_TYPES];
275  std::vector<PlayerInfo> pInfo_;
276  std::vector<PlayerElo> pElo_;
277  char* staticStrings_;
278 
279  friend class SpellChkLoader;
280 
281 public:
283  free(staticStrings_);
284  }
285 
286  /**
287  * Create() - Create a new SpellChecker object
288  *
289  * Create a new SpellChecker reading from @e filename.
290  * It's the caller's responsibility to free the object with "delete".
291  * Return:
292  * - OK and a pointer to the new object
293  * - on error the ERROR_*CODE* and NULL
294  */
295  static std::pair<errorT, SpellChecker*> Create(const char* filename,
296  const Progress& progress) {
297  SpellChecker* res = new SpellChecker;
298  errorT err = res->read(filename, progress);
299  if (err != OK) {
300  delete res;
301  res = NULL;
302  }
303  return std::make_pair(err, res);
304  }
305 
306  /**
307  * find() - search for correct names
308  * @nt: the type of the name to be corrected
309  * @name: the name to be corrected
310  * @nMaxRes: max size of the returned vector
311  *
312  * Return: a vector of correct names.
313  * @name will be normalized removing excludeChars_[@nt].
314  * If an exact match for normalized @name is found the result vector will
315  * contain only the corresponding correct name, otherwise will contain all
316  * the correct names that have @name as a prefix.
317  */
318  std::vector<const char*> find(const nameT& nt, const char* name, uint nMaxRes = 10) const {
319  ASSERT(nt < NUM_NAME_TYPES);
320  ASSERT(name != 0);
321  std::vector<const char*> res;
322  std::pair<IdxIt, IdxIt> it;
323  if (nt != NAME_PLAYER) it = idxFind(nt, name);
324  else it = idxFindPlayer(name);
325  for (; it.first != it.second && res.size() < nMaxRes; it.first++) {
326  const char* corrected = names_[nt][it.first->idx];
327  if (std::find(res.begin(), res.end(), corrected) == res.end()) {
328  res.push_back(corrected);
329  }
330  }
331  return res;
332  }
333 
334  const NameNormalizer& getGeneralCorrections(const nameT& nt) const {
335  ASSERT(nt < NUM_NAME_TYPES);
336  return general_[nt];
337  }
338 
339  /**
340  * SpellChecker::getPlayerInfo() - get extra info about a player
341  *
342  * Get extra data like titles/gender, countries, highest elo,
343  * date of birth, date of death or biographic informations.
344  * Return:
345  * - on success a pointer to a valid PlayerInfo object containing
346  * the available data. If @bio != 0 the vector is filled with
347  * the available biographic informations.
348  * - if @name is not found or is ambiguous (match multiple players)
349  * returns NULL and @bio is untouched.
350  */
351  const PlayerInfo* getPlayerInfo(const char* name,
352  std::vector<const char*>* bio = 0) const {
353  ASSERT(name != 0);
354  IdxIt it = idxFindPlayerUnambiguous(name);
355  if (it == idx_[NAME_PLAYER].end()) return 0; // not found
356 
357  if (bio != 0) *bio = pInfo_[it->idx].bio_;
358  return &(pInfo_[it->idx]);
359  }
360 
361  const PlayerElo* getPlayerElo(const char* name) const {
362  ASSERT(name != 0);
363  if (!hasEloData()) return 0;
364  IdxIt it = idxFindPlayerUnambiguous(name);
365  if (it == idx_[NAME_PLAYER].end()) return 0; // not found
366  return &(pElo_[it->idx]);
367  }
368 
369  bool hasEloData () const {
370  return pElo_.size() != 0;
371  }
372 
373  size_t numCorrectNames(const nameT& nt) const {
374  ASSERT(nt < NUM_NAME_TYPES);
375  return names_[nt].size();
376  }
377 
378 private:
379  SpellChecker() : staticStrings_(NULL) {}
380  SpellChecker(const SpellChecker&);
381  SpellChecker& operator=(const SpellChecker&);
382 
383  errorT read(const char* filename, const Progress& progress);
384 
385  std::string normalizeAndTransform(const nameT& nt, const char* s) const {
386  std::string res;
387  for (const char* i = s; *i != 0; i++) {
388  if (excludeChars_[nt].find(*i) != std::string::npos) continue;
389 
390  res += *i;
391  }
392  return res;
393  }
394 
395  std::pair<IdxIt, IdxIt> idxFind(const nameT& nt, const char* prefix) const {
396  std::pair<IdxIt, IdxIt> res;
397  std::string s = normalizeAndTransform(nt, prefix);
398  res.first = std::lower_bound(idx_[nt].begin(), idx_[nt].end(), s);
399  for (res.second = res.first; res.second != idx_[nt].end(); res.second++) {
400  if (res.second->alias.compare(0, s.length(), s) != 0) break;
401  if (res.second->alias == s) return std::make_pair(res.second, res.second +1);
402  }
403  return res;
404  }
405 
406  std::pair<IdxIt, IdxIt> idxFindPlayer(const char* prefix) const {
407  std::pair<IdxIt, IdxIt> res = idxFind(NAME_PLAYER, prefix);
408  if (res.first == res.second) {
409  // For spelling of player names (not other types), Scid will also try
410  // to move the text after the last space in the name to the start of
411  // the name for correction purposes, when it cannot find a correction.
412  // This is done to correct names where the surname is last.
413  std::string s = prefix;
414  size_t pos = s.rfind(' ');
415  if (pos != std::string::npos) {
416  std::string inv = s.substr(pos);
417  inv.append(s, 0, pos);
418  return idxFind(NAME_PLAYER, inv.c_str());
419  }
420  }
421  return res;
422  }
423 
424  IdxIt idxFindPlayerUnambiguous(const char* name) const {
425  std::pair<IdxIt, IdxIt> it = idxFindPlayer(name);
426  if (it.first == it.second) return idx_[NAME_PLAYER].end();
427 
428  for (IdxIt i = it.first; i != it.second; i++) {
429  if (i->idx != it.first->idx) //ambiguous
430  return idx_[NAME_PLAYER].end();
431  }
432  return it.first;
433  }
434 
435 
436 #ifndef SPELLCHKVALIDATE
437  class SpellChkValidate {
438  public:
439  SpellChkValidate(const char*, const SpellChecker&) {}
440  void ignoredLine(const char*) {}
441  void idxDuplicates(const nameT&) {}
442  void checkEloData() {}
443  };
444 #else
445  class SpellChkValidate {
446  const SpellChecker& spell_;
447  std::ofstream f_;
448 
449  public:
450  SpellChkValidate(const char* spellfile, const SpellChecker& sp) : spell_(sp) {
451  f_.open(spellfile + std::string(".validate"));
452  }
453  void ignoredLine(const char* line) {
454  f_ << "Ignored line:" << std::endl;
455  f_ << line << std::endl;
456  f_ << std::endl;
457  }
458  static bool cmpIdxAlias(const Idx& a, const Idx& b) {
459  return a.alias == b.alias;
460  }
461  void idxDuplicates(const nameT& nt) {
462  IdxIt it = spell_.idx_[nt].begin();
463  IdxIt it_end = spell_.idx_[nt].end();
464  for (;;) {
465  it = std::adjacent_find(it, it_end, cmpIdxAlias);
466  if (it == it_end) return;
467 
468  IdxIt it_endDuplicates = std::upper_bound(it, it_end, *it);
469  f_ << "Duplicate hash: " << it->alias << std::endl;
470  for(; it != it_endDuplicates; it++) {
471  f_ << spell_.names_[nt][it->idx];
472  f_ << " - Idx:" << it->idx << std::endl;
473  }
474  f_ << std::endl;
475  }
476  }
477  void checkEloData() {
478  for (size_t i=0, n = spell_.pElo_.size(); i < n; i++) {
479  std::string s = spell_.pElo_[i].isValid();
480  if (! s.empty()) {
481  f_ << "Elo error: " << s << " --- ";
482  f_ << spell_.names_[NAME_PLAYER][i] << std::endl;
483  }
484  }
485  }
486  };
487 #endif
488 
489 };
490 
491 
492 #endif
uint date_GetYear(dateT date)
Definition: date.h:52
const errorT OK
Definition: error.h:23
size_t normalize(std::string *name) const
Definition: spellchk.h:66
uint dateT
Definition: common.h:147
#define ASSERT(f)
Definition: common.h:59
errorT addPrefix(const char *s)
add*fix() - add a general correction
Definition: spellchk.h:111
class PlayerElo - elo ratings of a player
Definition: spellchk.h:143
Definition: misc.h:63
names
Definition: tablebase.tcl:257
class SpellChecker - name spelling
Definition: spellchk.h:259
int find(const char *filename)
find() - search for a database.
Definition: dbasepool.cpp:51
errorT addInfix(const char *s)
Definition: spellchk.h:112
eloT getElo(dateT date) const
Definition: spellchk.h:149
uint32_t uint
Definition: common.h:91
size_t numCorrectNames(const nameT &nt) const
Definition: spellchk.h:373
ushort eloT
Definition: common.h:164
const NameNormalizer & getGeneralCorrections(const nameT &nt) const
Definition: spellchk.h:334
const errorT ERROR_CorruptData
Definition: error.h:46
unsigned short errorT
Definition: error.h:20
const PlayerElo * getPlayerElo(const char *name) const
Definition: spellchk.h:361
std::vector< const char * > find(const nameT &nt, const char *name, uint nMaxRes=10) const
Definition: spellchk.h:318
bool hasEloData() const
Definition: spellchk.h:369
class NameNormalizer - apply general corrections to a name
Definition: spellchk.h:53
const PlayerInfo * getPlayerInfo(const char *name, std::vector< const char *> *bio=0) const
Definition: spellchk.h:351
class SpellChkLoader - load data into a SpellChecker object
Definition: spellchk.cpp:133
unsigned nameT
Definition: common.h:153
const char * GetComment() const
Definition: spellchk.h:247
PlayerInfo(const char *s)
Definition: spellchk.h:241
errorT addSuffix(const char *s)
Definition: spellchk.h:113
uint date_GetMonth(dateT date)
Definition: date.h:61
static std::pair< errorT, SpellChecker * > Create(const char *filename, const Progress &progress)
Create() - Create a new SpellChecker object.
Definition: spellchk.h:295
class PlayerInfo - player informations
Definition: spellchk.h:233