CSVParser.cc
1 /*
2  @copyright Steve Keen 2018
3  @author Russell Standish
4  This file is part of Minsky.
5 
6  Minsky is free software: you can redistribute it and/or modify it
7  under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10 
11  Minsky is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with Minsky. If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #include "minsky.h"
21 #include "CSVParser.h"
22 
23 #include "CSVParser.rcd"
24 #include "dataSpecSchema.rcd"
25 #include "dimension.rcd"
26 #include "nobble.h"
27 #include "minsky_epilogue.h"
28 
29 #ifdef _WIN32
30 #include <memoryapi.h>
31 #include <windows.h>
32 #else
33 #include <sys/mman.h>
34 #include <sys/resource.h>
35 #endif
36 
37 using namespace minsky;
38 using namespace std;
39 
40 #include <boost/type_traits.hpp>
41 #include <boost/tokenizer.hpp>
42 #include <boost/token_functions.hpp>
43 #include <boost/pool/pool.hpp>
44 
45 namespace escapedListSeparator
46 {
47  // pinched from boost::escaped_list_separator, and modified to not throw
48  template <class Char,
49  class Traits = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type >
50  class EscapedListSeparator {
51 
52  private:
53  typedef std::basic_string<Char,Traits> string_type;
54  struct char_eq {
55  Char e_;
56  char_eq(Char e):e_(e) { }
57  bool operator()(Char c) {
58  return Traits::eq(e_,c);
59  }
60  };
61  string_type escape_;
62  string_type c_;
63  string_type quote_;
64  bool last_;
65 
66  bool is_escape(Char e) {
67  const char_eq f(e);
68  return std::find_if(escape_.begin(),escape_.end(),f)!=escape_.end();
69  }
70  bool is_c(Char e) {
71  const char_eq f(e);
72  return std::find_if(c_.begin(),c_.end(),f)!=c_.end();
73  }
74  bool is_quote(Char e) {
75  const char_eq f(e);
76  return std::find_if(quote_.begin(),quote_.end(),f)!=quote_.end();
77  }
78  template <typename iterator, typename Token>
79  void do_escape(iterator& next,iterator end,Token& tok) {
80  if (++next >= end)
81  // don't throw, but pass on verbatim
82  tok+=escape_.front();
83  if (Traits::eq(*next,'n')) {
84  tok+='\n';
85  return;
86  }
87  if (is_quote(*next)) {
88  tok+=*next;
89  return;
90  }
91  if (is_c(*next)) {
92  tok+=*next;
93  return;
94  }
95  if (is_escape(*next)) {
96  tok+=*next;
97  return;
98  }
99  // don't throw, but pass on verbatim
100  tok+=escape_.front()+*next;
101  }
102 
103  public:
104 
105  explicit EscapedListSeparator(Char e = '\\',
106  Char c = ',',Char q = '\"')
107  : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { }
108 
109  EscapedListSeparator(string_type e, string_type c, string_type q)
110  : escape_(e), c_(c), quote_(q), last_(false) { }
111 
112  void reset() {last_=false;}
113 
114  template <typename InputIterator, typename Token>
115  bool operator()(InputIterator& next,InputIterator end,Token& tok) {
116  bool bInQuote = false;
117  tok = Token();
118 
119  if (next >= end) {
120  next=end; // reset next in case it has advanced beyond
121  if (last_) {
122  last_ = false;
123  return true;
124  }
125  return false;
126  }
127  last_ = false;
128  while (next < end) {
129  if (is_escape(*next)) {
130  do_escape(next,end,tok);
131  }
132  else if (is_c(*next)) {
133  if (!bInQuote) {
134  // If we are not in quote, then we are done
135  ++next;
136  // The last character was a c, that means there is
137  // 1 more blank field
138  last_ = true;
139  return true;
140  }
141  tok+=*next;
142  }
143  else if (is_quote(*next)) {
144  bInQuote=!bInQuote;
145  }
146  else {
147  tok += *next;
148  }
149  ++next;
150  }
151  return true;
152  }
153  };
154 }
155 typedef escapedListSeparator::EscapedListSeparator<char> Parser;
156 
157 typedef boost::tokenizer<Parser> Tokenizer;
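
As a quick illustration of how these two typedefs are meant to be used together, here is a minimal sketch (not part of the original file; the sample line is invented):

// Split one CSV line with the non-throwing separator defined above.
std::string line="a,\"hello, world\",3";
Tokenizer tok(line, Parser());   // defaults: escape '\\', separator ',', quote '"'
std::vector<std::string> fields(tok.begin(), tok.end());
// fields now holds {"a", "hello, world", "3"}; the quoted comma survives intact.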
158 
159 struct SpaceSeparatorParser
160 {
161  char escape, quote;
162  SpaceSeparatorParser(char escape='\\', char sep=' ', char quote='"'):
163  escape(escape), quote(quote) {}
164  template <class I>
165  bool operator()(I& next, I end, std::string& tok)
166  {
167  tok.clear();
168  bool quoted=false;
169  while (next!=end)
170  {
171  if (*next==escape)
172  tok+=*(++next);
173  else if (*next==quote)
174  quoted=!quoted;
175  else if (!quoted && isspace(*next))
176  {
177  while (isspace(*next)) ++next;
178  return true;
179  }
180  else
181  tok+=*next;
182  ++next;
183  }
184  return !tok.empty();
185  }
186  void reset() {}
187 };
188 
189 namespace
190 {
191  /// An any with cached hash
192  struct Any: public any
193  {
194  Any()=default;
195  Any(const any& x): any(x), hash(x.hash()) {}
196  bool operator<(const Any& x) const {return static_cast<const any&>(*this)<x;}
197  bool operator==(const Any& x) const {return static_cast<const any&>(*this)==static_cast<const any&>(x);}
198  size_t hash;
199  };
200 
201  // slice label token map
202  template <class T>
203  class Tokens
204  {
205  unordered_map<string, T> tokens;
206  vector<const string*> tokenRefs;
207  string empty;
208  public:
209  T operator[](const string& x) {
210  auto i=tokens.find(x);
211  if (i==tokens.end())
212  {
213  i=tokens.emplace(x, tokenRefs.size()).first;
214  tokenRefs.push_back(&(i->first));
215  }
216  return i->second;
217  }
218  const string& operator[](T i) const {
219  if (i<tokenRefs.size()) return *tokenRefs[i];
220  return empty;
221  }
222  };
223 
224  using SliceLabelToken=uint32_t;
225  using Key=vector<SliceLabelToken, LibCAllocator<SliceLabelToken>>;
226  template <class V> using Map=map<Key,V,less<Key>,LibCAllocator<pair<const Key,V>>>;
227 
228  struct NoDataColumns: public std::exception
229  {
230  const char* what() const noexcept override {return "No data columns specified\nIf dataset has no data, try selecting counter";}
231  };
232 
233  struct DuplicateKey: public std::exception
234  {
235  std::string msg="Duplicate key";
236  Key key;
237  DuplicateKey(const Key& x, const Tokens<SliceLabelToken>& tokens): key(x) {
238  for (auto& i: x)
239  msg+=":"+tokens[i];
240  msg+="\nTry selecting a different duplicate key action";
241  }
242  const char* what() const noexcept override {return msg.c_str();}
243  };
244 
245  struct InvalidData: public std::exception
246  {
247  string data;
248  string type;
249  string colName;
250  string msg;
251  InvalidData(const string& data, const string& type,const string& colName):
252  data(data), type(type), colName(colName)
253  {msg="Invalid data: "+data+" for "+type+" dimensioned column: "+colName;}
254  const char* what() const noexcept override {return msg.c_str();}
255  };
256 
257  struct ShortLine: public std::exception
258  {
259  std::string msg="Short line";
260  ShortLine(const Key& x, const Tokens<SliceLabelToken>& tokens) {
261  for (auto& i: x)
262  msg+=":"+tokens[i];
263  }
264  const char* what() const noexcept override {return msg.c_str();}
265  };
266 
267  struct MemoryExhausted: public std::exception
268  {
269  const char* what() const noexcept override {return "exhausted memory - try reducing the rank";}
270  };
271 
272  double quotedStoD(const string& s,size_t& charsProcd)
273  {
274  //strip possible quote characters
275  if (!s.empty() && s[0]==s[s.size()-1] && !isalnum(s[0]))
276  {
277  const double r=quotedStoD(s.substr(1,s.size()-2),charsProcd);
278  charsProcd+=2;
279  return r;
280  }
281  if (s.empty()) return nan(""); // treat empty cell as a missing value
282  // first try to read the cell as a number
283  try {
284  const double r=stod(s,&charsProcd);
285  if (charsProcd==s.size())
286  return r;
287  }
288  catch (...) {}
289  // if not, then strip any leading non-numerical characters ([^0-9.,+-])
290  auto n=s.find_first_of("0123456789,.+-");
291  return stod(s.substr(n),&charsProcd);
292  }
293 
294  string stripWSAndDecimalSep(const string& s)
295  {
296  string r;
297  for (auto c: s)
298  if (!isspace(c) && c!=',' && c!='.')
299  r+=c;
300  return r;
301  }
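
A hand-traced example of how these helpers cooperate (a sketch, not part of the original file):

// quotedStoD("'42'", n)   -> strips the matching outer quotes, parses 42, n ends up 4
// quotedStoD("$1234", n)  -> stod fails on '$', so the non-numeric prefix is skipped
//                            and stod("1234") is parsed instead
// quotedStoD("", n)       -> returns NaN, i.e. an empty cell is a missing value
// isNumerical (defined further down) on "\"1 234,56\"":
//                            stripWSAndDecimalSep leaves "\"123456\"", quotedStoD then
//                            consumes every character, so the cell counts as numeric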
302 
303  // returns first position of v such that all elements in that or later
304  // positions are numerical or null
305  size_t firstNumerical(const vector<string>& v)
306  {
307  size_t r=0;
308  for (size_t i=0; i<v.size(); ++i)
309  try
310  {
311  if (!v[i].empty())
312  {
313  size_t c=0;
314  auto s=stripWSAndDecimalSep(v[i]);
315  quotedStoD(s,c);
316  if (c!=s.size())
317  r=i+1;
318  }
319  }
320  catch (...)
321  {
322  r=i+1;
323  }
324  return r;
325  }
326 
327  // returns true if all elements of v after start are empty
328  bool emptyTail(const vector<string>& v, size_t start)
329  {
330  for (size_t i=start; i<v.size(); ++i)
331  if (!v[i].empty()) return false;
332  return true;
333  }
334 }
335 
336 namespace minsky
337 {
338  bool isNumerical(const string& s)
339  {
340  size_t charsProcd=0;
341  const string stripped=stripWSAndDecimalSep(s);
342  try
343  {
344  quotedStoD(stripped, charsProcd);
345  }
346  catch (...) {return false;}
347  return charsProcd==stripped.size();
348  }
349 }
350 
351 namespace std
352 {
353  template <>
354  struct hash<Any>
355  {
356  size_t operator()(const Any& x) const {return x.hash;}
357  };
358  template <class T>
359  struct hash<vector<T>>
360  {
361  size_t operator()(const vector<T>& x) const {
362  size_t r=0;
363  for (auto& i: x) r^=std::hash<T>()(i);
364  return r;
365  }
366  };
367 
368 }
369 
370 void DataSpec::setDataArea(size_t row, size_t col)
371 {
372  m_nRowAxes=row;
373  m_nColAxes=std::min(col, maxColumn);
374  numCols=std::max(numCols, m_nColAxes);
375  if (headerRow>=row)
376  headerRow=row>0? row-1: 0;
377  if (dimensions.size()<nColAxes()) dimensions.resize(nColAxes());
378  if (dimensionNames.size()<nColAxes()) dimensionNames.resize(nColAxes());
379  // remove any dimensionCols > nColAxes
380  dimensionCols.erase(dimensionCols.lower_bound(nColAxes()), dimensionCols.end());
381  // adjust ignored columns
382  for (unsigned i=0; i<m_nColAxes; ++i)
383  dataCols.erase(i);
384  for (unsigned i=m_nColAxes; i<numCols && i<maxColumn; ++i)
385  dataCols.insert(i);
386 }
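
A small sketch of what a call to setDataArea does to an existing spec (the starting values are hypothetical, and maxColumn is assumed large enough not to clip the column index):

DataSpec spec;
spec.numCols=5;           // say the guesser found five columns
spec.headerRow=4;         // and a header row inside what is about to become the data area
spec.setDataArea(3,2);    // the user marks row 3, column 2 as the top-left data cell
// afterwards nRowAxes()==3 and nColAxes()==2, headerRow is pulled back to row 2,
// dimensionCols keeps only entries below 2, and dataCols becomes {2,3,4}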
387 
388 
389 template <class TokenizerFunction>
390 void DataSpec::givenTFguessRemainder(std::istream& initialInput, std::istream& remainingInput, const TokenizerFunction& tf, uintmax_t fileSize)
391 {
392  starts.clear();
393  nCols=0;
394  row=0;
395  firstEmpty=numeric_limits<size_t>::max();
396  m_nRowAxes=0;
397 
398  const BusyCursor busy(minsky());
399  // we don't know how many times we'll be going around the loop here, so pick a largish number for the progress bar
400  ProgressUpdater pu(minsky().progressState,"Guessing CSV format",100);
401 
402  vector<set<size_t,less<size_t>,LibCAllocator<size_t>>> uniqueVals;
403  m_uniqueValues.clear(); // cleared in case of early return
404  try
405  {
406  if (!processChunk(initialInput, tf, CSVDialog::numInitialLines, uniqueVals)) return;
407  do
408  {
409  m_uniqueValues.resize(uniqueVals.size());
410  for (size_t i=0; i<uniqueVals.size(); ++i) m_uniqueValues[i]=uniqueVals[i].size();
411  if (fileSize==-1)
412  ++minsky().progressState;
413  else
414  pu.setProgress(double(remainingInput.tellg())/fileSize);
415  }
416  while (!processChunk(remainingInput, tf, row+CSVDialog::numInitialLines, uniqueVals));
417  m_uniqueValues.resize(uniqueVals.size());
418  for (size_t i=0; i<uniqueVals.size(); ++i) m_uniqueValues[i]=uniqueVals[i].size();
419  if (fileSize==-1)
420  ++minsky().progressState;
421  else
422  pu.setProgress(double(remainingInput.tellg())/fileSize);
423  }
424  catch (std::exception&)
425  {
426  // progressState throws an exception on being cancelled by the user
427  throw std::runtime_error("CSV format guess terminated by user, best guess specification used.");
428  }
429 }
430 
431 void DataSpec::guessRemainder(std::istream& initialInput, std::istream& remainingInput, char sep, uintmax_t fileSize)
432 {
433  separator=sep;
434  if (separator==' ')
435  givenTFguessRemainder(initialInput, remainingInput, SpaceSeparatorParser(escape,separator,quote),fileSize); //assumes merged whitespace separators
436  else
437  givenTFguessRemainder(initialInput, remainingInput, Parser(escape,separator,quote),fileSize);
438 }
439 
440 template <class TokenizerFunction, class UniqueVals>
441 bool DataSpec::processChunk(std::istream& input, const TokenizerFunction& tf, size_t until, UniqueVals& uniqueVals)
442 {
443  string buf;
444  const hash<string> h;
445  for (; getline(input, buf) && row<until; ++row)
446  {
447  if (buf.empty()) continue;
448  // remove trailing carriage returns
449  if (buf.back()=='\r') buf=buf.substr(0,buf.size()-1);
450  if (!buf.empty())
451  {
452  smatch match;
453  static const regex ravelHypercube("\"RavelHypercube=(.*)\"");
454  if (regex_match(buf, match, ravelHypercube))
455  try
456  {
457  string metadata=match[1];
458  // remove leaning toothpicks
459  metadata.erase(remove(metadata.begin(),metadata.end(),'\\'),metadata.end());
460  string horizontalName;
461  getline(input, buf);
462  static const regex re("HorizontalDimension=\"(.*)\"");
463  if (regex_match(buf, match, re))
464  {
465  horizontalName=match[1];
466  ++row;
467  }
468  populateFromRavelMetadata(metadata, horizontalName, row);
469  return false;
470  }
471  catch (...)
472  {
473  continue; // in case of error, ignore the RavelHypercube line.
474  }
475  }
476  const boost::tokenizer<TokenizerFunction> tok(buf.begin(),buf.end(), tf);
477  const vector<string> line(tok.begin(), tok.end());
478  if (line.size()>uniqueVals.size())
479  uniqueVals.resize(std::min(maxColumn, line.size()));
480  for (size_t i=0; i<std::min(line.size(), uniqueVals.size()); ++i)
481  uniqueVals[i].insert(h(line[i]));
482  starts.push_back(firstNumerical(line));
483  nCols=std::max(nCols, line.size());
484  if (starts.back()==line.size())
485  m_nRowAxes=row;
486  if (starts.size()-1 < firstEmpty && starts.back()<nCols && emptyTail(line, starts.back()))
487  firstEmpty=starts.size()-1;
488  }
489  // compute average of starts, then look for first row that drops below average
490  double sum=0;
491  for (unsigned long i=0; i<starts.size(); ++i)
492  sum+=starts[i];
493  const double av=sum/(starts.size());
494  for (; starts.size()>m_nRowAxes && (starts[m_nRowAxes]>av);
495  ++m_nRowAxes);
496  // if nRowAxes exceeds numInitialLines, assume first row is a header row, and that that is all there is.
497  if (m_nRowAxes>=row-1) m_nRowAxes=1;
498  m_nColAxes=0;
499  for (size_t i=nRowAxes(); i<starts.size(); ++i)
500  m_nColAxes=std::max(m_nColAxes,starts[i]);
501  // if more than 1 data column, treat the first row as an axis row
502  if (m_nRowAxes==0 && nCols-m_nColAxes>1)
503  m_nRowAxes=1;
504 
505  if (firstEmpty==m_nRowAxes) ++m_nRowAxes; // allow for possible colAxes header line
506  headerRow=nRowAxes()>0? nRowAxes()-1: 0;
507  size_t i=0;
508  dimensionCols.clear();
509  for (; i<nColAxes() && i<maxColumn; ++i) dimensionCols.insert(i);
510  dataCols.clear();
511  for (; i<nCols && i<maxColumn; ++i) dataCols.insert(i);
512  return !input;
513 }
514 
515 
516 
517 void DataSpec::guessFromStream(std::istream& input,uintmax_t fileSize)
518 {
519  size_t numCommas=0, numSemicolons=0, numTabs=0;
520  size_t row=0;
521  string buf;
522  ostringstream streamBuf;
523  for (; getline(input, buf) && row<CSVDialog::numInitialLines; ++row, streamBuf<<buf<<endl)
524  for (auto c:buf)
525  switch (c)
526  {
527  case ',':
528  numCommas++;
529  break;
530  case ';':
531  numSemicolons++;
532  break;
533  case '\t':
534  numTabs++;
535  break;
536  }
537 
538  {
539  istringstream inputCopy(streamBuf.str());
540  if (numCommas>0.9*row && numCommas>numSemicolons && numCommas>numTabs)
541  guessRemainder(inputCopy,input,',',fileSize);
542  else if (numSemicolons>0.9*row && numSemicolons>numTabs)
543  guessRemainder(inputCopy,input,';',fileSize);
544  else if (numTabs>0.9*row)
545  guessRemainder(inputCopy,input,'\t',fileSize);
546  else
547  guessRemainder(inputCopy,input,' ',fileSize);
548  }
549 }
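
The usual entry point is sketched below; the file name and the std::filesystem call are illustrative rather than taken from this file:

std::ifstream input("trade.csv");     // hypothetical data file
DataSpec spec;
spec.guessFromStream(input, std::filesystem::file_size("trade.csv"));
// spec.separator, headerRow, dimensionCols and dataCols now hold the guessed layout,
// ready to be reviewed in the import dialog or handed to loadValueFromCSVFile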
550 
551 void DataSpec::populateFromRavelMetadata(const std::string& metadata, const string& horizontalName, size_t row)
552  {
553  vector<NamedDimension> ravelMetadata;
554  json(ravelMetadata,metadata);
555  headerRow=row+2;
556  setDataArea(headerRow, ravelMetadata.size());
557  dimensionNames.clear();
558  dimensions.clear();
559  for (auto& i: ravelMetadata)
560  if (i.name==horizontalName)
561  {
562  horizontalDimension=i.dimension;
563  horizontalDimName=i.name;
564  }
565  else
566  {
567  dimensions.push_back(i.dimension);
568  dimensionNames.push_back(i.name);
569  }
570  for (size_t i=0; i<dimensions.size(); ++i)
571  dimensionCols.insert(i);
572  }
573 
574 namespace minsky
575 {
576  // handle DOS files with '\r' '\n' line terminators
577  void chomp(string& buf)
578  {
579  if (!buf.empty() && buf.back()=='\r')
580  buf.erase(buf.size()-1);
581  }
582 
583  // gets a line, accounting for quoted newlines
584  bool getWholeLine(istream& input, string& line, const DataSpec& spec)
585  {
586  line.clear();
587  bool r=getline(input,line).good();
588  chomp(line);
589  while (r)
590  {
591  int quoteCount=0;
592  for (auto i: line)
593  if (i==spec.quote)
594  ++quoteCount;
595  if (quoteCount%2==0) break; // data line correctly terminated
596  string buf;
597  r=getline(input,buf).good(); // read next line and append
598  chomp(buf);
599  line+=buf;
600  }
601  escapeDoubledQuotes(line,spec);
602  return r || !line.empty();
603  }
604 
605  void escapeDoubledQuotes(std::string& line,const DataSpec& spec)
606  {
607  // replace doubled quotes with escape quote
608  for (size_t i=1; i<line.size(); ++i)
609  if (line[i]==spec.quote && line[i-1]==spec.quote &&
610  ((i==1 && (i==line.size()-1|| line[i+1]!=spec.quote)) || // deal with leading ""
611  (i>1 &&
612  ((line[i-2]!=spec.quote && line[i-2]!=spec.escape &&
613  (line[i-2]!=spec.separator || i==line.size()-1|| line[i+1]!=spec.quote)) // deal with ,''
614  || // deal with "" middle or end
615  (line[i-2]==spec.quote && (i==2 || line[i-3]==spec.separator || line[i-3]==spec.escape)))))) // deal with leading """
616  line[i-1]=spec.escape;
617  }
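
To make the quoting rules concrete, a hand-worked example of this helper (a sketch; the spec fields are set explicitly rather than relying on defaults). getWholeLine above applies the same transformation after stitching quoted newlines back together.

DataSpec spec;
spec.separator=','; spec.quote='"'; spec.escape='\\';
std::string line="x,\"say \"\"hi\"\"\",y";
escapeDoubledQuotes(line, spec);
// line is now: x,"say \"hi\"",y - each doubled quote has become escape+quote,
// so the tokenizer reads the middle field back as: say "hi"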
618 
619  /// handle reporting errors in loadValueFromCSVFileT when loading files
620  struct OnError
621  {
622  /// called on error - ex message to pass on, row - current row
623  template <class E> void operator()(const E& ex, size_t row) {throw ex;}
624  /// update a map of keys to first rows for duplicate key processing
625  void rowKeyInsert(const Key&, size_t) {}
626  };
627 
628  template <class P>
629  struct ParseCSV
630  {
631  Map<double> tmpData;
632  Map<int> tmpCnt;
633  Tokens<SliceLabelToken> sliceLabelTokens;
634  vector<AnyVal> anyVal;
635  vector<unordered_map<typename Key::value_type, size_t>> dimLabels;
636  vector<typename Key::value_type> horizontalLabels;
637  Hypercube hc;
638 
639  template <class E>
640  ParseCSV(istream& input, const DataSpec& spec, uintmax_t fileSize, E& onError, bool checkValues=false)
641  {
642  //istream inputs do not support progress bars
643  parse(input,spec,fileSize,onError,checkValues);
644  }
645 
646  template <class E>
647  ParseCSV(const vector<string>& filenames, const DataSpec& spec, uintmax_t, E& onError, bool checkValues=false)
648  {
649  const ProgressUpdater pu(minsky().progressState, "Reading files",filenames.size());
650  for (auto& f: filenames)
651  {
652  ifstream input(f);
653  try
654  {
655  parse(input,spec,std::filesystem::file_size(f),onError);
656  }
657  catch (const std::exception& ex)
658  {
659  // prepend filename here
660  throw std::runtime_error(f+": "+ex.what());
661  }
662  ++minsky().progressState;
663  }
664  }
665 
666  template <class E>
667  void parse(istream& input, const DataSpec& spec, uintmax_t fileSize, E& onError, bool checkValues=false)
668  {
669  const BusyCursor busy(minsky());
670  const ProgressUpdater pu(minsky().progressState, "Parsing CSV",2);
671 
672  dimLabels.resize(spec.dimensionCols.size());
673 
674  size_t row=0, col=0;
675  P csvParser(spec.escape,spec.separator,spec.quote);
676  string buf;
677  bool tabularFormat=spec.dataCols.size()>1 || (spec.dataCols.empty() && spec.numCols>spec.nColAxes()+1);
678  uintmax_t bytesRead=0;
679 
680  try
681  {
682  if (hc.xvectors.empty()) // only set this first time around
683  {
684  for (auto i: spec.dimensionCols)
685  {
686  hc.xvectors.push_back(i<spec.dimensionNames.size()? spec.dimensionNames[i]: "dim"+str(i));
687  hc.xvectors.back().dimension=spec.dimensions[i];
688  anyVal.emplace_back(spec.dimensions[i]);
689  }
690  ++minsky().progressState;
691  if (tabularFormat)
692  {
693  anyVal.emplace_back(spec.horizontalDimension);
694  // legacy situation where all data columns are to the right
695  if (spec.dataCols.empty())
696  for (size_t i=spec.nColAxes(); i<spec.dimensionNames.size(); ++i)
697  {
698  col=i;
699  horizontalLabels.emplace_back(sliceLabelTokens[str(anyVal.back()(spec.dimensionNames[i]),spec.horizontalDimension.units)]);
700  }
701  else
702  {
703  // explicitly specified data columns
704  for (auto i: spec.dataCols)
705  {
706  col=i;
707  horizontalLabels.emplace_back(sliceLabelTokens[str(anyVal.back()(spec.dimensionNames[i]),spec.horizontalDimension.units)]);
708  }
709 
710  if (spec.headerRow<spec.nRowAxes())
711  {
712  // check whether any further columns exist that are not in
713  // spec.dimensionNames, and add these in as horizontal
714  // data dimension slices
715  for (; row<=spec.headerRow; ++row)
716  getWholeLine(input,buf,spec);
717  const boost::tokenizer<P> tok(buf.begin(), buf.end(), csvParser);
718  auto field=tok.begin();
719  for (size_t i=0; i<spec.dimensionNames.size() && field!=tok.end(); ++i, ++field);
720  for (; field!=tok.end(); ++field)
721  if (field->empty())
722  horizontalLabels.emplace_back(sliceLabelTokens[""]);
723  else
724  horizontalLabels.emplace_back
725  (sliceLabelTokens[str(anyVal.back()(*field),spec.horizontalDimension.units)]);
726  }
727  }
728 
729  hc.xvectors.emplace_back(spec.horizontalDimName);
730  hc.xvectors.back().dimension=spec.horizontalDimension;
731  set<typename Key::value_type> uniqueLabels;
732  dimLabels.emplace_back();
733  for (auto& i: horizontalLabels)
734  if (!sliceLabelTokens[i].empty() && uniqueLabels.insert(i).second)
735  {
736  dimLabels.back()[i]=hc.xvectors.back().size();
737  hc.xvectors.back().emplace_back(sliceLabelTokens[i]);
738  }
739  }
740  }
741 
742 
743  for (; row<spec.nRowAxes(); ++row)
744  getWholeLine(input,buf,spec);
745 
746 
747  ++minsky().progressState;
748 
749  {
750  auto blankToken=sliceLabelTokens[""];
751  ProgressUpdater pu(minsky().progressState, "Reading data",1);
752  for (; getWholeLine(input, buf, spec); ++row)
753  {
754  const boost::tokenizer<P> tok(buf.begin(), buf.end(), csvParser);
755 
756  Key key;
757  size_t dim=0, dataCols=0;
758  col=0;
759  for (auto field=tok.begin(); field!=tok.end(); ++col, ++field)
760  if (spec.dimensionCols.contains(col))
761  {
762  // detect blank data lines (favourite Excel artifact)
763  if (spec.dimensions[dim].type!=Dimension::string && field->empty())
764  goto invalidKeyGotoNextLine;
765 
766  if (dim>=hc.xvectors.size())
767  hc.xvectors.emplace_back("?"); // no header present
768  try
769  {
770  auto trimmedField=trimWS(*field);
771  if (trimmedField.empty() && spec.dimensions[col].type!=Dimension::string)
772  onError(InvalidData("<empty>",to_string(spec.dimensions[col].type),spec.dimensionNames[col]),row);
773  auto keyElem=anyVal[dim](trimmedField);
774  auto skeyElem=str(keyElem, spec.dimensions[dim].units);
775  if (dimLabels[dim].emplace(sliceLabelTokens[skeyElem], dimLabels[dim].size()).second)
776  hc.xvectors[dim].emplace_back(keyElem);
777  key.emplace_back(sliceLabelTokens[skeyElem]);
778  }
779  catch (...)
780  {
781  if (spec.dontFail)
782  goto invalidKeyGotoNextLine;
783  onError(InvalidData(*field,to_string(spec.dimensions[col].type),spec.dimensionNames[col]),row);
784  }
785  dim++;
786  }
787 
788  if (key.size()<hc.rank()-tabularFormat)
789  {
790  if (spec.dontFail)
791  goto invalidKeyGotoNextLine;
792  onError(ShortLine(key,sliceLabelTokens),row);
793  }
794 
795  col=0;
796  for (auto field=tok.begin(); field!=tok.end(); ++col,++field)
797  if ((spec.dataCols.empty() && col>=spec.nColAxes()) || spec.dataCols.contains(col) || col>=spec.maxColumn)
798  {
799  if (tabularFormat)
800  {
801  if (horizontalLabels[dataCols]==blankToken)
802  continue; // ignore blank labelled columns
803  key.emplace_back(horizontalLabels[dataCols]);
804  }
805  else if (dataCols)
806  break; // only 1 value column, everything to right ignored
807 
808  // remove thousands separators, and set decimal separator to '.' ("C" locale)
809  string s;
810  for (auto c: *field)
811  if (c==spec.decSeparator)
812  s+='.';
813  else if (!checkValues &&
814  ((s.empty() && (!isdigit(c)&&c!='-'&&c!='+')) ||
815  ((s=="-"||s=="+") && !isdigit(c))))
816  continue; // skip non-numeric prefix
817  else if (!isspace(c) && c!='.' && c!=',')
818  s+=c;
819 
820  // TODO - this disallows special floating point values - is this right?
821  bool valueExists=!s.empty() && s!="\\N" && (isdigit(s[0])||s[0]=='-'||s[0]=='+'||s[0]=='.');
822  if (checkValues && !valueExists && !s.empty() && s!="\\N") // ignore empty cells or explicit nulls
823  onError(InvalidData(s,"value",spec.dimensionNames[col]),row);
824 
825  if (valueExists || !isnan(spec.missingValue))
826  {
827  if (spec.counter)
828  tmpData[key]+=1;
829  else
830  {
831  auto i=tmpData.find(key);
832  double v=spec.missingValue;
833  if (valueExists)
834  try
835  {
836  size_t end;
837  v=stod(s,&end);
838  if (checkValues && end<s.length())
839  onError(InvalidData(s,"value",spec.dimensionNames[col]),row);
840  if (i==tmpData.end())
841  {
842  tmpData.emplace(key,v);
843  onError.rowKeyInsert(key,row);
844  }
845  }
846  catch (const std::bad_alloc&)
847  {throw;}
848  catch (...) // value misunderstood
849  {
850  if (checkValues) onError(InvalidData(s,"value",spec.dimensionNames[col]),row);
851  if (isnan(spec.missingValue)) // if spec.missingValue is NaN, then don't populate the tmpData map
852  valueExists=false;
853  }
854  if (valueExists && i!=tmpData.end())
855  switch (spec.duplicateKeyAction)
856  {
857  case DataSpec::throwException:
858  if (!spec.dontFail)
859  onError(DuplicateKey(key,sliceLabelTokens),row);
860  case DataSpec::sum:
861  i->second+=v;
862  break;
863  case DataSpec::product:
864  i->second*=v;
865  break;
866  case DataSpec::min:
867  if (v<i->second)
868  i->second=v;
869  break;
870  case DataSpec::max:
871  if (v>i->second)
872  i->second=v;
873  break;
874  case DataSpec::av:
875  {
876  int& c=tmpCnt[key]; // c initialised to 0
877  i->second=((c+1)*i->second + v)/(c+2);
878  c++;
879  }
880  break;
881  }
882  }
883  }
884  dataCols++;
885  if (tabularFormat)
886  key.pop_back();
887  else
888  break; // only one column of data needs to be read
889  }
890 
891  if (!dataCols)
892  {
893  if (spec.counter || spec.dontFail)
894  tmpData[key]+=1;
895  else
896  onError(ShortLine(key,sliceLabelTokens),row);
897  }
898 
899 
900  bytesRead+=buf.size();
901  pu.setProgress(double(bytesRead)/fileSize);
902  invalidKeyGotoNextLine:;
903  }
904  }
905  }
906  catch (const std::bad_alloc&)
907  { // replace with a more user friendly error message
908  throw MemoryExhausted();
909  }
910  catch (const std::length_error&)
911  { // replace with a more user friendly error message
912  throw MemoryExhausted();
913  }
914  catch (const std::exception& ex)
915  {
916  auto msg=string(ex.what())+" at line:"+to_string(row)+", col:"+to_string(col);
917  if (col<spec.dimensionNames.size())
918  msg+=" ("+spec.dimensionNames[col]+")";
919  throw std::runtime_error(msg);
920  }
921  }
922  };
923 
924  template <class P, class E, class S>
925  void loadValueFromCSVFileT(VariableValue& vv, S& stream, const DataSpec& spec, E& onError)
926  {
927  const BusyCursor busy(minsky());
928  const ProgressUpdater pu(minsky().progressState, "Importing CSVs",4);
929 
930  {
931  // check dimension names are all distinct
932  set<string> dimNames{spec.horizontalDimName};
933  for (auto i: spec.dimensionCols)
934  if (!dimNames.insert(spec.dimensionNames[i]).second)
935  throw runtime_error("Duplicate dimension: "+spec.dimensionNames[i]);
936  }
937 
938  try
939  {
940  ParseCSV<P> parseCSV(stream,spec,0/*not used*/,onError);
941 
942  auto& tmpData=parseCSV.tmpData;
943  auto& dimLabels=parseCSV.dimLabels;
944  auto& hc=parseCSV.hc;
945 
946  // remove zero length dimensions
947  auto d=dimLabels.begin();
948  assert(hc.xvectors.size()==dimLabels.size());
949  for (auto i=hc.xvectors.begin(); i!=hc.xvectors.end();)
950  if (i->size()<2)
951  {
952  hc.xvectors.erase(i);
953  }
954  else
955  {
956  ++i;
957  ++d;
958  }
959  assert(hc.xvectors.size()<=dimLabels.size());
960 
961  for (auto& xv: hc.xvectors)
962  xv.imposeDimension();
963  ++minsky().progressState;
964 
965  if (hc.logNumElements()>log(numeric_limits<size_t>::max()))
966  throw runtime_error("Hypercube dimensionality exceeds maximum size, results are likely to be garbage.\n"
967  "Suggest rolling up one or more axes by ignoring them, and setting 'Duplicate Key Action' as appropriate");
968 
969  if (log(tmpData.size())-hc.logNumElements()>=log(0.5))
970  { // dense case
971  vv.index({});
972  if (cminsky().checkMemAllocation(2*hc.numElements()*sizeof(double))==Minsky::abort)
973  throw MemoryExhausted();
974  vv.hypercube(hc);
975  // stash the data into vv tensorInit field
976  vv.tensorInit.index({});
977  vv.tensorInit.hypercube(hc);
978  for (auto& i: vv.tensorInit)
979  i=spec.missingValue;
980  auto dims=vv.hypercube().dims();
981  const ProgressUpdater pu(minsky().progressState,"Loading data",tmpData.size());
982  for (auto& i: tmpData)
983  {
984  size_t idx=0;
985  assert (hc.rank()<=i.first.size());
986  assert(dimLabels.size()>=hc.rank());
987  int j=hc.rank()-1, k=i.first.size()-1;
988  while (j>=0 && k>=0)
989  {
990  while (dimLabels[k].size()<2) --k; // skip over elided dimensions
991  auto dimLabel=dimLabels[k].find(i.first[k]);
992  assert(dimLabel!=dimLabels[k].end());
993  idx = (idx*dims[j]) + dimLabel->second;
994  --j; --k;
995  }
996  vv.tensorInit[idx]=i.second;
997  ++minsky().progressState;
998  }
999  }
1000  else
1001  { // sparse case
1002  if (cminsky().checkMemAllocation(6*tmpData.size()*sizeof(double))==Minsky::abort)
1003  throw MemoryExhausted();
1004  auto dims=hc.dims();
1005  const ProgressUpdater pu(minsky().progressState,"Indexing and loading",tmpData.size()+1);
1006 
1007  map<size_t,double,less<size_t>,LibCAllocator<pair<const size_t,double>>> indexValue; // intermediate stash to sort index vector
1008  {
1009  for (auto& i: tmpData)
1010  {
1011  size_t idx=0;
1012  assert (dims.size()<=i.first.size());
1013  assert(dimLabels.size()>=dims.size());
1014  int j=dims.size()-1, k=i.first.size()-1;
1015  while (j>=0 && k>=0) // changed from for loop to while loop at CodeQL's insistence
1016  {
1017  while (dimLabels[k].size()<2) --k; // skip over elided dimensions
1018  auto dimLabel=dimLabels[k].find(i.first[k]);
1019  assert(dimLabel!=dimLabels[k].end());
1020  idx = (idx*dims[j]) + dimLabel->second;
1021  --j;
1022  --k;
1023  }
1024  if (!isnan(i.second))
1025  indexValue.emplace(idx, i.second);
1026  ++minsky().progressState;
1027  }
1028 
1029  vv.tensorInit=indexValue;
1030  vv.index(indexValue);
1031  vv.tensorInit.hypercube(hc);
1032  ++minsky().progressState;
1033  }
1034  }
1035  minsky().progressState.title="Cleaning up";
1036  minsky().progressState.displayProgress();
1037  }
1038  catch (const std::bad_alloc&)
1039  { // replace with a more user friendly error message
1040  throw MemoryExhausted();
1041  }
1042  catch (const std::length_error&)
1043  { // replace with a more user friendly error message
1044  throw MemoryExhausted();
1045  }
1046  }
1047 
1048  template <class S>
1049  void loadValueFromCSVFileS(VariableValue& v, S& filenames, const DataSpec& spec)
1050  {
1051  OnError onError;
1052  if (spec.separator==' ')
1053  loadValueFromCSVFileT<SpaceSeparatorParser>(v,filenames,spec,onError);
1054  else
1055  loadValueFromCSVFileT<Parser>(v,filenames,spec,onError);
1056  }
1057 
1058  void loadValueFromCSVFile(VariableValue& v, const vector<string>& filenames, const DataSpec& spec)
1059  {loadValueFromCSVFileS(v,filenames,spec);}
1060  void loadValueFromCSVFile(VariableValue& v, istream& input, const DataSpec& spec)
1061  {loadValueFromCSVFileS(v,input,spec);}
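
Putting the pieces together, a minimal sketch of loading a variable from a CSV file (the VariableValue vv and the file name are placeholders; real callers normally go through the CSV import dialog):

DataSpec spec;
{
  std::ifstream probe("trade.csv");
  spec.guessFromStream(probe, std::filesystem::file_size("trade.csv"));
}
std::ifstream data("trade.csv");      // reopen so parsing starts from the top
minsky::loadValueFromCSVFile(vv, data, spec);
// on success vv.tensorInit holds the parsed hypercube; duplicate keys, short lines
// and memory exhaustion all surface as std::exception with a descriptive message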
1062 
1063  struct FailedToRewind: public std::exception
1064  {
1065  const char* what() const noexcept override {return "Failed to rewind input";}
1066  };
1067 
1068  template <class P>
1069  void reportFromCSVFileT(istream& input, ostream& output, const DataSpec& spec, uintmax_t fileSize )
1070  {
1071  const BusyCursor busy(minsky());
1072  const ProgressUpdater pu(minsky().progressState, "Generating report",3);
1073  // set up off-heap memory allocator, and ensure it is torn down at exit
1074 // TrackingAllocatorBase::allocatePool();
1075 // auto onExit=onStackExit([](){TrackingAllocatorBase::deallocatePool();});
1076 
1077  struct ErrorReporter //: public OnError // using duck typing, not dynamic polymorphism
1078  {
1079  Map<size_t> firstRow;
1080  map<size_t,Key> duplicates;
1081  map<size_t,string> invalidData;
1082  void operator()(const DuplicateKey& ex, size_t row) {
1083  duplicates.emplace(firstRow[ex.key],ex.key);
1084  duplicates.emplace(row,ex.key);
1085  }
1086  void operator()(const InvalidData& ex, size_t row) {invalidData.emplace(row, ex.msg);}
1087  void operator()(const ShortLine& ex, size_t row) {invalidData.emplace(row, ex.msg);}
1089  void rowKeyInsert(const Key& key, size_t row) {firstRow.emplace(key,row);}
1090  } onError;
1091 
1092  // parse file to extract error locations
1093  ParseCSV<P> parseCSV(input, spec, fileSize, onError, /*checkValues=*/true);
1094 
1095  input.clear();
1096  input.seekg(0);
1097  if (!input) throw FailedToRewind();
1098  string buf;
1099  size_t row=0;
1100  const string sep{spec.separator};
1101  multimap<Key,string> duplicateLines;
1102  vector<string> invalidDataLines;
1103 
1104  {
1105  // extract all error lines
1106  ProgressUpdater pu(minsky().progressState, "Extracting errors",3);
1107  size_t bytesRead=0;
1108  for (; getWholeLine(input, buf, spec); ++row)
1109  {
1110  if (onError.duplicates.contains(row))
1111  {
1112  string msg="Duplicate key";
1113  msg+=sep; msg+=buf;
1114  duplicateLines.emplace(onError.duplicates[row],msg);
1115  }
1116  if (onError.invalidData.contains(row))
1117  {
1118  string msg=onError.invalidData[row];
1119  msg+=sep; msg+=buf;
1120  invalidDataLines.push_back(msg);
1121  }
1122  bytesRead+=buf.size();
1123  pu.setProgress(double(bytesRead)/fileSize);
1124  }
1125  }
1126 
1127  // now output report
1128  input.clear();
1129  input.seekg(0);
1130  if (!input) throw FailedToRewind();
1131  {
1132  ProgressUpdater pu(minsky().progressState, "Writing report",3);
1133  size_t bytesRead=0;
1134  // process header
1135  for (row=0; row<spec.nRowAxes() && getWholeLine(input, buf, spec); ++row)
1136  {
1137  if (row==spec.headerRow)
1138  output<<"Error"<<sep;
1139  output<<buf<<endl;
1140  bytesRead+=buf.size();
1141  pu.setProgress(double(bytesRead)/fileSize);
1142  }
1143 
1144  // process invalid data
1145  for (auto& i: invalidDataLines)
1146  output<<i<<endl;
1147  // process duplicates
1148  for (auto& i: duplicateLines)
1149  output<<i.second<<endl;
1150  // process remaining good part of the file
1151  for (; getWholeLine(input, buf, spec); ++row)
1152  {
1153  if (!onError.duplicates.contains(row) && !onError.invalidData.contains(row))
1154  output<<sep+buf<<endl;
1155  bytesRead+=buf.size();
1156  pu.setProgress(double(bytesRead)/fileSize);
1157  }
1158  }
1159  }
1160 
1161  void reportFromCSVFile(istream& input, ostream& output, const DataSpec& spec, uintmax_t fileSize)
1162  {
1163  if (spec.separator==' ')
1164  reportFromCSVFileT<SpaceSeparatorParser>(input,output,spec,fileSize);
1165  else
1166  reportFromCSVFileT<Parser>(input,output,spec,fileSize);
1167  }
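
And a corresponding sketch of the error report (reusing spec from the sketch above, with a hypothetical output file): the report keeps the header rows, prepends an error column, and lists invalid and duplicate rows before the clean remainder.

std::ifstream input("trade.csv");
std::ofstream report("trade-errors.csv");
minsky::reportFromCSVFile(input, report, spec, std::filesystem::file_size("trade.csv"));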
1168 
1169 
1170 }
1171 