Minsky
minsky::ParseCSV< P > Struct Template Reference
Collaboration diagram for minsky::ParseCSV< P >:
Collaboration graph

Public Member Functions

template<class E >
 ParseCSV (istream &input, const DataSpec &spec, uintmax_t fileSize, E &onError, bool checkValues=false)
 
template<class E >
 ParseCSV (const vector< string > &filenames, const DataSpec &spec, uintmax_t, E &onError, bool checkValues=false)
 
template<class E >
void parse (istream &input, const DataSpec &spec, uintmax_t fileSize, E &onError, bool checkValues=false)
 

Public Attributes

Map< double > tmpData
 map of data by key More...
 
Map< int > tmpCnt
 
Tokens< SliceLabelToken > sliceLabelTokens
 
vector< AnyVal > anyVal
 
vector< unordered_map< typename Key::value_type, size_t > > dimLabels
 
vector< typename Key::value_type > horizontalLabels
 
Hypercube hc
 

Detailed Description

template<class P>
struct minsky::ParseCSV< P >

Definition at line 629 of file CSVParser.cc.

Constructor & Destructor Documentation

◆ ParseCSV() [1/2]

template<class P>
template<class E >
minsky::ParseCSV< P >::ParseCSV ( istream &  input,
const DataSpec &  spec,
uintmax_t  fileSize,
E &  onError,
bool  checkValues = false 
)
inline

Definition at line 640 of file CSVParser.cc.

// Construct from a single input stream: every argument is forwarded
// unchanged to parse(), which does all of the work.
641  {
642  // istream inputs do not support progress bars
643  parse(input,spec,fileSize,onError,checkValues);
644  }
Definition: input.py:1
void parse(istream &input, const DataSpec &spec, uintmax_t fileSize, E &onError, bool checkValues=false)
Definition: CSVParser.cc:667

◆ ParseCSV() [2/2]

template<class P>
template<class E >
minsky::ParseCSV< P >::ParseCSV ( const vector< string > &  filenames,
const DataSpec &  spec,
uintmax_t  ,
E &  onError,
bool  checkValues = false 
)
inline

Definition at line 647 of file CSVParser.cc.

References f, and minsky::minsky().

// Construct by parsing each file named in filenames in turn; every call to
// parse() accumulates into this same object's members.
// The unnamed uintmax_t parameter is ignored: each file's size is taken from
// std::filesystem::file_size(f) instead.
// checkValues is forwarded to parse(), matching the istream constructor.
// Any std::exception raised while handling a file is rethrown as
// std::runtime_error with the file name prepended to the message.
648  {
649  const ProgressUpdater pu(minsky().progressState, "Reading files",filenames.size());
650  for (auto& f: filenames)
651  {
652  ifstream input(f);
653  try
654  {
655  parse(input,spec,std::filesystem::file_size(f),onError,checkValues);
656  }
657  catch (const std::exception& ex)
658  {
659  // prepend filename here so the caller can tell which file failed
660  throw std::runtime_error(f+": "+ex.what());
661  }
662  ++minsky().progressState; // one progress step per file
663  }
664  }
function f
Definition: canvas.m:1
Definition: input.py:1
void parse(istream &input, const DataSpec &spec, uintmax_t fileSize, E &onError, bool checkValues=false)
Definition: CSVParser.cc:667
Minsky & minsky()
global minsky object
Definition: minskyTCL.cc:51
Here is the call graph for this function:

Member Function Documentation

◆ parse()

template<class P>
template<class E >
void minsky::ParseCSV< P >::parse ( istream &  input,
const DataSpec &  spec,
uintmax_t  fileSize,
E &  onError,
bool  checkValues = false 
)
inline

Definition at line 667 of file CSVParser.cc.

References minsky::DataSpecSchema::av, minsky::DataSpecSchema::counter, minsky::DataSpecSchema::dataCols, minsky::DataSpecSchema::decSeparator, minsky::DataSpecSchema::dimensionCols, minsky::DataSpecSchema::dimensionNames, minsky::DataSpecSchema::dimensions, minsky::DataSpecSchema::dontFail, minsky::DataSpecSchema::duplicateKeyAction, minsky::DataSpecSchema::escape, minsky::getWholeLine(), minsky::DataSpecSchema::headerRow, minsky::DataSpecSchema::horizontalDimension, minsky::DataSpecSchema::horizontalDimName, minsky::anonymous_namespace{userFunction.cc}::isnan(), minsky::DataSpecSchema::max, minsky::DataSpec::maxColumn, minsky::DataSpecSchema::min, minsky::minsky(), minsky::DataSpecSchema::missingValue, minsky::DataSpec::nColAxes(), minsky::DataSpec::nRowAxes(), minsky::DataSpecSchema::numCols, minsky::DataSpecSchema::product, minsky::DataSpecSchema::quote, minsky::DataSpecSchema::separator, minsky::ProgressUpdater::setProgress(), minsky::str(), minsky::DataSpecSchema::sum, minsky::DataSpecSchema::throwException, minsky::to_string(), and minsky::trimWS().

668  {
669  const BusyCursor busy(minsky());
670  const ProgressUpdater pu(minsky().progressState, "Parsing CSV",2);
671 
672  dimLabels.resize(spec.dimensionCols.size());
673 
674  size_t row=0, col=0;
675  P csvParser(spec.escape,spec.separator,spec.quote);
676  string buf;
677  bool tabularFormat=spec.dataCols.size()>1 || (spec.dataCols.empty() && spec.numCols>spec.nColAxes()+1);
678  uintmax_t bytesRead=0;
679 
680  try
681  {
682  if (hc.xvectors.empty()) // only set this first time around
683  {
684  for (auto i: spec.dimensionCols)
685  {
686  hc.xvectors.push_back(i<spec.dimensionNames.size()? spec.dimensionNames[i]: "dim"+str(i));
687  hc.xvectors.back().dimension=spec.dimensions[i];
688  anyVal.emplace_back(spec.dimensions[i]);
689  }
690  ++minsky().progressState;
691  if (tabularFormat)
692  {
693  anyVal.emplace_back(spec.horizontalDimension);
694  // legacy situation where all data columns are to the right
695  if (spec.dataCols.empty())
696  for (size_t i=spec.nColAxes(); i<spec.dimensionNames.size(); ++i)
697  {
698  col=i;
699  horizontalLabels.emplace_back(sliceLabelTokens[str(anyVal.back()(spec.dimensionNames[i]),spec.horizontalDimension.units)]);
700  }
701  else
702  {
703  // explicitly specified data columns
704  for (auto i: spec.dataCols)
705  {
706  col=i;
707  horizontalLabels.emplace_back(sliceLabelTokens[str(anyVal.back()(spec.dimensionNames[i]),spec.horizontalDimension.units)]);
708  }
709 
710  if (spec.headerRow<spec.nRowAxes())
711  {
712  // check whether any further columns exist that are not in
713  // spec.dimensionNames, and add these in as horizontal
714  // data dimension slices
715  for (; row<=spec.headerRow; ++row)
716  getWholeLine(input,buf,spec);
717  const boost::tokenizer<P> tok(buf.begin(), buf.end(), csvParser);
718  auto field=tok.begin();
719  for (size_t i=0; i<spec.dimensionNames.size() && field!=tok.end(); ++i, ++field);
720  for (; field!=tok.end(); ++field)
721  if (field->empty())
722  horizontalLabels.emplace_back(sliceLabelTokens[""]);
723  else
724  horizontalLabels.emplace_back
725  (sliceLabelTokens[str(anyVal.back()(*field),spec.horizontalDimension.units)]);
726  }
727  }
728 
729  hc.xvectors.emplace_back(spec.horizontalDimName);
730  hc.xvectors.back().dimension=spec.horizontalDimension;
731  set<typename Key::value_type> uniqueLabels;
732  dimLabels.emplace_back();
733  for (auto& i: horizontalLabels)
734  if (!sliceLabelTokens[i].empty() && uniqueLabels.insert(i).second)
735  {
736  dimLabels.back()[i]=hc.xvectors.back().size();
737  hc.xvectors.back().emplace_back(sliceLabelTokens[i]);
738  }
739  }
740  }
741 
742 
743  for (; row<spec.nRowAxes(); ++row)
744  getWholeLine(input,buf,spec);
745 
746 
747  ++minsky().progressState;
748 
749  {
750  auto blankToken=sliceLabelTokens[""];
751  ProgressUpdater pu(minsky().progressState, "Reading data",1);
752  for (; getWholeLine(input, buf, spec); ++row)
753  {
754  const boost::tokenizer<P> tok(buf.begin(), buf.end(), csvParser);
755 
756  Key key;
757  size_t dim=0, dataCols=0;
758  col=0;
759  for (auto field=tok.begin(); field!=tok.end(); ++col, ++field)
760  if (spec.dimensionCols.contains(col))
761  {
762  // detect blank data lines (favourite Excel artifact)
763  if (spec.dimensions[dim].type!=Dimension::string && field->empty())
764  goto invalidKeyGotoNextLine;
765 
766  if (dim>=hc.xvectors.size())
767  hc.xvectors.emplace_back("?"); // no header present
768  try
769  {
770  auto trimmedField=trimWS(*field);
771  if (trimmedField.empty() && spec.dimensions[col].type!=Dimension::string)
772  onError(InvalidData("<empty>",to_string(spec.dimensions[col].type),spec.dimensionNames[col]),row);
773  auto keyElem=anyVal[dim](trimmedField);
774  auto skeyElem=str(keyElem, spec.dimensions[dim].units);
775  if (dimLabels[dim].emplace(sliceLabelTokens[skeyElem], dimLabels[dim].size()).second)
776  hc.xvectors[dim].emplace_back(keyElem);
777  key.emplace_back(sliceLabelTokens[skeyElem]);
778  }
779  catch (...)
780  {
781  if (spec.dontFail)
782  goto invalidKeyGotoNextLine;
783  onError(InvalidData(*field,to_string(spec.dimensions[col].type),spec.dimensionNames[col]),row);
784  }
785  dim++;
786  }
787 
788  if (key.size()<hc.rank()-tabularFormat)
789  {
790  if (spec.dontFail)
791  goto invalidKeyGotoNextLine;
792  onError(ShortLine(key,sliceLabelTokens),row);
793  }
794 
795  col=0;
796  for (auto field=tok.begin(); field!=tok.end(); ++col,++field)
797  if ((spec.dataCols.empty() && col>=spec.nColAxes()) || spec.dataCols.contains(col) || col>=spec.maxColumn)
798  {
799  if (tabularFormat)
800  {
801  if (horizontalLabels[dataCols]==blankToken)
802  continue; // ignore blank labelled columns
803  key.emplace_back(horizontalLabels[dataCols]);
804  }
805  else if (dataCols)
806  break; // only 1 value column, everything to right ignored
807 
808  // remove thousands separators, and set decimal separator to '.' ("C" locale)
809  string s;
810  for (auto c: *field)
811  if (c==spec.decSeparator)
812  s+='.';
813  else if (!checkValues &&
814  ((s.empty() && (!isdigit(c)&&c!='-'&&c!='+')) ||
815  ((s=="-"||s=="+") && !isdigit(c))))
816  continue; // skip non-numeric prefix
817  else if (!isspace(c) && c!='.' && c!=',')
818  s+=c;
819 
820  // TODO - this disallows special floating point values - is this right?
821  bool valueExists=!s.empty() && s!="\\N" && (isdigit(s[0])||s[0]=='-'||s[0]=='+'||s[0]=='.');
822  if (checkValues && !valueExists && !s.empty() && s!="\\N") // ignore empty cells or explicit nulls
823  onError(InvalidData(s,"value",spec.dimensionNames[col]),row);
824 
825  if (valueExists || !isnan(spec.missingValue))
826  {
827  if (spec.counter)
828  tmpData[key]+=1;
829  else
830  {
831  auto i=tmpData.find(key);
832  double v=spec.missingValue;
833  if (valueExists)
834  try
835  {
836  size_t end;
837  v=stod(s,&end);
838  if (checkValues && end<s.length())
839  onError(InvalidData(s,"value",spec.dimensionNames[col]),row);
840  if (i==tmpData.end())
841  {
842  tmpData.emplace(key,v);
843  onError.rowKeyInsert(key,row);
844  }
845  }
846  catch (const std::bad_alloc&)
847  {throw;}
848  catch (...) // value misunderstood
849  {
850  if (checkValues) onError(InvalidData(s,"value",spec.dimensionNames[col]),row);
851  if (isnan(spec.missingValue)) // if spec.missingValue is NaN, then don't populate the tmpData map
852  valueExists=false;
853  }
854  if (valueExists && i!=tmpData.end())
855  switch (spec.duplicateKeyAction)
856  {
858  if (!spec.dontFail)
859  onError(DuplicateKey(key,sliceLabelTokens),row);
860  case DataSpec::sum:
861  i->second+=v;
862  break;
863  case DataSpec::product:
864  i->second*=v;
865  break;
866  case DataSpec::min:
867  if (v<i->second)
868  i->second=v;
869  break;
870  case DataSpec::max:
871  if (v>i->second)
872  i->second=v;
873  break;
874  case DataSpec::av:
875  {
876  int& c=tmpCnt[key]; // c initialised to 0
877  i->second=((c+1)*i->second + v)/(c+2);
878  c++;
879  }
880  break;
881  }
882  }
883  }
884  dataCols++;
885  if (tabularFormat)
886  key.pop_back();
887  else
888  break; // only one column of data needs to be read
889  }
890 
891  if (!dataCols)
892  {
893  if (spec.counter || spec.dontFail)
894  tmpData[key]+=1;
895  else
896  onError(ShortLine(key,sliceLabelTokens),row);
897  }
898 
899 
900  bytesRead+=buf.size();
901  pu.setProgress(double(bytesRead)/fileSize);
902  invalidKeyGotoNextLine:;
903  }
904  }
905  }
906  catch (const std::bad_alloc&)
907  { // replace with a more user friendly error message
908  throw MemoryExhausted();
909  }
910  catch (const std::length_error&)
911  { // replace with a more user friendly error message
912  throw MemoryExhausted();
913  }
914  catch (const std::exception& ex)
915  {
916  auto msg=string(ex.what())+" at line:"+to_string(row)+", col:"+to_string(col);
917  if (col<spec.dimensionNames.size())
918  msg+=" ("+spec.dimensionNames[col]+")";
919  throw std::runtime_error(msg);
920  }
921  }
std::size_t nColAxes() const
start column of the data area
Definition: CSVParser.h:48
DuplicateKeyAction duplicateKeyAction
Definition: input.py:1
Tokens< SliceLabelToken > sliceLabelTokens
Definition: CSVParser.cc:633
std::size_t numCols
number of columns in CSV. Must be > dataColOffset
civita::Dimension horizontalDimension
Map< int > tmpCnt
Definition: CSVParser.cc:632
vector< AnyVal > anyVal
Definition: CSVParser.cc:634
std::vector< civita::Dimension > dimensions
vector< unordered_map< typename Key::value_type, size_t > > dimLabels
Definition: CSVParser.cc:635
bool counter
count data items rather than reading their values
std::string horizontalDimName
bool dontFail
do not throw an error on corrupt data
std::string trimWS(const std::string &s)
Definition: str.h:49
std::size_t headerRow
number of header rows
std::string str(T x)
utility function to create a string representation of a numeric type
Definition: str.h:33
vector< typename Key::value_type > horizontalLabels
Definition: CSVParser.cc:636
std::vector< std::string > dimensionNames
std::size_t maxColumn
maximum number of columns that can be configured independently. Columns after this limit are treated ...
Definition: CSVParser.h:43
bool getWholeLine(istream &input, string &line, const DataSpec &spec)
Definition: CSVParser.cc:584
std::size_t nRowAxes() const
start row of the data area
Definition: CSVParser.h:46
vector< SliceLabelToken, LibCAllocator< SliceLabelToken > > Key
Definition: CSVParser.cc:225
string to_string(CONST84 char *x)
Definition: minskyTCLObj.h:33
Minsky & minsky()
global minsky object
Definition: minskyTCL.cc:51
std::set< unsigned > dataCols
Map< double > tmpData
map of data by key
Definition: CSVParser.cc:631
Hypercube hc
Definition: CSVParser.cc:637
std::set< unsigned > dimensionCols
set of column indices whose fields supply dimension (axis) labels
Here is the call graph for this function:

Member Data Documentation

◆ anyVal

template<class P>
vector<AnyVal> minsky::ParseCSV< P >::anyVal

Definition at line 634 of file CSVParser.cc.

◆ dimLabels

template<class P>
vector<unordered_map<typename Key::value_type, size_t> > minsky::ParseCSV< P >::dimLabels

Definition at line 635 of file CSVParser.cc.

Referenced by minsky::loadValueFromCSVFileT().

◆ hc

template<class P>
Hypercube minsky::ParseCSV< P >::hc

Definition at line 637 of file CSVParser.cc.

Referenced by minsky::loadValueFromCSVFileT().

◆ horizontalLabels

template<class P>
vector<typename Key::value_type> minsky::ParseCSV< P >::horizontalLabels

Definition at line 636 of file CSVParser.cc.

◆ sliceLabelTokens

template<class P>
Tokens<SliceLabelToken> minsky::ParseCSV< P >::sliceLabelTokens

Definition at line 633 of file CSVParser.cc.

◆ tmpCnt

template<class P>
Map<int> minsky::ParseCSV< P >::tmpCnt

Definition at line 632 of file CSVParser.cc.

◆ tmpData

template<class P>
Map<double> minsky::ParseCSV< P >::tmpData

map of data by key

Definition at line 631 of file CSVParser.cc.

Referenced by minsky::loadValueFromCSVFileT().


The documentation for this struct was generated from the following file: