Collaboration diagram for minsky::ParseCSV:

Public Member Functions
template<class E >
	ParseCSV (istream &input, const DataSpec &spec, uintmax_t fileSize, E &onError, bool checkValues=false)

template<class E >
	ParseCSV (const vector< string > &filenames, const DataSpec &spec, uintmax_t, E &onError, bool checkValues=false)

template<class E >
void	parse (istream &input, const DataSpec &spec, uintmax_t fileSize, E &onError, bool checkValues=false)

Public Attributes
Map< double >	tmpData
	map of data by key

Map< int >	tmpCnt

Tokens< SliceLabelToken >	sliceLabelTokens

vector< AnyVal >	anyVal

vector< unordered_map< typename Key::value_type, size_t > >	dimLabels

vector< typename Key::value_type >	horizontalLabels

Hypercube	hc

Detailed Description

template<class P>
struct minsky::ParseCSV

Definition at line 629 of file CSVParser.cc.

Constructor & Destructor Documentation

◆ ParseCSV() [1/2]

template<class P>

template<class E >

minsky::ParseCSV< P >::ParseCSV	(	istream &	input,
		const DataSpec &	spec,
		uintmax_t	fileSize,
		E &	onError,
		bool	checkValues = `false`
	)

inline

Definition at line 640 of file CSVParser.cc.

     {
       //istream inputs do not support progress bars
       parse(input,spec,fileSize,onError,checkValues);
     }

◆ ParseCSV() [2/2]

template<class P>

template<class E >

minsky::ParseCSV< P >::ParseCSV	(	const vector< string > &	filenames,
		const DataSpec &	spec,
		uintmax_t	,
		E &	onError,
		bool	checkValues = `false`
	)

inline

Definition at line 647 of file CSVParser.cc.

References f, and minsky::minsky().

     {
       const ProgressUpdater pu(minsky().progressState, "Reading files",filenames.size());
       for (auto& f: filenames)
         {
           ifstream input(f);
           try
             {
               parse(input,spec,std::filesystem::file_size(f),onError);
             }
           catch (const std::exception& ex)
             {
               // prepend filename here
               throw std::runtime_error(f+": "+ex.what());
             }
           ++minsky().progressState;
         }
     }

Here is the call graph for this function:

Member Function Documentation

◆ parse()

template<class P>

template<class E >

void minsky::ParseCSV< P >::parse	(	istream &	input,
		const DataSpec &	spec,
		uintmax_t	fileSize,
		E &	onError,
		bool	checkValues = `false`
	)

inline

Definition at line 667 of file CSVParser.cc.

     {
       const BusyCursor busy(minsky());
       const ProgressUpdater pu(minsky().progressState, "Parsing CSV",2);
 
       dimLabels.resize(spec.dimensionCols.size());
       
       size_t row=0, col=0;
       P csvParser(spec.escape,spec.separator,spec.quote);
       string buf;
       const bool tabularFormat=spec.dataCols.size()>1 || (spec.dataCols.empty() && spec.numCols>spec.nColAxes()+1);
       uintmax_t bytesRead=0;
       
       try
         {
           if (hc.xvectors.empty()) // only set this first time around
             {
               for (auto i: spec.dimensionCols)
                 {
                   hc.xvectors.push_back(i<spec.dimensionNames.size()? spec.dimensionNames[i]: "dim"+str(i));
                   hc.xvectors.back().dimension=spec.dimensions[i];
                   anyVal.emplace_back(spec.dimensions[i]);
                 }
               ++minsky().progressState;
               if (tabularFormat)
                 {
                   anyVal.emplace_back(spec.horizontalDimension);
                   // legacy situation where all data columns are to the right
                   if (spec.dataCols.empty())
                     for (size_t i=spec.nColAxes(); i<spec.dimensionNames.size(); ++i)
                       {
                         col=i;
                         horizontalLabels.emplace_back(sliceLabelTokens[str(anyVal.back()(spec.dimensionNames[i]),spec.horizontalDimension.units)]);
                       }
                   else
                     {
                       // explicitly specified data columns
                       for (auto i: spec.dataCols)
                         {
                           col=i;
                           horizontalLabels.emplace_back(sliceLabelTokens[str(anyVal.back()(spec.dimensionNames[i]),spec.horizontalDimension.units)]);
                         }
                     
                       if (spec.headerRow<spec.nRowAxes())
                         {
                           // check whether any further columns exist that are not in
                           // spec.dimensionNames, and add these in as horizontal
                           // data dimension slices
                           for (; row<=spec.headerRow; ++row)
                             getWholeLine(input,buf,spec);
                           const boost::tokenizer<P> tok(buf.begin(), buf.end(), csvParser);
                           auto field=tok.begin();
                           for (size_t i=0; i<spec.dimensionNames.size() && field!=tok.end(); ++i, ++field);
                           for (; field!=tok.end(); ++field)
                             if (field->empty())
                               horizontalLabels.emplace_back(sliceLabelTokens[""]);
                             else
                               horizontalLabels.emplace_back
                                 (sliceLabelTokens[str(anyVal.back()(*field),spec.horizontalDimension.units)]);
                         }
                     }
             
                   hc.xvectors.emplace_back(spec.horizontalDimName);
                   hc.xvectors.back().dimension=spec.horizontalDimension;
                   set<typename Key::value_type> uniqueLabels;
                   dimLabels.emplace_back();
                   for (auto& i: horizontalLabels)
                     if (!sliceLabelTokens[i].empty() && uniqueLabels.insert(i).second)
                       {
                         dimLabels.back()[i]=hc.xvectors.back().size();
                         hc.xvectors.back().emplace_back(sliceLabelTokens[i]);
                       }
                 }
             }
              
 
           for (; row<spec.nRowAxes(); ++row)
             getWholeLine(input,buf,spec);
             
         
           ++minsky().progressState;
 
           {
             auto blankToken=sliceLabelTokens[""];
             ProgressUpdater pu(minsky().progressState, "Reading data",1);
             for (; getWholeLine(input, buf, spec); ++row)
               {
                 const boost::tokenizer<P> tok(buf.begin(), buf.end(), csvParser);
 
                 Key key;
                 size_t dim=0, dataCols=0;
                 col=0;
                 for (auto field=tok.begin(); field!=tok.end(); ++col, ++field)
                   if (spec.dimensionCols.contains(col))
                     {
                       // detect blank data lines (favourite Excel artifact)
                       if (spec.dimensions[dim].type!=Dimension::string && field->empty())
                         goto invalidKeyGotoNextLine;
                   
                       if (dim>=hc.xvectors.size())
                         hc.xvectors.emplace_back("?"); // no header present
                       try
                         {
                           auto trimmedField=trimWS(*field);
                           if (trimmedField.empty() && spec.dimensions[col].type!=Dimension::string)
                             onError(InvalidData("<empty>",to_string(spec.dimensions[col].type),spec.dimensionNames[col]),row);
                           auto keyElem=anyVal[dim](trimmedField);
                           auto skeyElem=str(keyElem, spec.dimensions[dim].units);
                           if (dimLabels[dim].emplace(sliceLabelTokens[skeyElem], dimLabels[dim].size()).second)
                             hc.xvectors[dim].emplace_back(keyElem);
                           key.emplace_back(sliceLabelTokens[skeyElem]);
                         }
                       catch (...)
                         {
                           if (spec.dontFail)
                             goto invalidKeyGotoNextLine;
                           onError(InvalidData(*field,to_string(spec.dimensions[col].type),spec.dimensionNames[col]),row);
                         }
                       dim++;
                     }
 
                 if (key.size()<hc.rank()-tabularFormat)
                   {
                     if (spec.dontFail)
                       goto invalidKeyGotoNextLine;
                     onError(ShortLine(key,sliceLabelTokens),row);
                   }
 
                 col=0;
                 for (auto field=tok.begin(); field!=tok.end(); ++col,++field)
                   if ((spec.dataCols.empty() && col>=spec.nColAxes()) || spec.dataCols.contains(col) || col>=spec.maxColumn) 
                     {
                       if (tabularFormat)
                         {
                           if (horizontalLabels[dataCols]==blankToken)
                             continue; // ignore blank labelled columns
                           key.emplace_back(horizontalLabels[dataCols]);
                         }
                       else if (dataCols)
                         break; // only 1 value column, everything to right ignored
                   
                       // remove thousands separators, and set decimal separator to '.' ("C" locale)
                       string s;
                       for (auto c: *field)
                         if (c==spec.decSeparator)
                           s+='.';
                         else if (!checkValues &&
                                  ((s.empty() && (!isdigit(c)&&c!='-'&&c!='+')) ||
                                   ((s=="-"||s=="+") && !isdigit(c))))
                           continue; // skip non-numeric prefix
                         else if (!isspace(c) && c!='.' && c!=',')
                           s+=c;                    
                   
                       // TODO - this disallows special floating point values - is this right?
                       bool valueExists=!s.empty() && s!="\\N" && (isdigit(s[0])||s[0]=='-'||s[0]=='+'||s[0]=='.');
                       if (checkValues && !valueExists && !s.empty() && s!="\\N") // ignore empty cells or explicit nulls
                         onError(InvalidData(s,"value",spec.dimensionNames[col]),row);
                       
                       if (valueExists || !isnan(spec.missingValue))
                         {
                           if (spec.counter)
                             tmpData[key]+=1;
                           else
                             {
                               auto i=tmpData.find(key);
                               double v=spec.missingValue;
                               if (valueExists)
                                 try
                                   {
                                     size_t end;
                                     v=stod(s,&end);
                                     if (checkValues && end<s.length())
                                       onError(InvalidData(s,"value",spec.dimensionNames[col]),row);
                                     if (i==tmpData.end())
                                       {
                                         tmpData.emplace(key,v);
                                         onError.rowKeyInsert(key,row);
                                       }
                                   }
                                 catch (const std::bad_alloc&)
                                   {throw;}
                                 catch (...) // value misunderstood
                                   {
                                     if (checkValues) onError(InvalidData(s,"value",spec.dimensionNames[col]),row);
                                     if (isnan(spec.missingValue)) // if spec.missingValue is NaN, then don't populate the tmpData map
                                       valueExists=false;
                                   }
                               if (valueExists && i!=tmpData.end())
                                 switch (spec.duplicateKeyAction)
                                   {
                                   case DataSpec::throwException:
                                     if (!spec.dontFail)
                                       onError(DuplicateKey(key,sliceLabelTokens),row); 
                                   case DataSpec::sum:
                                     i->second+=v;
                                     break;
                                   case DataSpec::product:
                                     i->second*=v;
                                     break;
                                   case DataSpec::min:
                                     if (v<i->second)
                                       i->second=v;
                                     break;
                                   case DataSpec::max:
                                     if (v>i->second)
                                       i->second=v;
                                     break;
                                   case DataSpec::av:
                                     {
                                       int& c=tmpCnt[key]; // c initialised to 0
                                       i->second=((c+1)*i->second + v)/(c+2);
                                       c++;
                                     }
                                     break;
                                   }
                             }
                         }
                       dataCols++;
                       if (tabularFormat)
                         key.pop_back();
                       else
                         break; // only one column of data needs to be read
                     }
             
                 if (!dataCols)
                   {
                     if (spec.counter || spec.dontFail)
                       tmpData[key]+=1;
                     else
                       onError(ShortLine(key,sliceLabelTokens),row);
                   }
             
 
                 bytesRead+=buf.size();
                 pu.setProgress(double(bytesRead)/fileSize);
               invalidKeyGotoNextLine:;
               }
           }
         }
       catch (const std::bad_alloc&)
         { // replace with a more user friendly error message
           throw MemoryExhausted();
         }
       catch (const std::length_error&)
         { // replace with a more user friendly error message
           throw MemoryExhausted();
         }
       catch (const std::exception& ex)
         {
           auto msg=string(ex.what())+" at line:"+to_string(row)+", col:"+to_string(col);
           if (col<spec.dimensionNames.size())
             msg+=" ("+spec.dimensionNames[col]+")";
           throw std::runtime_error(msg);
         }
     }

Here is the call graph for this function:

Member Data Documentation

◆ anyVal

template<class P>

vector<AnyVal> minsky::ParseCSV::anyVal

Definition at line 634 of file CSVParser.cc.

◆ dimLabels

template<class P>

vector<unordered_map<typename Key::value_type, size_t> > minsky::ParseCSV::dimLabels

Definition at line 635 of file CSVParser.cc.

Referenced by minsky::loadValueFromCSVFileT().

◆ hc

template<class P>

Hypercube minsky::ParseCSV::hc

Definition at line 637 of file CSVParser.cc.

Referenced by minsky::loadValueFromCSVFileT().

◆ horizontalLabels

template<class P>

vector<typename Key::value_type> minsky::ParseCSV::horizontalLabels

Definition at line 636 of file CSVParser.cc.

◆ sliceLabelTokens

template<class P>

Tokens<SliceLabelToken> minsky::ParseCSV::sliceLabelTokens

Definition at line 633 of file CSVParser.cc.

◆ tmpCnt

template<class P>

Map<int> minsky::ParseCSV::tmpCnt

Definition at line 632 of file CSVParser.cc.

◆ tmpData

template<class P>

Map<double> minsky::ParseCSV::tmpData

map of data by key

Definition at line 631 of file CSVParser.cc.

Referenced by minsky::loadValueFromCSVFileT().

The documentation for this struct was generated from the following file:

engine/CSVParser.cc

Public Member Functions

Public Attributes

Detailed Description

template<class P> struct minsky::ParseCSV< P >

Constructor & Destructor Documentation

◆ ParseCSV() [1/2]

◆ ParseCSV() [2/2]

Member Function Documentation

◆ parse()

Member Data Documentation

◆ anyVal

◆ dimLabels

◆ hc

◆ horizontalLabels

◆ sliceLabelTokens

◆ tmpCnt

◆ tmpData

template<class P>
struct minsky::ParseCSV< P >