Minsky
minsky::DataSpec Class Reference

#include <CSVParser.h>

Inheritance diagram for minsky::DataSpec:
Inheritance graph
Collaboration diagram for minsky::DataSpec:
Collaboration graph

Public Member Functions

std::size_t nRowAxes () const
 start row of the data area More...
 
std::size_t nColAxes () const
 start column of the data area More...
 
DataSpecSchema toSchema ()
 
DataSpec & operator= (const DataSpecSchema &x)
 
void toggleDimension (std::size_t c)
 
void setDataArea (std::size_t row, std::size_t col)
 set top left cell of the data area More...
 
void guessFromStream (std::istream &file, uintmax_t fileSize=uintmax_t(-1))
 initial stab at dataspec from examining stream More...
 
void guessFromFile (const std::string &fileName)
 initial stab at dataspec from examining file More...
 
void populateFromRavelMetadata (const std::string &metadata, const std::string &horizontalName, std::size_t row)
 populates this spec from a "RavelHypercube" entry. row is the row being read, and is used to set the headerRow attribute. If horizontalName is one of the dimensions, data is written in a tabular format More...
 
const std::vector< size_t > & uniqueValues () const
 number of unique values in each column, corrected for the header row; may be slightly inaccurate if the header row contains one of the values More...
 
template<class TokenizerFunction >
void givenTFguessRemainder (std::istream &initialInput, std::istream &remainingInput, const TokenizerFunction &tf, uintmax_t fileSize)
 
template<class TokenizerFunction , class UniqueVals >
bool processChunk (std::istream &input, const TokenizerFunction &tf, size_t until, UniqueVals &uniqueVals)
 

Public Attributes

std::size_t maxColumn =1000
 maximum number of columns that can be configured independently. Columns after this limit are treated as "data" More...
 
- Public Attributes inherited from minsky::DataSpecSchema
std::size_t dataRowOffset
 
std::size_t dataColOffset
 
std::size_t numCols =0
 number of columns in CSV. Must be > dataColOffset More...
 
char separator =','
 
char quote ='"'
 
char escape ='\0'
 
char decSeparator ='.'
 
bool mergeDelimiters =false
 
bool counter =false
 count data items rather than reading their values More...
 
bool dontFail =false
 do not throw an error on corrupt data More...
 
double missingValue =nan("")
 
std::size_t headerRow =0
 number of header rows More...
 
std::string horizontalDimName ="?"
 
civita::Dimension horizontalDimension
 
DuplicateKeyAction duplicateKeyAction =throwException
 
std::set< unsigned > dimensionCols
 rows and columns that are comment lines to be ignored More...
 
std::set< unsigned > dataCols
 
std::vector< civita::Dimension > dimensions
 
std::vector< std::string > dimensionNames
 

Private Member Functions

 CLASSDESC_ACCESS (DataSpec)
 
template<class T >
void givenTFguessRemainder (std::istream &initialInput, std::istream &remainingInput, const T &tf, uintmax_t fileSize)
 try to fill in remainder of spec, given a tokenizer function tf, e.g. boost::escaped_list_separator<char> tf(escape,separator,quote) More...
 
void guessRemainder (std::istream &initialInput, std::istream &remainingInput, char separator, uintmax_t fileSize)
 figure out the tokenizer function and call givenTFguessRemainder More...
 
template<class T , class U >
bool processChunk (std::istream &input, const T &tf, size_t until, U &)
 process chunk of input, updating guessed spec More...
 

Private Attributes

std::size_t m_nRowAxes =0
 
std::size_t m_nColAxes =0
 
std::vector< size_t > starts
 
size_t nCols =0
 
size_t row =0
 
size_t firstEmpty =std::numeric_limits<size_t>::max()
 
std::vector< size_t > m_uniqueValues
 number of unique values in each column More...
 

Additional Inherited Members

- Public Types inherited from minsky::DataSpecSchema
enum  DuplicateKeyAction {
  throwException, sum, product, min,
  max, av
}
 what to do with duplicate keys More...
 

Detailed Description

Definition at line 37 of file CSVParser.h.

Member Function Documentation

◆ CLASSDESC_ACCESS()

minsky::DataSpec::CLASSDESC_ACCESS ( DataSpec  )
private

◆ givenTFguessRemainder() [1/2]

template<class T >
void minsky::DataSpec::givenTFguessRemainder ( std::istream &  initialInput,
std::istream &  remainingInput,
const T &  tf,
uintmax_t  fileSize 
)
private

try to fill in remainder of spec, given a tokenizer function tf, e.g. boost::escaped_list_separator<char> tf(escape,separator,quote)

◆ givenTFguessRemainder() [2/2]

template<class TokenizerFunction >
void minsky::DataSpec::givenTFguessRemainder ( std::istream &  initialInput,
std::istream &  remainingInput,
const TokenizerFunction &  tf,
uintmax_t  fileSize 
)

Definition at line 390 of file CSVParser.cc.

References minsky::minsky(), minsky::CSVDialog::numInitialLines, and minsky::ProgressUpdater::setProgress().

391 {
392  starts.clear();
393  nCols=0;
394  row=0;
395  firstEmpty=numeric_limits<size_t>::max();
396  m_nRowAxes=0;
397 
398  const BusyCursor busy(minsky());
399  // we don't know how many times we'll be going around the loop here, so pick a largish number for the progress bar
400  ProgressUpdater pu(minsky().progressState,"Guessing CSV format",100);
401 
402  vector<set<size_t,less<size_t>,LibCAllocator<size_t>>> uniqueVals;
403  m_uniqueValues.clear(); // cleared in case of early return
404  try
405  {
406  if (!processChunk(initialInput, tf, CSVDialog::numInitialLines, uniqueVals)) return;
407  do
408  {
409  m_uniqueValues.resize(uniqueVals.size());
410  for (size_t i=0; i<uniqueVals.size(); ++i) m_uniqueValues[i]=uniqueVals[i].size();
411  if (fileSize==-1)
412  ++minsky().progressState;
413  else
414  pu.setProgress(double(remainingInput.tellg())/fileSize);
415  }
416  while (!processChunk(remainingInput, tf, row+CSVDialog::numInitialLines, uniqueVals));
417  m_uniqueValues.resize(uniqueVals.size());
418  for (size_t i=0; i<uniqueVals.size(); ++i) m_uniqueValues[i]=uniqueVals[i].size();
419  if (fileSize==-1)
420  ++minsky().progressState;
421  else
422  pu.setProgress(double(remainingInput.tellg())/fileSize);
423  }
424  catch (std::exception&)
425  {
426  // progressState throws an exception on being cancelled by the user
427  throw std::runtime_error("CSV format guess terminated by user, best guess specification used.");
428  }
429 }
size_t firstEmpty
Definition: CSVParser.h:106
bool processChunk(std::istream &input, const T &tf, size_t until, U &)
process chunk of input, updating guessed spec
std::vector< size_t > starts
Definition: CSVParser.h:103
Minsky & minsky()
global minsky object
Definition: minskyTCL.cc:51
std::size_t m_nRowAxes
Definition: CSVParser.h:39
static const unsigned numInitialLines
Definition: CSVDialog.h:44
std::vector< size_t > m_uniqueValues
number of unique values in each column
Definition: CSVParser.h:109
Here is the call graph for this function:

◆ guessFromFile()

void minsky::DataSpec::guessFromFile ( const std::string &  fileName)
inline

initial stab at dataspec from examining file

Definition at line 80 of file CSVParser.h.

References guessFromStream(), and minsky::stripByteOrderingMarker().

80  {
81  std::ifstream is(fileName);
83  guessFromStream(is, std::filesystem::file_size(fileName));
84  }
void stripByteOrderingMarker(std::istream &s)
checks if the input stream has the UTF-8 byte ordering marker, and removes it if present ...
Definition: str.h:147
void guessFromStream(std::istream &file, uintmax_t fileSize=uintmax_t(-1))
initial stab at dataspec from examining stream
Definition: CSVParser.cc:517
Here is the call graph for this function:

◆ guessFromStream()

void DataSpec::guessFromStream ( std::istream &  file,
uintmax_t  fileSize = uintmax_t(-1) 
)

initial stab at dataspec from examining stream

Definition at line 517 of file CSVParser.cc.

References minsky::CSVDialog::numInitialLines.

Referenced by guessFromFile().

518 {
519  size_t numCommas=0, numSemicolons=0, numTabs=0;
520  size_t row=0;
521  string buf;
522  ostringstream streamBuf;
523  for (; getline(input, buf) && row<CSVDialog::numInitialLines; ++row, streamBuf<<buf<<endl)
524  for (auto c:buf)
525  switch (c)
526  {
527  case ',':
528  numCommas++;
529  break;
530  case ';':
531  numSemicolons++;
532  break;
533  case '\t':
534  numTabs++;
535  break;
536  }
537 
538  {
539  istringstream inputCopy(streamBuf.str());
540  if (numCommas>0.9*row && numCommas>numSemicolons && numCommas>numTabs)
541  guessRemainder(inputCopy,input,',',fileSize);
542  else if (numSemicolons>0.9*row && numSemicolons>numTabs)
543  guessRemainder(inputCopy,input,';',fileSize);
544  else if (numTabs>0.9*row)
545  guessRemainder(inputCopy,input,'\t',fileSize);
546  else
547  guessRemainder(inputCopy,input,' ',fileSize);
548  }
549 }
Definition: input.py:1
void guessRemainder(std::istream &initialInput, std::istream &remainingInput, char separator, uintmax_t fileSize)
figure out the tokenizer function and call givenTFguessRemainder
Definition: CSVParser.cc:431
static const unsigned numInitialLines
Definition: CSVDialog.h:44
Here is the caller graph for this function:

◆ guessRemainder()

void DataSpec::guessRemainder ( std::istream &  initialInput,
std::istream &  remainingInput,
char  separator,
uintmax_t  fileSize 
)
private

figure out the tokenizer function and call givenTFguessRemainder

Definition at line 431 of file CSVParser.cc.

432 {
433  separator=sep;
434  if (separator==' ')
435  givenTFguessRemainder(initialInput, remainingInput, SpaceSeparatorParser(escape,separator,quote),fileSize); //assumes merged whitespace separators
436  else
437  givenTFguessRemainder(initialInput, remainingInput, Parser(escape,separator,quote),fileSize);
438 }
escapedListSeparator::EscapedListSeparator< char > Parser
Definition: CSVParser.cc:155
void givenTFguessRemainder(std::istream &initialInput, std::istream &remainingInput, const T &tf, uintmax_t fileSize)
try to fill in remainder of spec, given a tokenizer function tf eg boost::escaped_list_separator<char...

◆ nColAxes()

std::size_t minsky::DataSpec::nColAxes ( ) const
inline

start column of the data area

Definition at line 48 of file CSVParser.h.

References m_nColAxes.

Referenced by minsky::ParseCSV< P >::parse(), and toSchema().

48 {return m_nColAxes;}
std::size_t m_nColAxes
Definition: CSVParser.h:39
Here is the caller graph for this function:

◆ nRowAxes()

std::size_t minsky::DataSpec::nRowAxes ( ) const
inline

start row of the data area

Definition at line 46 of file CSVParser.h.

References m_nRowAxes.

Referenced by minsky::ParseCSV< P >::parse(), minsky::reportFromCSVFileT(), and toSchema().

46 {return m_nRowAxes;}
std::size_t m_nRowAxes
Definition: CSVParser.h:39
Here is the caller graph for this function:

◆ operator=()

DataSpec& minsky::DataSpec::operator= ( const DataSpecSchema &  x)
inline

Definition at line 57 of file CSVParser.h.

References minsky::DataSpecSchema::dataColOffset, minsky::DataSpecSchema::dataCols, minsky::DataSpecSchema::dataRowOffset, minsky::DataSpecSchema::dimensionCols, and setDataArea().

57  {
58  DataSpecSchema::operator=(x);
60  dimensionCols=x.dimensionCols; // revert clobber by setDataArea
61  dataCols=x.dataCols; // revert clobber by setDataArea
62  return *this;
63  }
std::set< unsigned > dataCols
void setDataArea(std::size_t row, std::size_t col)
set top left cell of the data area
Definition: CSVParser.cc:370
std::set< unsigned > dimensionCols
rows and columns that are comment lines to be ignored
Here is the call graph for this function:

◆ populateFromRavelMetadata()

void DataSpec::populateFromRavelMetadata ( const std::string &  metadata,
const std::string &  horizontalName,
std::size_t  row 
)

populates this spec from a "RavelHypercube" entry. row is the row being read, and is used to set the headerRow attribute. If horizontalName is one of the dimensions, data is written in a tabular format

Definition at line 551 of file CSVParser.cc.

552  {
553  vector<NamedDimension> ravelMetadata;
554  json(ravelMetadata,metadata);
555  headerRow=row+2;
556  setDataArea(headerRow, ravelMetadata.size());
557  dimensionNames.clear();
558  dimensions.clear();
559  for (auto& i: ravelMetadata)
560  if (i.name==horizontalName)
561  {
562  horizontalDimension=i.dimension;
563  horizontalDimName=i.name;
564  }
565  else
566  {
567  dimensions.push_back(i.dimension);
568  dimensionNames.push_back(i.name);
569  }
570  for (size_t i=0; i<dimensions.size(); ++i)
571  dimensionCols.insert(i);
572  }
civita::Dimension horizontalDimension
std::vector< civita::Dimension > dimensions
std::string horizontalDimName
std::size_t headerRow
number of header rows
std::vector< std::string > dimensionNames
void setDataArea(std::size_t row, std::size_t col)
set top left cell of the data area
Definition: CSVParser.cc:370
std::set< unsigned > dimensionCols
rows and columns that are comment lines to be ignored

◆ processChunk() [1/2]

template<class T , class U >
bool minsky::DataSpec::processChunk ( std::istream &  input,
const T &  tf,
size_t  until,
U &   
)
private

process chunk of input, updating guessed spec

Returns
true if there's no more work to be done

◆ processChunk() [2/2]

template<class TokenizerFunction , class UniqueVals >
bool minsky::DataSpec::processChunk ( std::istream &  input,
const TokenizerFunction &  tf,
size_t  until,
UniqueVals &  uniqueVals 
)

Definition at line 441 of file CSVParser.cc.

References anonymous_namespace{CSVParser.cc}::emptyTail(), and anonymous_namespace{CSVParser.cc}::firstNumerical().

442 {
443  string buf;
444  const hash<string> h;
445  for (; getline(input, buf) && row<until; ++row)
446  {
447  if (buf.empty()) continue;
448  // remove trailing carriage returns
449  if (buf.back()=='\r') buf=buf.substr(0,buf.size()-1);
450  if (!buf.empty())
451  {
452  smatch match;
453  static const regex ravelHypercube("\"RavelHypercube=(.*)\"");
454  if (regex_match(buf, match, ravelHypercube))
455  try
456  {
457  string metadata=match[1];
458  // remove leaning toothpicks
459  metadata.erase(remove(metadata.begin(),metadata.end(),'\\'),metadata.end());
460  string horizontalName;
461  getline(input, buf);
462  static const regex re("HorizontalDimension=\"(.*)\"");
463  if (regex_match(buf, match, re))
464  {
465  horizontalName=match[1];
466  ++row;
467  }
468  populateFromRavelMetadata(metadata, horizontalName, row);
469  return false;
470  }
471  catch (...)
472  {
473  continue; // in case of error, ignore the RavelHypercube line.
474  }
475  }
476  const boost::tokenizer<TokenizerFunction> tok(buf.begin(),buf.end(), tf);
477  const vector<string> line(tok.begin(), tok.end());
478  if (line.size()>uniqueVals.size())
479  uniqueVals.resize(std::min(maxColumn, line.size()));
480  for (size_t i=0; i<std::min(line.size(), uniqueVals.size()); ++i)
481  uniqueVals[i].insert(h(line[i]));
482  starts.push_back(firstNumerical(line));
483  nCols=std::max(nCols, line.size());
484  if (starts.back()==line.size())
485  m_nRowAxes=row;
486  if (starts.size()-1 < firstEmpty && starts.back()<nCols && emptyTail(line, starts.back()))
487  firstEmpty=starts.size()-1;
488  }
489  // compute average of starts, then look for first row that drops below average
490  double sum=0;
491  for (unsigned long i=0; i<starts.size(); ++i)
492  sum+=starts[i];
493  const double av=sum/(starts.size());
494  for (; starts.size()>m_nRowAxes && (starts[m_nRowAxes]>av);
495  ++m_nRowAxes);
496  // if nRowAxes exceeds numInitialLines, assume first row is a header row, and that that is all there is.
497  if (m_nRowAxes>=row-1) m_nRowAxes=1;
498  m_nColAxes=0;
499  for (size_t i=nRowAxes(); i<starts.size(); ++i)
500  m_nColAxes=std::max(m_nColAxes,starts[i]);
501  // if more than 1 data column, treat the first row as an axis row
502  if (m_nRowAxes==0 && nCols-m_nColAxes>1)
503  m_nRowAxes=1;
504 
505  if (firstEmpty==m_nRowAxes) ++m_nRowAxes; // allow for possible colAxes header line
506  headerRow=nRowAxes()>0? nRowAxes()-1: 0;
507  size_t i=0;
508  dimensionCols.clear();
509  for (; i<nColAxes() && i<maxColumn; ++i) dimensionCols.insert(i);
510  dataCols.clear();
511  for (; i<nCols && i<maxColumn; ++i) dataCols.insert(i);
512  return !input;
513 }
std::size_t nColAxes() const
start column of the data area
Definition: CSVParser.h:48
Definition: input.py:1
size_t firstNumerical(const vector< string > &v)
Definition: CSVParser.cc:305
size_t firstEmpty
Definition: CSVParser.h:106
bool emptyTail(const vector< string > &v, size_t start)
Definition: CSVParser.cc:328
std::size_t headerRow
number of header rows
std::size_t m_nColAxes
Definition: CSVParser.h:39
std::vector< size_t > starts
Definition: CSVParser.h:103
std::size_t maxColumn
maximum number of columns that can be configured independently. Columns after this limit are treated ...
Definition: CSVParser.h:43
std::size_t nRowAxes() const
start row of the data area
Definition: CSVParser.h:46
std::set< unsigned > dataCols
std::size_t m_nRowAxes
Definition: CSVParser.h:39
void populateFromRavelMetadata(const std::string &metadata, const std::string &horizontalName, std::size_t row)
populates this spec from a "RavelHypercube" entry, row is the row being read, used to set the headerR...
Definition: CSVParser.cc:551
std::set< unsigned > dimensionCols
rows and columns that are comment lines to be ignored
Here is the call graph for this function:

◆ setDataArea()

void DataSpec::setDataArea ( std::size_t  row,
std::size_t  col 
)

set top left cell of the data area

Definition at line 370 of file CSVParser.cc.

Referenced by operator=().

371 {
372  m_nRowAxes=row;
373  m_nColAxes=std::min(col, maxColumn);
374  numCols=std::max(numCols, m_nColAxes);
375  if (headerRow>=row)
376  headerRow=row>0? row-1: 0;
377  if (dimensions.size()<nColAxes()) dimensions.resize(nColAxes());
378  if (dimensionNames.size()<nColAxes()) dimensionNames.resize(nColAxes());
379  // remove any dimensionCols > nColAxes
380  dimensionCols.erase(dimensionCols.lower_bound(nColAxes()), dimensionCols.end());
381  // adjust ignored columns
382  for (unsigned i=0; i<m_nColAxes; ++i)
383  dataCols.erase(i);
384  for (unsigned i=m_nColAxes; i<numCols && i<maxColumn; ++i)
385  dataCols.insert(i);
386 }
std::size_t nColAxes() const
start column of the data area
Definition: CSVParser.h:48
std::size_t numCols
number of columns in CSV. Must be > dataColOffset
std::vector< civita::Dimension > dimensions
std::size_t headerRow
number of header rows
std::size_t m_nColAxes
Definition: CSVParser.h:39
std::vector< std::string > dimensionNames
std::size_t maxColumn
maximum number of columns that can be configured independently. Columns after this limit are treated ...
Definition: CSVParser.h:43
std::set< unsigned > dataCols
std::size_t m_nRowAxes
Definition: CSVParser.h:39
std::set< unsigned > dimensionCols
rows and columns that are comment lines to be ignored
Here is the caller graph for this function:

◆ toggleDimension()

void minsky::DataSpec::toggleDimension ( std::size_t  c)
inline

Definition at line 65 of file CSVParser.h.

References minsky::DataSpecSchema::dimensionCols.

65  {
66  auto i=dimensionCols.find(c);
67  if (i==dimensionCols.end())
68  dimensionCols.insert(c);
69  else
70  dimensionCols.erase(i);
71  }
std::set< unsigned > dimensionCols
rows and columns that are comment lines to be ignored

◆ toSchema()

DataSpecSchema minsky::DataSpec::toSchema ( )
inline

Definition at line 51 of file CSVParser.h.

References minsky::DataSpecSchema::dataColOffset, minsky::DataSpecSchema::dataRowOffset, nColAxes(), and nRowAxes().

51  {
54  return *this;
55  }
std::size_t nColAxes() const
start column of the data area
Definition: CSVParser.h:48
std::size_t nRowAxes() const
start row of the data area
Definition: CSVParser.h:46
Here is the call graph for this function:

◆ uniqueValues()

const std::vector<size_t>& minsky::DataSpec::uniqueValues ( ) const
inline

number of unique values in each column, corrected for the header row; may be slightly inaccurate if the header row contains one of the values

Definition at line 92 of file CSVParser.h.

References m_uniqueValues.

92 {return m_uniqueValues;}
std::vector< size_t > m_uniqueValues
number of unique values in each column
Definition: CSVParser.h:109

Member Data Documentation

◆ firstEmpty

size_t minsky::DataSpec::firstEmpty =std::numeric_limits<size_t>::max()
private

Definition at line 106 of file CSVParser.h.

◆ m_nColAxes

std::size_t minsky::DataSpec::m_nColAxes =0
private

Definition at line 39 of file CSVParser.h.

Referenced by nColAxes().

◆ m_nRowAxes

std::size_t minsky::DataSpec::m_nRowAxes =0
private

Definition at line 39 of file CSVParser.h.

Referenced by nRowAxes().

◆ m_uniqueValues

std::vector<size_t> minsky::DataSpec::m_uniqueValues
private

number of unique values in each column

Definition at line 109 of file CSVParser.h.

Referenced by uniqueValues().

◆ maxColumn

std::size_t minsky::DataSpec::maxColumn =1000

maximum number of columns that can be configured independently. Columns after this limit are treated as "data"

Definition at line 43 of file CSVParser.h.

Referenced by minsky::ParseCSV< P >::parse().

◆ nCols

size_t minsky::DataSpec::nCols =0
private

Definition at line 104 of file CSVParser.h.

◆ row

size_t minsky::DataSpec::row =0
private

Definition at line 105 of file CSVParser.h.

◆ starts

std::vector<size_t> minsky::DataSpec::starts
private

Definition at line 103 of file CSVParser.h.


The documentation for this class was generated from the following files: