Minsky
|
#include <CSVParser.h>
Public Member Functions | |
std::size_t | nRowAxes () const |
start row of the data area More... | |
std::size_t | nColAxes () const |
start column of the data area More... | |
DataSpecSchema | toSchema () |
DataSpec & | operator= (const DataSpecSchema &x) |
void | toggleDimension (std::size_t c) |
void | setDataArea (std::size_t row, std::size_t col) |
set top left cell of the data area More... | |
void | guessFromStream (std::istream &file, uintmax_t fileSize=uintmax_t(-1)) |
initial stab at dataspec from examining stream More... | |
void | guessFromFile (const std::string &fileName) |
initial stab at dataspec from examining file More... | |
void | populateFromRavelMetadata (const std::string &metadata, const std::string &horizontalName, std::size_t row) |
populates this spec from a "RavelHypercube" entry. row is the row being read, and is used to set the headerRow attribute. If horizontalName is one of the dimensions, data is written in a tabular format. More... | |
const std::vector< size_t > & | uniqueValues () const |
number of unique values in each column, corrected for the header row; may be slightly inaccurate if the header row contains one of the values More... | |
template<class TokenizerFunction > | |
void | givenTFguessRemainder (std::istream &initialInput, std::istream &remainingInput, const TokenizerFunction &tf, uintmax_t fileSize) |
template<class TokenizerFunction , class UniqueVals > | |
bool | processChunk (std::istream &input, const TokenizerFunction &tf, size_t until, UniqueVals &uniqueVals) |
Public Attributes | |
std::size_t | maxColumn =1000 |
maximum number of columns that can be configured independently. Columns after this limit are treated as "data" More... | |
![]() | |
std::size_t | dataRowOffset |
std::size_t | dataColOffset |
std::size_t | numCols =0 |
number of columns in CSV. Must be > dataColOffset More... | |
char | separator =',' |
char | quote ='"' |
char | escape ='\0' |
char | decSeparator ='.' |
bool | mergeDelimiters =false |
bool | counter =false |
count data items rather than reading their values More... | |
bool | dontFail =false |
do not throw an error on corrupt data More... | |
double | missingValue =nan("") |
std::size_t | headerRow =0 |
number of header rows More... | |
std::string | horizontalDimName ="?" |
civita::Dimension | horizontalDimension |
DuplicateKeyAction | duplicateKeyAction =throwException |
std::set< unsigned > | dimensionCols |
rows and columns that are comment lines to be ignored More... | |
std::set< unsigned > | dataCols |
std::vector< civita::Dimension > | dimensions |
std::vector< std::string > | dimensionNames |
Private Member Functions | |
CLASSDESC_ACCESS (DataSpec) | |
template<class T > | |
void | givenTFguessRemainder (std::istream &initialInput, std::istream &remainingInput, const T &tf, uintmax_t fileSize) |
try to fill in the remainder of the spec, given a tokenizer function tf, e.g. boost::escaped_list_separator<char> tf(escape,separator,quote) More... | |
void | guessRemainder (std::istream &initialInput, std::istream &remainingInput, char separator, uintmax_t fileSize) |
figure out the tokenizer function and call givenTFguessRemainder More... | |
template<class T , class U > | |
bool | processChunk (std::istream &input, const T &tf, size_t until, U &) |
process chunk of input, updating guessed spec More... | |
Private Attributes | |
std::size_t | m_nRowAxes =0 |
std::size_t | m_nColAxes =0 |
std::vector< size_t > | starts |
size_t | nCols =0 |
size_t | row =0 |
size_t | firstEmpty =std::numeric_limits<size_t>::max() |
std::vector< size_t > | m_uniqueValues |
number of unique values in each column More... | |
Additional Inherited Members | |
![]() | |
enum | DuplicateKeyAction { throwException, sum, product, min, max, av } |
what to do with duplicate keys More... | |
Definition at line 37 of file CSVParser.h.
|
private |
|
private |
try to fill in the remainder of the spec, given a tokenizer function tf, e.g. boost::escaped_list_separator<char> tf(escape,separator,quote)
void minsky::DataSpec::givenTFguessRemainder | ( | std::istream & | initialInput, |
std::istream & | remainingInput, | ||
const TokenizerFunction & | tf, | ||
uintmax_t | fileSize | ||
) |
Definition at line 390 of file CSVParser.cc.
References minsky::minsky(), minsky::CSVDialog::numInitialLines, and minsky::ProgressUpdater::setProgress().
|
inline |
initial stab at dataspec from examining file
Definition at line 80 of file CSVParser.h.
References guessFromStream(), and minsky::stripByteOrderingMarker().
void DataSpec::guessFromStream | ( | std::istream & | file, |
uintmax_t | fileSize = uintmax_t(-1) | ||
) |
initial stab at dataspec from examining stream
Definition at line 517 of file CSVParser.cc.
References minsky::CSVDialog::numInitialLines.
Referenced by guessFromFile().
|
private |
figure out the tokenizer function and call givenTFguessRemainder
Definition at line 431 of file CSVParser.cc.
|
inline |
start column of the data area
Definition at line 48 of file CSVParser.h.
References m_nColAxes.
Referenced by minsky::ParseCSV< P >::parse(), and toSchema().
|
inline |
start row of the data area
Definition at line 46 of file CSVParser.h.
References m_nRowAxes.
Referenced by minsky::ParseCSV< P >::parse(), minsky::reportFromCSVFileT(), and toSchema().
|
inline |
Definition at line 57 of file CSVParser.h.
References minsky::DataSpecSchema::dataColOffset, minsky::DataSpecSchema::dataCols, minsky::DataSpecSchema::dataRowOffset, minsky::DataSpecSchema::dimensionCols, and setDataArea().
void DataSpec::populateFromRavelMetadata | ( | const std::string & | metadata, |
const std::string & | horizontalName, | ||
std::size_t | row | ||
) |
populates this spec from a "RavelHypercube" entry. row is the row being read, and is used to set the headerRow attribute. If horizontalName is one of the dimensions, data is written in a tabular format.
Definition at line 551 of file CSVParser.cc.
|
private |
process chunk of input, updating guessed spec
bool minsky::DataSpec::processChunk | ( | std::istream & | input, |
const TokenizerFunction & | tf, | ||
size_t | until, | ||
UniqueVals & | uniqueVals | ||
) |
Definition at line 441 of file CSVParser.cc.
References anonymous_namespace{CSVParser.cc}::emptyTail(), and anonymous_namespace{CSVParser.cc}::firstNumerical().
void DataSpec::setDataArea | ( | std::size_t | row, |
std::size_t | col | ||
) |
set top left cell of the data area
Definition at line 370 of file CSVParser.cc.
Referenced by operator=().
|
inline |
Definition at line 65 of file CSVParser.h.
References minsky::DataSpecSchema::dimensionCols.
|
inline |
Definition at line 51 of file CSVParser.h.
References minsky::DataSpecSchema::dataColOffset, minsky::DataSpecSchema::dataRowOffset, nColAxes(), and nRowAxes().
|
inline |
number of unique values in each column, corrected for the header row; may be slightly inaccurate if the header row contains one of the values
Definition at line 92 of file CSVParser.h.
References m_uniqueValues.
|
private |
Definition at line 106 of file CSVParser.h.
|
private |
Definition at line 39 of file CSVParser.h.
Referenced by nColAxes().
|
private |
Definition at line 39 of file CSVParser.h.
Referenced by nRowAxes().
|
private |
number of unique values in each column
Definition at line 109 of file CSVParser.h.
Referenced by uniqueValues().
std::size_t minsky::DataSpec::maxColumn =1000 |
maximum number of columns that can be configured independently. Columns after this limit are treated as "data"
Definition at line 43 of file CSVParser.h.
Referenced by minsky::ParseCSV< P >::parse().
|
private |
Definition at line 104 of file CSVParser.h.
|
private |
Definition at line 105 of file CSVParser.h.
|
private |
Definition at line 103 of file CSVParser.h.