23 #include "CSVParser.rcd" 24 #include "dataSpecSchema.rcd" 25 #include "dimension.rcd" 30 #include <memoryapi.h> 34 #include <sys/resource.h> 40 #include <boost/type_traits.hpp> 41 #include <boost/tokenizer.hpp> 42 #include <boost/token_functions.hpp> 43 #include <boost/pool/pool.hpp> 49 class Traits = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type >
58 return Traits::eq(e_,c);
68 return std::find_if(escape_.begin(),escape_.end(),
f)!=escape_.end();
72 return std::find_if(c_.begin(),c_.end(),
f)!=c_.end();
76 return std::find_if(quote_.begin(),quote_.end(),
f)!=quote_.end();
// do_escape: invoked from the tokenizer's operator() when is_escape(*next)
// holds; appends to `tok` depending on the character under `next`.
// Only part of the body is present in this listing (the leading numbers are
// listing line numbers and intermediate lines are missing).
78 template <
typename iterator,
typename Token>
79 void do_escape(iterator& next,iterator end,Token& tok) {
// The visible branches test the current character against 'n', the quote
// set and the escape set respectively.
83 if (Traits::eq(*next,
'n')) {
87 if (is_quote(*next)) {
95 if (is_escape(*next)) {
// NOTE(review): the two operands are added BEFORE += is applied. If both are
// character types, this appends a single character whose code is the sum of
// the escape character and *next, rather than the escape character followed
// by *next. Confirm intent; two separate appends would pass both verbatim.
100 tok+=escape_.front()+*next;
106 Char c =
',',Char q =
'\"')
107 : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { }
110 : escape_(e), c_(c), quote_(q), last_(false) { }
114 template <
typename InputIterator,
typename Token>
115 bool operator()(InputIterator& next,InputIterator end,Token& tok) {
116 bool bInQuote =
false;
129 if (is_escape(*next)) {
130 do_escape(next,end,tok);
132 else if (is_c(*next)) {
143 else if (is_quote(*next)) {
163 escape(escape), quote(quote) {}
// Fragment of a tokenizer loop (presumably the space-separated parser's
// operator() - the enclosing signature is not visible here).
173 else if (*next==quote)
// Unquoted whitespace is handled here; the following line skips a run of it.
175 else if (!quoted && isspace(*next))
// NOTE(review): this skip loop does not compare `next` against the end
// iterator on this line, so a line ending in whitespace may dereference the
// end position - confirm against the full source.
// NOTE(review): isspace requires a value representable as unsigned char (or
// EOF); if *next is a plain char holding a byte >= 0x80 this is undefined
// behaviour - consider casting to unsigned char.
177 while (isspace(*next)) ++next;
// Constructs from an `any`: initialises the `any` sub-object from x and
// stores the result of x.hash() in the `hash` member at construction time.
195 Any(
const any& x): any(x), hash(x.hash()) {}
// Ordering: views *this as a `const any&` and compares it with x using `<`.
// Only the left operand is cast explicitly; x is passed as an Any.
196 bool operator<(
const Any& x)
const {
return static_cast<const any&
>(*this)<x;}
// Equality: both operands are viewed as `const any&` and compared with `==`.
// The stored `hash` member is not consulted here.
197 bool operator==(
const Any& x)
const {
return static_cast<const any&
>(*this)==
static_cast<const any&
>(x);}
210 auto i=tokens.find(x);
213 i=tokens.emplace(x, tokenRefs.size()).first;
214 tokenRefs.push_back(&(i->first));
219 if (i<tokenRefs.size())
return *tokenRefs[i];
// Key: a vector of SliceLabelToken values allocated through LibCAllocator.
225 using Key=vector<SliceLabelToken, LibCAllocator<SliceLabelToken>>;
// Map<V>: an ordered map from Key to V, compared with less<Key> and also
// allocated through LibCAllocator.
226 template <
class V>
using Map=map<Key,V,less<Key>,LibCAllocator<pair<const Key,V>>>;
// Returns a fixed two-line message (string literal) stating that no data
// columns were specified and suggesting the counter option.
230 const char*
what() const noexcept
override {
return "No data columns specified\nIf dataset has no data, try selecting counter";}
235 std::string msg=
"Duplicate key";
240 msg+=
"\nTry selecting a different duplicate key action";
// Returns the text held in the `msg` member. The pointer refers to msg's
// internal buffer, so it is only valid while msg is alive and unmodified.
242 const char*
what() const noexcept
override {
return msg.c_str();}
// Records the offending field text, the dimension type and the column name,
// then builds `msg` as:
//   "Invalid data: <data> for <type> dimensioned column: <colName>"
// The parameters share their names with the members they initialise; inside
// the body the unqualified names refer to the parameters.
251 InvalidData(
const string& data,
const string& type,
const string& colName):
252 data(data), type(type), colName(colName)
253 {msg=
"Invalid data: "+data+
" for "+type+
" dimensioned column: "+colName;}
// Returns the text held in the `msg` member. The pointer refers to msg's
// internal buffer, so it is only valid while msg is alive and unmodified.
254 const char*
what() const noexcept
override {
return msg.c_str();}
259 std::string msg=
"Short line";
// Returns the text held in the `msg` member. The pointer refers to msg's
// internal buffer, so it is only valid while msg is alive and unmodified.
264 const char*
what() const noexcept
override {
return msg.c_str();}
// Returns a fixed message (string literal) reporting memory exhaustion and
// suggesting a lower rank.
269 const char*
what() const noexcept
override {
return "exhausted memory - try reducing the rank";}
// Fragment of a string-to-double conversion (apparently the body of
// quotedStoD, which it calls recursively; the signature is not visible here).
// Step 1: if the first and last characters are equal and not alphanumeric,
// treat them as surrounding quotes and convert the interior.
// NOTE(review): a one-character non-alphanumeric string (e.g. "-") also
// satisfies this test; s.size()-2 then wraps around as an unsigned value and
// substr(1, ...) yields an empty string - confirm this is intended.
275 if (!s.empty() && s[0]==s[s.size()-1] && !isalnum(s[0]))
277 const double r=
quotedStoD(s.substr(1,s.size()-2),charsProcd);
// Step 2: an empty string converts to NaN.
281 if (s.empty())
return nan(
"");
// Step 3: try converting the whole string; charsProcd receives the number of
// characters stod consumed and is compared with the full length.
284 const double r=stod(s,&charsProcd);
285 if (charsProcd==s.size())
// Step 4: otherwise retry from the first digit, sign, comma or period.
// NOTE(review): find_first_of returns npos when none of these characters is
// present, and substr(npos) throws std::out_of_range - verify callers expect
// an exception in that case.
290 auto n=s.find_first_of(
"0123456789,.+-");
291 return stod(s.substr(n),&charsProcd);
298 if (!isspace(c) && c!=
',' && c!=
'.')
308 for (
size_t i=0; i<v.size(); ++i)
330 for (
size_t i=start; i<v.size(); ++i)
331 if (!v[i].empty())
return false;
346 catch (...) {
return false;}
347 return charsProcd==stripped.size();
// Hash specialisation for vector<T> (fragment): folds the std::hash<T> of
// every element into `r` with XOR.
// NOTE(review): XOR is commutative and self-inverse, so this hash does not
// depend on element order and two equal elements cancel each other out;
// fine for correctness, but it may increase collisions.
359 struct hash<vector<T>>
363 for (
auto& i: x) r^=std::hash<T>()(i);
373 m_nColAxes=std::min(col, maxColumn);
374 numCols=std::max(numCols, m_nColAxes);
376 headerRow=row>0? row-1: 0;
377 if (dimensions.size()<nColAxes()) dimensions.resize(nColAxes());
378 if (dimensionNames.size()<nColAxes()) dimensionNames.resize(nColAxes());
380 dimensionCols.erase(dimensionCols.lower_bound(nColAxes()), dimensionCols.end());
382 for (
unsigned i=0; i<m_nColAxes; ++i)
384 for (
unsigned i=m_nColAxes; i<numCols && i<maxColumn; ++i)
389 template <
class TokenizerFunction>
395 firstEmpty=numeric_limits<size_t>::max();
402 vector<set<size_t,less<size_t>,LibCAllocator<size_t>>> uniqueVals;
403 m_uniqueValues.clear();
409 m_uniqueValues.resize(uniqueVals.size());
410 for (
size_t i=0; i<uniqueVals.size(); ++i) m_uniqueValues[i]=uniqueVals[i].size();
414 pu.
setProgress(
double(remainingInput.tellg())/fileSize);
417 m_uniqueValues.resize(uniqueVals.size());
418 for (
size_t i=0; i<uniqueVals.size(); ++i) m_uniqueValues[i]=uniqueVals[i].size();
422 pu.
setProgress(
double(remainingInput.tellg())/fileSize);
424 catch (std::exception&)
427 throw std::runtime_error(
"CSV format guess terminated by user, best guess specification used.");
435 givenTFguessRemainder(initialInput, remainingInput,
SpaceSeparatorParser(escape,separator,quote),fileSize);
437 givenTFguessRemainder(initialInput, remainingInput,
Parser(escape,separator,quote),fileSize);
440 template <
class TokenizerFunction,
class UniqueVals>
444 const hash<string> h;
445 for (; getline(
input, buf) && row<until; ++row)
447 if (buf.empty())
continue;
449 if (buf.back()==
'\r') buf=buf.substr(0,buf.size()-1);
453 static const regex ravelHypercube(
"\"RavelHypercube=(.*)\"");
454 if (regex_match(buf, match, ravelHypercube))
457 string metadata=match[1];
459 metadata.erase(
remove(metadata.begin(),metadata.end(),
'\\'),metadata.end());
460 string horizontalName;
462 static const regex re(
"HorizontalDimension=\"(.*)\"");
463 if (regex_match(buf, match, re))
465 horizontalName=match[1];
468 populateFromRavelMetadata(metadata, horizontalName, row);
476 const boost::tokenizer<TokenizerFunction> tok(buf.begin(),buf.end(), tf);
477 const vector<string> line(tok.begin(), tok.end());
478 if (line.size()>uniqueVals.size())
479 uniqueVals.resize(std::min(maxColumn, line.size()));
480 for (
size_t i=0; i<std::min(line.size(), uniqueVals.size()); ++i)
481 uniqueVals[i].insert(h(line[i]));
483 nCols=std::max(nCols, line.size());
484 if (starts.back()==line.size())
486 if (starts.size()-1 < firstEmpty && starts.back()<nCols &&
emptyTail(line, starts.back()))
487 firstEmpty=starts.size()-1;
491 for (
unsigned long i=0; i<starts.size(); ++i)
493 const double av=sum/(starts.size());
494 for (; starts.size()>m_nRowAxes && (starts[m_nRowAxes]>av);
497 if (m_nRowAxes>=row-1) m_nRowAxes=1;
499 for (
size_t i=nRowAxes(); i<starts.size(); ++i)
500 m_nColAxes=std::max(m_nColAxes,starts[i]);
502 if (m_nRowAxes==0 && nCols-m_nColAxes>1)
505 if (firstEmpty==m_nRowAxes) ++m_nRowAxes;
506 headerRow=nRowAxes()>0? nRowAxes()-1: 0;
508 dimensionCols.clear();
509 for (; i<nColAxes() && i<maxColumn; ++i) dimensionCols.insert(i);
511 for (; i<nCols && i<maxColumn; ++i) dataCols.insert(i);
519 size_t numCommas=0, numSemicolons=0, numTabs=0;
522 ostringstream streamBuf;
539 istringstream inputCopy(streamBuf.str());
540 if (numCommas>0.9*row && numCommas>numSemicolons && numCommas>numTabs)
541 guessRemainder(inputCopy,
input,
',',fileSize);
542 else if (numSemicolons>0.9*row && numSemicolons>numTabs)
543 guessRemainder(inputCopy,
input,
';',fileSize);
544 else if (numTabs>0.9*row)
545 guessRemainder(inputCopy,
input,
'\t',fileSize);
547 guessRemainder(inputCopy,
input,
' ',fileSize);
553 vector<NamedDimension> ravelMetadata;
554 json(ravelMetadata,metadata);
556 setDataArea(headerRow, ravelMetadata.size());
557 dimensionNames.clear();
559 for (
auto& i: ravelMetadata)
560 if (i.name==horizontalName)
562 horizontalDimension=i.dimension;
563 horizontalDimName=i.name;
567 dimensions.push_back(i.dimension);
568 dimensionNames.push_back(i.name);
570 for (
size_t i=0; i<dimensions.size(); ++i)
571 dimensionCols.insert(i);
579 if (!buf.empty() && buf.back()==
'\r')
580 buf.erase(buf.size()-1);
587 bool r=getline(
input,line).good();
595 if (quoteCount%2==0)
break;
597 r=getline(
input,buf).good();
602 return r || !line.empty();
608 for (
size_t i=1; i<line.size(); ++i)
609 if (line[i]==spec.
quote && line[i-1]==spec.
quote &&
610 ((i==1 && (i==line.size()-1|| line[i+1]!=spec.
quote)) ||
612 ((line[i-2]!=spec.
quote && line[i-2]!=spec.
escape &&
613 (line[i-2]!=spec.
separator || i==line.size()-1|| line[i+1]!=spec.
quote))
// Error callback that simply throws the exception object it is given.
// `row` is accepted but not used. `throw ex` throws a copy whose type is the
// static type E of the argument.
623 template <
class E>
void operator()(
const E& ex,
size_t row) {
throw ex;}
635 vector<unordered_map<typename Key::value_type, size_t>>
dimLabels;
643 parse(
input,spec,fileSize,onError,checkValues);
647 ParseCSV(
const vector<string>& filenames,
const DataSpec& spec, uintmax_t, E& onError,
bool checkValues=
false)
650 for (
auto&
f: filenames)
655 parse(
input,spec,std::filesystem::file_size(
f),onError);
657 catch (
const std::exception& ex)
660 throw std::runtime_error(
f+
": "+ex.what());
667 void parse(istream&
input,
const DataSpec& spec, uintmax_t fileSize, E& onError,
bool checkValues=
false)
678 uintmax_t bytesRead=0;
682 if (hc.xvectors.empty())
687 hc.xvectors.back().dimension=spec.
dimensions[i];
717 const boost::tokenizer<P> tok(buf.begin(), buf.end(), csvParser);
718 auto field=tok.begin();
719 for (
size_t i=0; i<spec.
dimensionNames.size() && field!=tok.end(); ++i, ++field);
720 for (; field!=tok.end(); ++field)
722 horizontalLabels.emplace_back(sliceLabelTokens[
""]);
724 horizontalLabels.emplace_back
731 set<typename Key::value_type> uniqueLabels;
732 dimLabels.emplace_back();
733 for (
auto& i: horizontalLabels)
734 if (!sliceLabelTokens[i].empty() && uniqueLabels.insert(i).second)
736 dimLabels.back()[i]=hc.xvectors.back().size();
737 hc.xvectors.back().emplace_back(sliceLabelTokens[i]);
750 auto blankToken=sliceLabelTokens[
""];
754 const boost::tokenizer<P> tok(buf.begin(), buf.end(), csvParser);
757 size_t dim=0, dataCols=0;
759 for (
auto field=tok.begin(); field!=tok.end(); ++col, ++field)
763 if (spec.
dimensions[dim].type!=Dimension::string && field->empty())
764 goto invalidKeyGotoNextLine;
766 if (dim>=hc.xvectors.size())
767 hc.xvectors.emplace_back(
"?");
770 auto trimmedField=
trimWS(*field);
771 if (trimmedField.empty() && spec.
dimensions[col].type!=Dimension::string)
773 auto keyElem=anyVal[dim](trimmedField);
775 if (dimLabels[dim].emplace(sliceLabelTokens[skeyElem], dimLabels[dim].size()).second)
776 hc.xvectors[dim].emplace_back(keyElem);
777 key.emplace_back(sliceLabelTokens[skeyElem]);
782 goto invalidKeyGotoNextLine;
788 if (key.size()<hc.rank()-tabularFormat)
791 goto invalidKeyGotoNextLine;
792 onError(ShortLine(key,sliceLabelTokens),row);
796 for (
auto field=tok.begin(); field!=tok.end(); ++col,++field)
801 if (horizontalLabels[dataCols]==blankToken)
803 key.emplace_back(horizontalLabels[dataCols]);
813 else if (!checkValues &&
814 ((s.empty() && (!isdigit(c)&&c!=
'-'&&c!=
'+')) ||
815 ((s==
"-"||s==
"+") && !isdigit(c))))
817 else if (!isspace(c) && c!=
'.' && c!=
',')
821 bool valueExists=!s.empty() && s!=
"\\N" && (isdigit(s[0])||s[0]==
'-'||s[0]==
'+'||s[0]==
'.');
822 if (checkValues && !valueExists && !s.empty() && s!=
"\\N")
831 auto i=tmpData.find(key);
838 if (checkValues && end<s.length())
840 if (i==tmpData.end())
842 tmpData.emplace(key,v);
843 onError.rowKeyInsert(key,row);
846 catch (
const std::bad_alloc&)
850 if (checkValues) onError(InvalidData(s,
"value",spec.
dimensionNames[col]),row);
854 if (valueExists && i!=tmpData.end())
859 onError(DuplicateKey(key,sliceLabelTokens),row);
877 i->second=((c+1)*i->second + v)/(c+2);
896 onError(ShortLine(key,sliceLabelTokens),row);
900 bytesRead+=buf.size();
902 invalidKeyGotoNextLine:;
906 catch (
const std::bad_alloc&)
908 throw MemoryExhausted();
910 catch (
const std::length_error&)
912 throw MemoryExhausted();
914 catch (
const std::exception& ex)
919 throw std::runtime_error(msg);
924 template <
class P,
class E,
class S>
935 throw runtime_error(
"Duplicate dimension: "+spec.
dimensionNames[i]);
942 auto& tmpData=parseCSV.
tmpData;
944 auto& hc=parseCSV.
hc;
947 auto d=dimLabels.begin();
948 assert(hc.xvectors.size()==dimLabels.size());
// Fragment: walks hc.xvectors with an explicit iterator and no increment in
// the for-header, erasing selected entries (the selection condition is on a
// line missing from this listing).
949 for (
auto i=hc.xvectors.begin(); i!=hc.xvectors.end();)
// NOTE(review): the iterator returned by erase() is discarded. For
// vector-like containers `i` is invalidated by the erase, so reusing it in
// the loop condition is formally undefined; `i=hc.xvectors.erase(i)` is the
// safe form - confirm the container type and intent.
952 hc.xvectors.erase(i);
959 assert(hc.xvectors.size()<=dimLabels.size());
961 for (
auto& xv: hc.xvectors)
962 xv.imposeDimension();
965 if (hc.logNumElements()>
log(numeric_limits<size_t>::max()))
966 throw runtime_error(
"Hypercube dimensionality exceeds maximum size, results are likely to be garbage.\n" 967 "Suggest rolling up one or more axes by ignoring them, and setting 'Duplicate Key Action' as appropriate");
969 if (
log(tmpData.size())-hc.logNumElements()>=
log(0.5))
973 throw MemoryExhausted();
982 for (
auto& i: tmpData)
985 assert (hc.rank()<=i.first.size());
986 assert(dimLabels.size()>=hc.rank());
987 int j=hc.rank()-1, k=i.first.size()-1;
990 while (dimLabels[k].size()<2) --k;
991 auto dimLabel=dimLabels[k].find(i.first[k]);
992 assert(dimLabel!=dimLabels[k].end());
993 idx = (idx*dims[j]) + dimLabel->second;
1003 throw MemoryExhausted();
1004 auto dims=hc.dims();
1007 map<size_t,double,less<size_t>,LibCAllocator<pair<const size_t,double>>> indexValue;
1009 for (
auto& i: tmpData)
1012 assert (dims.size()<=i.first.size());
1013 assert(dimLabels.size()>=dims.size());
// Fragment: converts a key (i.first) into a linear index `idx`, starting from
// the last dimension (j) and the last key element (k). The updates of j and k
// between iterations are on lines missing from this listing.
1014 int j=dims.size()-1, k=i.first.size()-1;
1015 while (j>=0 && k>=0)
// Skip key positions whose label table has fewer than two entries.
// NOTE(review): this inner loop decrements k without re-checking k>=0, so if
// every remaining dimLabels entry has size<2 the index goes negative -
// confirm the asserts in the surrounding code rule this out.
1017 while (dimLabels[k].size()<2) --k;
// Look up the position of this key element within its dimension's labels;
// the assert documents that the label must already be registered.
1018 auto dimLabel=dimLabels[k].find(i.first[k]);
1019 assert(dimLabel!=dimLabels[k].end());
// Accumulate: previous index scaled by this dimension's size plus the
// label's position.
1020 idx = (idx*dims[j]) + dimLabel->second;
1024 if (!
isnan(i.second))
1025 indexValue.emplace(idx, i.second);
1026 ++
minsky().progressState;
1030 vv.
index(indexValue);
1032 ++
minsky().progressState;
1035 minsky().progressState.title=
"Cleaning up";
1036 minsky().progressState.displayProgress();
1038 catch (
const std::bad_alloc&)
1040 throw MemoryExhausted();
1042 catch (
const std::length_error&)
1044 throw MemoryExhausted();
1053 loadValueFromCSVFileT<SpaceSeparatorParser>(v,filenames,spec,onError);
1055 loadValueFromCSVFileT<Parser>(v,filenames,spec,onError);
// Returns a fixed message (string literal) reporting that the input could
// not be rewound.
1065 const char*
what() const noexcept
override {
return "Failed to rewind input";}
// Error handler that records problems by row number instead of throwing.
// (Braces and some lines of the struct are missing from this listing.)
1077 struct ErrorReporter
// firstRow: key -> row recorded by rowKeyInsert for that key.
1079 Map<size_t> firstRow;
// duplicates: row number -> key, for rows reported as duplicate keys.
1080 map<size_t,Key> duplicates;
// invalidData: row number -> error message text.
1081 map<size_t,string> invalidData;
// Duplicate key: records both the row stored in firstRow for this key and
// the current row.
// NOTE(review): firstRow[ex.key] uses operator[], which (for a std::map, as
// the Map alias suggests) inserts a value-initialised entry (row 0) when the
// key was never registered via rowKeyInsert - confirm that cannot happen.
1082 void operator()(
const DuplicateKey& ex,
size_t row) {
1083 duplicates.emplace(firstRow[ex.key],ex.key);
1084 duplicates.emplace(row,ex.key);
// Invalid data / short line: store the exception's message against the row.
// emplace keeps an existing entry, so the first message recorded for a row
// is the one retained.
1086 void operator()(
const InvalidData& ex,
size_t row) {invalidData.emplace(row, ex.msg);}
1087 void operator()(
const ShortLine& ex,
size_t row) {invalidData.emplace(row, ex.msg);}
// Remembers the row at which a key was inserted; emplace does not overwrite,
// so only the first row registered for a key is kept.
1089 void rowKeyInsert(
const Key& key,
size_t row) {firstRow.emplace(key,row);}
1101 multimap<Key,string> duplicateLines;
1102 vector<string> invalidDataLines;
1110 if (onError.duplicates.contains(row))
1112 string msg=
"Duplicate key";
1114 duplicateLines.emplace(onError.duplicates[row],msg);
1116 if (onError.invalidData.contains(row))
1118 string msg=onError.invalidData[row];
1120 invalidDataLines.push_back(msg);
1122 bytesRead+=buf.size();
1138 output<<
"Error"<<sep;
1140 bytesRead+=buf.size();
1145 for (
auto& i: invalidDataLines)
1148 for (
auto& i: duplicateLines)
1149 output<<i.second<<endl;
1153 if (!onError.duplicates.contains(row) && !onError.invalidData.contains(row))
1154 output<<sep+buf<<endl;
1155 bytesRead+=buf.size();
1164 reportFromCSVFileT<SpaceSeparatorParser>(
input,output,spec,fileSize);
1166 reportFromCSVFileT<Parser>(
input,output,spec,fileSize);
void loadValueFromCSVFileS(VariableValue &v, S &filenames, const DataSpec &spec)
map< Key, V, less< Key >, LibCAllocator< pair< const Key, V > >> Map
std::size_t nColAxes() const
start column of the data area
size_t operator()(const Any &x) const
void reportFromCSVFile(istream &input, ostream &output, const DataSpec &spec, uintmax_t fileSize)
creates a report CSV file from input, with errors sorted at beginning of file, with a column for error...
bool operator()(I &next, I end, std::string &tok)
vector< const string * > tokenRefs
DuplicateKeyAction duplicateKeyAction
boost::tokenizer< Parser > Tokenizer
const Hypercube & hypercube() const override
string stripWSAndDecimalSep(const string &s)
handle reporting errors in loadValueFromCSVFileT when loading files
escapedListSeparator::EscapedListSeparator< char > Parser
size_t firstNumerical(const vector< string > &v)
CLASSDESC_ACCESS_EXPLICIT_INSTANTIATION(minsky::DataSpec)
Tokens< SliceLabelToken > sliceLabelTokens
std::size_t numCols
number of columns in CSV. Must be > dataColOffset
bool operator==(const Any &x) const
double quotedStoD(const string &s, size_t &charsProcd)
T operator[](const string &x)
void do_escape(iterator &next, iterator end, Token &tok)
EscapedListSeparator(Char e='\\', Char c=',', Char q='\"')
civita::Dimension horizontalDimension
void escapeDoubledQuotes(std::string &line, const DataSpec &spec)
replace doubled quotes with escaped quotes
DuplicateKey(const Key &x, const Tokens< SliceLabelToken > &tokens)
std::vector< civita::Dimension > dimensions
const char * what() const noexcept override
bool operator<(const Any &x) const
bool emptyTail(const vector< string > &v, size_t start)
vector< unordered_map< typename Key::value_type, size_t > > dimLabels
bool counter
count data items, not read their values
std::basic_string< Char, Traits > string_type
ParseCSV(istream &input, const DataSpec &spec, uintmax_t fileSize, E &onError, bool checkValues=false)
bool isNumerical(const std::string &s)
std::string horizontalDimName
bool dontFail
do not throw an error on corrupt data
bool processChunk(std::istream &input, const T &tf, size_t until, U &)
process chunk of input, updating guessed spec
EscapedListSeparator(string_type e, string_type c, string_type q)
Creation and access to the minskyTCL_obj object, which has code to record whenever Minsky's state cha...
void guessRemainder(std::istream &initialInput, std::istream &remainingInput, char separator, uintmax_t fileSize)
figure out the tokenizer function and call givenTFguessRemainder
TensorVal tensorInit
when init is a tensor of values, this overrides the init string
std::string trimWS(const std::string &s)
const char * what() const noexcept override
std::size_t headerRow
number of header rows
SpaceSeparatorParser(char escape='\\', char sep=' ', char quote='"')
void setProgress(double fraction)
Sets the progress to a given fraction of this stack's allocation.
const char * what() const noexcept override
void parse(istream &input, const DataSpec &spec, uintmax_t fileSize, E &onError, bool checkValues=false)
const Index & index(Index &&i) override
std::string str(T x)
utility function to create a string representation of a numeric type
void reportFromCSVFileT(istream &input, ostream &output, const DataSpec &spec, uintmax_t fileSize)
string colName
column name
const Minsky & cminsky()
const version to help in const correctness
string data
data received in field
size_t operator()(const vector< T > &x) const
void loadValueFromCSVFile(VariableValue &v, const vector< string > &filenames, const DataSpec &spec)
load a variableValue from a list of files according to data spec
void loadValueFromCSVFileT(VariableValue &vv, S &stream, const DataSpec &spec, E &onError)
vector< typename Key::value_type > horizontalLabels
std::vector< std::string > dimensionNames
void givenTFguessRemainder(std::istream &initialInput, std::istream &remainingInput, const T &tf, uintmax_t fileSize)
try to fill in remainder of spec, given a tokenizer function tf eg boost::escaped_list_separator<char...
ParseCSV(const vector< string > &filenames, const DataSpec &spec, uintmax_t, E &onError, bool checkValues=false)
const string & operator[](T i) const
const char * what() const noexcept override
const char * what() const noexcept override
std::size_t maxColumn
maximum number of columns that can be configured independently. Columns after this limit are treated ...
bool getWholeLine(istream &input, string &line, const DataSpec &spec)
InvalidData(const string &data, const string &type, const string &colName)
std::size_t nRowAxes() const
start row of the data area
bool operator()(InputIterator &next, InputIterator end, Token &tok)
vector< SliceLabelToken, LibCAllocator< SliceLabelToken > > Key
void rowKeyInsert(const Key &, size_t)
update a map of keys to first rows for duplicate key processing
string to_string(CONST84 char *x)
const char * what() const noexcept override
Minsky & minsky()
global minsky object
void guessFromStream(std::istream &file, uintmax_t fileSize=uintmax_t(-1))
initial stab at dataspec from examining stream
void operator()(const E &ex, size_t row)
called on error - ex message to pass on, row - current row
std::set< unsigned > dataCols
Map< double > tmpData
map of data by key
ShortLine(const Key &x, const Tokens< SliceLabelToken > &tokens)
unordered_map< string, T > tokens
static const unsigned numInitialLines
void setDataArea(std::size_t row, std::size_t col)
set top left cell of the data area
void populateFromRavelMetadata(const std::string &metadata, const std::string &horizontalName, std::size_t row)
populates this spec from a "RavelHypercube" entry, row is the row being read, used to set the headerR...
std::set< unsigned > dimensionCols
rows and columns that are comment lines to be ignored