Minsky
CSVParser.h
Go to the documentation of this file.
1 /*
2  @copyright Steve Keen 2018
3  @author Russell Standish
4  This file is part of Minsky.
5 
6  Minsky is free software: you can redistribute it and/or modify it
7  under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10 
11  Minsky is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with Minsky. If not, see <http://www.gnu.org/licenses/>.
18 */
19 #ifndef CSVPARSER_H
20 #define CSVPARSER_H
21 
22 #include "dimension.h"
23 #include "dataSpecSchema.h"
24 #include "classdesc_access.h"
25 #include "str.h"
26 
27 #include <cstddef>
28 #include <string>
29 #include <set>
30 #include <fstream>
31 #include <filesystem>
32 
33 namespace minsky
34 {
35  class VariableValue;
36 
37  class DataSpec: public DataSpecSchema
38  {
39  std::size_t m_nRowAxes=0, m_nColAxes=0;
41  public:
43  std::size_t maxColumn=1000;
44 
46  std::size_t nRowAxes() const {return m_nRowAxes;}
48  std::size_t nColAxes() const {return m_nColAxes;}
49 
50  // handle extra initialisation on conversion
54  return *this;
55  }
56 
58  DataSpecSchema::operator=(x);
60  dimensionCols=x.dimensionCols; // revert clobber by setDataArea
61  dataCols=x.dataCols; // revert clobber by setDataArea
62  return *this;
63  }
64 
65  void toggleDimension(std::size_t c) { // flip membership of column c in dimensionCols
66  auto i=dimensionCols.find(c); // look c up once; the iterator is reused for the erase below
67  if (i==dimensionCols.end())
68  dimensionCols.insert(c); // c was absent: add it
69  else
70  dimensionCols.erase(i); // c was present: remove it
71  }
72 
74  void setDataArea(std::size_t row, std::size_t col);
75 
77  void guessFromStream(std::istream& file, uintmax_t fileSize=uintmax_t(-1));
78 
80  void guessFromFile(const std::string& fileName) {
81  std::ifstream is(fileName);
83  guessFromStream(is, std::filesystem::file_size(fileName));
84  }
85 
88  void populateFromRavelMetadata(const std::string& metadata, const std::string& horizontalName, std::size_t row);
89 
92  const std::vector<size_t>& uniqueValues() const {return m_uniqueValues;}
93 
94  private:
97  template <class T>
98  void givenTFguessRemainder(std::istream& initialInput, std::istream& remainingInput, const T& tf, uintmax_t fileSize);
99 
101  void guessRemainder(std::istream& initialInput, std::istream& remainingInput, char separator, uintmax_t fileSize);
102 
103  std::vector<size_t> starts;
104  size_t nCols=0;
105  size_t row=0;
106  size_t firstEmpty=std::numeric_limits<size_t>::max();
107 
109  std::vector<size_t> m_uniqueValues;
110 
113  template <class T, class U>
114  bool processChunk(std::istream& input, const T& tf, size_t until, U&);
115  };
116 
119  void reportFromCSVFile(std::istream& input, std::ostream& output, const DataSpec& spec, uintmax_t fileSize);
120 
122  void loadValueFromCSVFile(VariableValue&,const std::vector<std::string>& filenames,const DataSpec&);
124  void loadValueFromCSVFile(VariableValue&, std::istream& input, const DataSpec&);
125 
127  void escapeDoubledQuotes(std::string&,const DataSpec&);
128 }
129 
130 #include "CSVParser.cd"
131 #include "CSVParser.xcd"
132 #endif
DataSpecSchema toSchema()
Definition: CSVParser.h:51
std::size_t nColAxes() const
start column of the data area
Definition: CSVParser.h:48
void reportFromCSVFile(istream &input, ostream &output, const DataSpec &spec, uintmax_t fileSize)
creates a report CSV file from input, with errors sorted at beginning of file, with a column for error...
Definition: CSVParser.cc:1161
void guessFromFile(const std::string &fileName)
initial stab at dataspec from examining file
Definition: CSVParser.h:80
Definition: input.py:1
const std::vector< size_t > & uniqueValues() const
number of unique values in each column corrected for header row, so may be slightly inaccurate if hea...
Definition: CSVParser.h:92
size_t firstEmpty
Definition: CSVParser.h:106
void escapeDoubledQuotes(std::string &line, const DataSpec &spec)
replace doubled quotes with escaped quotes
Definition: CSVParser.cc:605
bool processChunk(std::istream &input, const T &tf, size_t until, U &)
process chunk of input, updating guessed spec
Creation and access to the minskyTCL_obj object, which has code to record whenever Minsky's state cha...
Definition: constMap.h:22
void guessRemainder(std::istream &initialInput, std::istream &remainingInput, char separator, uintmax_t fileSize)
figure out the tokenizer function and call givenTFguessRemainder
Definition: CSVParser.cc:431
CLASSDESC_ACCESS(DataSpec)
std::size_t m_nColAxes
Definition: CSVParser.h:39
void loadValueFromCSVFile(VariableValue &v, const vector< string > &filenames, const DataSpec &spec)
load a variableValue from a list of files according to data spec
Definition: CSVParser.cc:1058
void givenTFguessRemainder(std::istream &initialInput, std::istream &remainingInput, const T &tf, uintmax_t fileSize)
try to fill in remainder of spec, given a tokenizer function tf eg boost::escaped_list_separator<char...
std::vector< size_t > starts
Definition: CSVParser.h:103
std::size_t maxColumn
maximum number of columns that can be configured independently. Columns after this limit are treated ...
Definition: CSVParser.h:43
void toggleDimension(std::size_t c)
Definition: CSVParser.h:65
std::size_t nRowAxes() const
start row of the data area
Definition: CSVParser.h:46
DataSpec & operator=(const DataSpecSchema &x)
Definition: CSVParser.h:57
void stripByteOrderingMarker(std::istream &s)
checks if the input stream has the UTF-8 byte ordering marker, and removes it if present ...
Definition: str.h:147
void guessFromStream(std::istream &file, uintmax_t fileSize=uintmax_t(-1))
initial stab at dataspec from examining stream
Definition: CSVParser.cc:517
std::set< unsigned > dataCols
std::size_t m_nRowAxes
Definition: CSVParser.h:39
std::vector< size_t > m_uniqueValues
number of unique values in each column
Definition: CSVParser.h:109
void setDataArea(std::size_t row, std::size_t col)
set top left cell of the data area
Definition: CSVParser.cc:370
void populateFromRavelMetadata(const std::string &metadata, const std::string &horizontalName, std::size_t row)
populates this spec from a "RavelHypercube" entry, row is the row being read, used to set the headerR...
Definition: CSVParser.cc:551
std::set< unsigned > dimensionCols
rows and columns that are comment lines to be ignored