Minsky
CSVDialog.cc
Go to the documentation of this file.
1 /*
2  @copyright Steve Keen 2018
3  @author Russell Standish
4  This file is part of Minsky.
5 
6  Minsky is free software: you can redistribute it and/or modify it
7  under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10 
11  Minsky is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with Minsky. If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #include "cairoItems.h"
21 #include "CSVDialog.h"
22 #include "group.h"
23 #include "selection.h"
24 #include "lasso.h"
25 #include <pango.h>
26 
27 #include "CSVDialog.rcd"
28 #include "minsky_epilogue.h"
29 #include "zStream.h"
30 #include "dimension.h"
31 #include <cstdlib>
32 #include <chrono>
33 #include <iostream>
34 #include <string>
35 #include <stdexcept>
36 #include <sstream>
37 #include <regex>
38 
39 using namespace std;
40 using namespace minsky;
41 using namespace civita;
42 using ecolab::Pango;
43 using ecolab::cairo::CairoSave;
44 
45 #include <boost/filesystem.hpp>
46 using boost::filesystem::file_size;
47 
48 const unsigned CSVDialog::numInitialLines;
49 
50 void CSVDialog::reportFromFile(const std::string& input, const std::string& output) const
51 {
52  ifstream is(input);
54  ofstream of(output);
55  reportFromCSVFile(is,of,spec,file_size(input));
56 }
57 
58 namespace
59 {
60  // manage temporary files
61  struct CacheEntry
62  {
63  chrono::time_point<chrono::system_clock> timestamp;
64  string url, filename;
65  CacheEntry(const string& url): timestamp(chrono::system_clock::now()), url(url),
66  filename(boost::filesystem::unique_path().string()) {}
68  bool operator<(const CacheEntry& x) const {return url<x.url;}
69  };
70 
71  // note: this cache will leak disk storage if Minsky is killed, not shut down cleanly
72  struct Cache: private set<CacheEntry>
73  {
74  using set<CacheEntry>::find;
75  using set<CacheEntry>::end;
76  using set<CacheEntry>::erase;
77  iterator emplace(const string& url)
78  {
79  if (size()>=10)
80  {
81  // find oldest element and erase
82  auto entryToErase=begin();
83  auto ts=entryToErase->timestamp;
84  for (auto i=begin(); i!=end(); ++i)
85  if (i->timestamp<ts)
86  {
87  ts=i->timestamp;
88  entryToErase=i;
89  }
90  erase(entryToErase);
91  }
92  return set<CacheEntry>::emplace(url).first;
93  }
94  };
95 }
96 
97 void CSVDialog::loadFile()
98 {
99  loadFileFromName(url);
100 }
101 
102 void CSVDialog::guessSpecAndLoadFile()
103 {
104  spec=DataSpec();
105  spec.guessFromFile(url);
106  loadFileFromName(url);
107  populateHeaders();
108  classifyColumns();
109 }
110 
111 void CSVDialog::loadFileFromName(const std::string& fname)
112 {
113  ifstream is(fname);
115  initialLines.clear();
116  for (size_t i=0; i<numInitialLines && is; ++i)
117  {
118  initialLines.emplace_back();
119  getline(is, initialLines.back());
120  // chomp any final '\r' character (DOS files)
121  if (!initialLines.back().empty() && initialLines.back().back()=='\r')
122  initialLines.back().erase(initialLines.back().end()-1);
123  }
124  // Ensure dimensions.size() is the same as nColAxes() upon first load of a CSV file. For ticket 974.
125  if (spec.dimensions.size()<spec.nColAxes()) spec.setDataArea(spec.nRowAxes(),spec.nColAxes());
126 }
127 
128 template <class Parser>
129 vector<vector<string>> parseLines(const Parser& parser, const vector<string>& lines, size_t maxColumn)
130 {
131  vector<vector<string>> r;
132  for (const auto& line: lines)
133  {
134  r.emplace_back();
135  try
136  {
137  const boost::tokenizer<Parser> tok(line.begin(), line.end(), parser);
138  auto t=tok.begin();
139  for (size_t i=0; i<maxColumn && t!=tok.end(); ++i, ++t)
140  r.back().push_back(*t);
141  }
142  catch (...) // if not parseable, place entire line in first cell
143  {
144  r.back().push_back(line);
145  }
146  }
147  return r;
148 }
149 
150 namespace
151 {
152  struct CroppedPango: public Pango
153  {
154  cairo_t* cairo;
155  double w, x=0, y=0;
156  CroppedPango(cairo_t* cairo, double width): Pango(cairo), cairo(cairo), w(width) {}
157  void setxy(double xx, double yy) {x=xx; y=yy;}
158  void show() {
159  const CairoSave cs(cairo);
160  cairo_rectangle(cairo,x,y,w,height());
161  cairo_clip(cairo);
162  cairo_move_to(cairo,x,y);
163  Pango::show();
164  }
165  };
166 }
167 
168 bool CSVDialog::redraw(int, int, int, int)
169 {
170  cairo_t* cairo=surface->cairo();
171  rowHeight=15;
172  vector<vector<string>> parsedLines=parseLines();
173 
174  // LHS row labels
175  {
176  Pango pango(cairo);
177  pango.setText("Dimension");
178  cairo_move_to(cairo,xoffs-pango.width()-5,0);
179  pango.show();
180  pango.setText("Type");
181  cairo_move_to(cairo,xoffs-pango.width()-5,rowHeight);
182  pango.show();
183  pango.setText("Format");
184  cairo_move_to(cairo,xoffs-pango.width()-5,2*rowHeight);
185  pango.show();
186  if (flashNameRow)
187  pango.setMarkup("<b>Name</b>");
188  else
189  pango.setText("Name");
190  cairo_move_to(cairo,xoffs-pango.width()-5,3*rowHeight);
191  pango.show();
192  pango.setText("Header");
193  cairo_move_to(cairo,xoffs-pango.width()-5,(4+spec.headerRow)*rowHeight);
194  pango.show();
195 
196  }
197 
198  CroppedPango pango(cairo, colWidth);
199  pango.setFontSize(0.8*rowHeight);
200 
201  set<size_t> done;
202  double x=xoffs, y=0;
203  size_t col=0;
204  for (; done.size()<parsedLines.size(); ++col)
205  {
206  if (col<spec.nColAxes())
207  {// dimension check boxes
208  const CairoSave cs(cairo);
209  const double cbsz=5;
210  cairo_set_line_width(cairo,1);
211  cairo_translate(cairo,x+0.5*colWidth,y+0.5*rowHeight);
212  cairo_rectangle(cairo,-cbsz,-cbsz,2*cbsz,2*cbsz);
213  if (spec.dimensionCols.contains(col))
214  {
215  cairo_move_to(cairo,-cbsz,-cbsz);
216  cairo_line_to(cairo,cbsz,cbsz);
217  cairo_move_to(cairo,cbsz,-cbsz);
218  cairo_line_to(cairo,-cbsz,cbsz);
219  }
220  cairo_stroke(cairo);
221  }
222  y+=rowHeight;
223  // type
224  if (spec.dimensionCols.contains(col) && col<spec.dimensions.size() && col<spec.nColAxes())
225  {
226  pango.setText(classdesc::enumKey<Dimension::Type>(spec.dimensions[col].type));
227  pango.setxy(x,y);
228  pango.show();
229  }
230  y+=rowHeight;
231  if (spec.dimensionCols.contains(col) && col<spec.dimensions.size() && col<spec.nColAxes())
232  {
233  pango.setText(spec.dimensions[col].units);
234  pango.setxy(x,y);
235  pango.show();
236  }
237  y+=rowHeight;
238  if (spec.dimensionCols.contains(col) && col<spec.dimensionNames.size() && col<spec.nColAxes())
239  {
240  pango.setText(spec.dimensionNames[col]);
241  pango.setxy(x,y);
242  pango.show();
243  }
244  y+=rowHeight;
245  for (size_t row=0; row<parsedLines.size(); ++row)
246  {
247  auto& line=parsedLines[row];
248  if (col<line.size())
249  {
250  const CairoSave cs(cairo);
251  pango.setText(line[col]);
252  pango.setxy(x, y);
253  if (row==spec.headerRow)
254  if (col<spec.nColAxes())
255  cairo_set_source_rgb(surface->cairo(),0,0.7,0);
256  else
257  cairo_set_source_rgb(surface->cairo(),0,0,1);
258  else if (row<spec.nRowAxes() || (col<spec.nColAxes() && !spec.dimensionCols.contains(col)))
259  cairo_set_source_rgb(surface->cairo(),1,0,0);
260  else if (col<spec.nColAxes())
261  cairo_set_source_rgb(surface->cairo(),0,0,1);
262  pango.show();
263  }
264  else
265  done.insert(row);
266  y+=rowHeight;
267  }
268  {
269  const CairoSave cs(cairo);
270  cairo_set_source_rgb(cairo,.5,.5,.5);
271  cairo_move_to(cairo,x-2.5,0);
272  cairo_rel_line_to(cairo,0,(parsedLines.size()+4)*rowHeight);
273  cairo_stroke(cairo);
274  }
275  x+=colWidth+5;
276  y=0;
277  }
278  m_tableWidth=(col-1)*(colWidth+5);
279  for (size_t row=0; row<parsedLines.size()+5; ++row)
280  {
281  const CairoSave cs(cairo);
282  cairo_set_source_rgb(cairo,.5,.5,.5);
283  cairo_move_to(cairo,xoffs-2.5,row*rowHeight);
284  cairo_rel_line_to(cairo,m_tableWidth,0);
285  cairo_stroke(cairo);
286  }
287  return true;
288 }
289 
290 size_t CSVDialog::columnOver(double x) const
291 {
292  return size_t((x-xoffs)/(colWidth+5));
293 }
294 
295 size_t CSVDialog::rowOver(double y) const
296 {
297  return size_t(y/rowHeight);
298 }
299 
300 std::vector<std::vector<std::string>> CSVDialog::parseLines(size_t maxColumn)
301 {
302  vector<vector<string>> parsedLines;
303  if (spec.mergeDelimiters)
304  if (spec.separator==' ')
305  parsedLines=::parseLines(boost::char_separator<char>(), initialLines, maxColumn);
306  else
307  {
308  char separators[]={spec.separator,'\0'};
309  parsedLines=::parseLines
310  (boost::char_separator<char>(separators,""),initialLines, maxColumn);
311  }
312  else
313  parsedLines=::parseLines
314  (boost::escaped_list_separator<char>(spec.escape,spec.separator,spec.quote),
315  initialLines, maxColumn);
316 
317  // update numCols iff maxColumn unrestricted
318  if (maxColumn==numeric_limits<size_t>::max())
319  {
320  spec.numCols=0;
321  for (auto& i: parsedLines)
322  spec.numCols=std::max(spec.numCols, i.size());
323  }
324  return parsedLines;
325 }
326 
327 void CSVDialog::populateHeaders()
328 {
329  auto parsedLines=parseLines();
330  if (spec.headerRow>=parsedLines.size()) return;
331  auto& hr=parsedLines[spec.headerRow];
332  spec.dimensionNames={hr.begin(), min(hr.end(), hr.begin()+spec.maxColumn)};
333 }
334 
335 void CSVDialog::populateHeader(size_t col)
336 {
337  auto parsedLines=parseLines();
338  if (spec.headerRow>=parsedLines.size()) return;
339  auto& headers=parsedLines[spec.headerRow];
340  if (col<headers.size() && col<spec.maxColumn)
341  spec.dimensionNames[col]=headers[col];
342 }
343 
344 void CSVDialog::classifyColumns()
345 {
346  auto parsedLines=parseLines();
347  spec.dimensionCols.clear();
348  spec.dataCols.clear();
349  spec.dimensions.resize(min(spec.numCols,spec.maxColumn));
350  for (size_t col=0; col<spec.numCols; ++col)
351  {
352  bool entryFound=false, timeFound=true, numberFound=true;
353  for (size_t row=spec.nRowAxes(); row<parsedLines.size(); ++row)
354  if (col<parsedLines[row].size() && !parsedLines[row][col].empty())
355  {
356  entryFound=true;
357  if (numberFound && !isNumerical(parsedLines[row][col]))
358  numberFound=false;
359  static const AnyVal any(Dimension(Dimension::time,""));
360  if (timeFound)
361  try
362  {any(parsedLines[row][col]);}
363  catch (...)
364  {timeFound=false;}
365  }
366  if (entryFound && col<spec.maxColumn)
367  {
368  if (numberFound)
369  spec.dataCols.insert(col);
370  else
371  {
372  spec.dimensionCols.insert(col);
373  if (timeFound)
374  spec.dimensions[col].type=Dimension::time;
375  else
376  spec.dimensions[col].type=Dimension::string;
377  spec.dimensions[col].units.clear();
378  }
379  }
380  else if (col>=spec.nColAxes() && col<spec.maxColumn)
381  spec.dataCols.insert(col);
382  }
383 }
384 
385 std::vector<size_t> CSVDialog::correctedUniqueValues()
386 {
387  auto r=spec.uniqueValues();
388  // apply a correction by removing the values in the header rows
389  vector<set<size_t>> correction(r.size());
390  auto parsedLines=parseLines();
391  const hash<string> h;
392  for (size_t row=0; row<parsedLines.size() && row<spec.nRowAxes(); ++row)
393  for (size_t col=0; col<correction.size() && col<parsedLines[row].size(); ++col)
394  correction[col].insert(h(parsedLines[row][col]));
395  for (size_t i=0; i<r.size(); ++i)
396  if (r[i]>correction[i].size())
397  r[i]-=correction[i].size();
398  else
399  r[i]=1;
400  return r;
401 }
402 
void reportFromCSVFile(istream &input, ostream &output, const DataSpec &spec, uintmax_t fileSize)
creates a report CSV file from input, with errors sorted at beginning of file, with a column for error...
Definition: CSVParser.cc:1161
Definition: input.py:1
iterator emplace(const string &url)
Definition: CSVDialog.cc:77
STL namespace.
CroppedPango(cairo_t *cairo, double width)
Definition: CSVDialog.cc:156
bool isNumerical(const std::string &s)
Definition: CSVParser.cc:338
void remove(std::vector< T > &x, const V &v)
remove an element from a vector. V must be comparable to a T
Definition: str.h:89
CLASSDESC_ACCESS_EXPLICIT_INSTANTIATION(minsky::CSVDialog)
Creation and access to the minskyTCL_obj object, which has code to record whenever Minsky's state cha...
Definition: constMap.h:22
chrono::time_point< chrono::system_clock > timestamp
Definition: CSVDialog.cc:63
vector< vector< string > > parseLines(const Parser &parser, const vector< string > &lines, size_t maxColumn)
Definition: CSVDialog.cc:129
bool operator<(const CacheEntry &x) const
Definition: CSVDialog.cc:68
void stripByteOrderingMarker(std::istream &s)
checks if the input stream has the UTF-8 byte ordering marker, and removes it if present ...
Definition: str.h:147