Minsky: 3.17.0
CSVDialog.cc
Go to the documentation of this file.
1 /*
2  @copyright Steve Keen 2018
3  @author Russell Standish
4  This file is part of Minsky.
5 
6  Minsky is free software: you can redistribute it and/or modify it
7  under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10 
11  Minsky is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with Minsky. If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #include "cairoItems.h"
21 #include "CSVDialog.h"
22 #include "group.h"
23 #include "selection.h"
24 #include "lasso.h"
25 #include <pango.h>
26 
27 #include "CSVDialog.rcd"
28 #include "minsky_epilogue.h"
29 
30 #include "zStream.h"
31 #include "dimension.h"
32 #include <cstdlib>
33 #include <chrono>
34 #include <iostream>
35 #include <string>
36 #include <stdexcept>
37 #include <sstream>
38 #include <regex>
39 
40 using namespace std;
41 using namespace minsky;
42 using namespace civita;
43 using ecolab::Pango;
44 using ecolab::cairo::CairoSave;
45 
46 #include <boost/filesystem.hpp>
47 using boost::filesystem::file_size;
48 
// Namespace-scope definition of the static data member CSVDialog::numInitialLines.
// No initialiser is given here, so the value presumably comes from the
// declaration in CSVDialog.h (confirm).
49 const unsigned CSVDialog::numInitialLines;
50 
51 void CSVDialog::reportFromFile(const std::string& input, const std::string& output) const
52 {
53  ifstream is(input);
55  ofstream of(output);
56  reportFromCSVFile(is,of,spec,file_size(input));
57 }
58 
59 namespace
60 {
61  // manage temporary files
62  struct CacheEntry
63  {
64  chrono::time_point<chrono::system_clock> timestamp;
65  string url, filename;
66  CacheEntry(const string& url): timestamp(chrono::system_clock::now()), url(url),
67  filename(boost::filesystem::unique_path().string()) {}
69  bool operator<(const CacheEntry& x) const {return url<x.url;}
70  };
71 
72  // note: this cache will leak disk storage if Minsky is killed, not shut down cleanly
73  struct Cache: private set<CacheEntry>
74  {
75  using set<CacheEntry>::find;
76  using set<CacheEntry>::end;
77  using set<CacheEntry>::erase;
78  iterator emplace(const string& url)
79  {
80  if (size()>=10)
81  {
82  // find oldest element and erase
83  auto entryToErase=begin();
84  auto ts=entryToErase->timestamp;
85  for (auto i=begin(); i!=end(); ++i)
86  if (i->timestamp<ts)
87  {
88  ts=i->timestamp;
89  entryToErase=i;
90  }
91  erase(entryToErase);
92  }
93  return set<CacheEntry>::emplace(url).first;
94  }
95  };
96 }
97 
98 void CSVDialog::loadFile()
99 {
100  loadFileFromName(url);
101 }
102 
103 void CSVDialog::guessSpecAndLoadFile()
104 {
105  spec=DataSpec();
106  spec.guessFromFile(url);
107  loadFileFromName(url);
108  populateHeaders();
109  classifyColumns();
110 }
111 
112 void CSVDialog::loadFileFromName(const std::string& fname)
113 {
114  ifstream is(fname);
116  initialLines.clear();
117  for (size_t i=0; i<numInitialLines && is; ++i)
118  {
119  initialLines.emplace_back();
120  getline(is, initialLines.back());
121  // chomp any final '\r' character (DOS files)
122  if (!initialLines.back().empty() && initialLines.back().back()=='\r')
123  initialLines.back().erase(initialLines.back().end()-1);
124  }
125  // Ensure dimensions.size() is the same as nColAxes() upon first load of a CSV file. For ticket 974.
126  if (spec.dimensions.size()<spec.nColAxes()) spec.setDataArea(spec.nRowAxes(),spec.nColAxes());
127 }
128 
129 template <class Parser>
130 vector<vector<string>> parseLines(const Parser& parser, const vector<string>& lines, size_t maxColumn)
131 {
132  vector<vector<string>> r;
133  for (const auto& line: lines)
134  {
135  r.emplace_back();
136  try
137  {
138  const boost::tokenizer<Parser> tok(line.begin(), line.end(), parser);
139  auto t=tok.begin();
140  for (size_t i=0; i<maxColumn && t!=tok.end(); ++i, ++t)
141  r.back().push_back(*t);
142  }
143  catch (...) // if not parseable, place entire line in first cell
144  {
145  r.back().push_back(line);
146  }
147  }
148  return r;
149 }
150 
151 namespace
152 {
153  struct CroppedPango: public Pango
154  {
155  cairo_t* cairo;
156  double w, x=0, y=0;
157  CroppedPango(cairo_t* cairo, double width): Pango(cairo), cairo(cairo), w(width) {}
158  void setxy(double xx, double yy) {x=xx; y=yy;}
159  void show() {
160  const CairoSave cs(cairo);
161  cairo_rectangle(cairo,x,y,w,height());
162  cairo_clip(cairo);
163  cairo_move_to(cairo,x,y);
164  Pango::show();
165  }
166  };
167 }
168 
169 bool CSVDialog::redraw(int, int, int, int)
170 {
171  cairo_t* cairo=surface->cairo();
172  rowHeight=15;
173  vector<vector<string>> parsedLines=parseLines();
174 
175  // LHS row labels
176  {
177  Pango pango(cairo);
178  pango.setText("Dimension");
179  cairo_move_to(cairo,xoffs-pango.width()-5,0);
180  pango.show();
181  pango.setText("Type");
182  cairo_move_to(cairo,xoffs-pango.width()-5,rowHeight);
183  pango.show();
184  pango.setText("Format");
185  cairo_move_to(cairo,xoffs-pango.width()-5,2*rowHeight);
186  pango.show();
187  if (flashNameRow)
188  pango.setMarkup("<b>Name</b>");
189  else
190  pango.setText("Name");
191  cairo_move_to(cairo,xoffs-pango.width()-5,3*rowHeight);
192  pango.show();
193  pango.setText("Header");
194  cairo_move_to(cairo,xoffs-pango.width()-5,(4+spec.headerRow)*rowHeight);
195  pango.show();
196 
197  }
198 
199  CroppedPango pango(cairo, colWidth);
200  pango.setFontSize(0.8*rowHeight);
201 
202  set<size_t> done;
203  double x=xoffs, y=0;
204  size_t col=0;
205  for (; done.size()<parsedLines.size(); ++col)
206  {
207  if (col<spec.nColAxes())
208  {// dimension check boxes
209  const CairoSave cs(cairo);
210  const double cbsz=5;
211  cairo_set_line_width(cairo,1);
212  cairo_translate(cairo,x+0.5*colWidth,y+0.5*rowHeight);
213  cairo_rectangle(cairo,-cbsz,-cbsz,2*cbsz,2*cbsz);
214  if (spec.dimensionCols.contains(col))
215  {
216  cairo_move_to(cairo,-cbsz,-cbsz);
217  cairo_line_to(cairo,cbsz,cbsz);
218  cairo_move_to(cairo,cbsz,-cbsz);
219  cairo_line_to(cairo,-cbsz,cbsz);
220  }
221  cairo_stroke(cairo);
222  }
223  y+=rowHeight;
224  // type
225  if (spec.dimensionCols.contains(col) && col<spec.dimensions.size() && col<spec.nColAxes())
226  {
227  pango.setText(classdesc::enumKey<Dimension::Type>(spec.dimensions[col].type));
228  pango.setxy(x,y);
229  pango.show();
230  }
231  y+=rowHeight;
232  if (spec.dimensionCols.contains(col) && col<spec.dimensions.size() && col<spec.nColAxes())
233  {
234  pango.setText(spec.dimensions[col].units);
235  pango.setxy(x,y);
236  pango.show();
237  }
238  y+=rowHeight;
239  if (spec.dimensionCols.contains(col) && col<spec.dimensionNames.size() && col<spec.nColAxes())
240  {
241  pango.setText(spec.dimensionNames[col]);
242  pango.setxy(x,y);
243  pango.show();
244  }
245  y+=rowHeight;
246  for (size_t row=0; row<parsedLines.size(); ++row)
247  {
248  auto& line=parsedLines[row];
249  if (col<line.size())
250  {
251  const CairoSave cs(cairo);
252  pango.setText(line[col]);
253  pango.setxy(x, y);
254  if (row==spec.headerRow)
255  if (col<spec.nColAxes())
256  cairo_set_source_rgb(surface->cairo(),0,0.7,0);
257  else
258  cairo_set_source_rgb(surface->cairo(),0,0,1);
259  else if (row<spec.nRowAxes() || (col<spec.nColAxes() && !spec.dimensionCols.contains(col)))
260  cairo_set_source_rgb(surface->cairo(),1,0,0);
261  else if (col<spec.nColAxes())
262  cairo_set_source_rgb(surface->cairo(),0,0,1);
263  pango.show();
264  }
265  else
266  done.insert(row);
267  y+=rowHeight;
268  }
269  {
270  const CairoSave cs(cairo);
271  cairo_set_source_rgb(cairo,.5,.5,.5);
272  cairo_move_to(cairo,x-2.5,0);
273  cairo_rel_line_to(cairo,0,(parsedLines.size()+4)*rowHeight);
274  cairo_stroke(cairo);
275  }
276  x+=colWidth+5;
277  y=0;
278  }
279  m_tableWidth=(col-1)*(colWidth+5);
280  for (size_t row=0; row<parsedLines.size()+5; ++row)
281  {
282  const CairoSave cs(cairo);
283  cairo_set_source_rgb(cairo,.5,.5,.5);
284  cairo_move_to(cairo,xoffs-2.5,row*rowHeight);
285  cairo_rel_line_to(cairo,m_tableWidth,0);
286  cairo_stroke(cairo);
287  }
288  return true;
289 }
290 
291 size_t CSVDialog::columnOver(double x) const
292 {
293  return size_t((x-xoffs)/(colWidth+5));
294 }
295 
296 size_t CSVDialog::rowOver(double y) const
297 {
298  return size_t(y/rowHeight);
299 }
300 
301 std::vector<std::vector<std::string>> CSVDialog::parseLines(size_t maxColumn)
302 {
303  vector<vector<string>> parsedLines;
304  if (spec.mergeDelimiters)
305  if (spec.separator==' ')
306  parsedLines=::parseLines(boost::char_separator<char>(), initialLines, maxColumn);
307  else
308  {
309  char separators[]={spec.separator,'\0'};
310  parsedLines=::parseLines
311  (boost::char_separator<char>(separators,""),initialLines, maxColumn);
312  }
313  else
314  parsedLines=::parseLines
315  (boost::escaped_list_separator<char>(spec.escape,spec.separator,spec.quote),
316  initialLines, maxColumn);
317 
318  // update numCols iff maxColumn unrestricted
319  if (maxColumn==numeric_limits<size_t>::max())
320  {
321  spec.numCols=0;
322  for (auto& i: parsedLines)
323  spec.numCols=std::max(spec.numCols, i.size());
324  }
325  return parsedLines;
326 }
327 
328 void CSVDialog::populateHeaders()
329 {
330  auto parsedLines=parseLines();
331  if (spec.headerRow>=parsedLines.size()) return;
332  auto& hr=parsedLines[spec.headerRow];
333  spec.dimensionNames={hr.begin(), min(hr.end(), hr.begin()+spec.maxColumn)};
334 }
335 
336 void CSVDialog::populateHeader(size_t col)
337 {
338  auto parsedLines=parseLines();
339  if (spec.headerRow>=parsedLines.size()) return;
340  auto& headers=parsedLines[spec.headerRow];
341  if (col<headers.size() && col<spec.maxColumn)
342  spec.dimensionNames[col]=headers[col];
343 }
344 
345 void CSVDialog::classifyColumns()
346 {
347  auto parsedLines=parseLines();
348  spec.dimensionCols.clear();
349  spec.dataCols.clear();
350  spec.dimensions.resize(min(spec.numCols,spec.maxColumn));
351  for (size_t col=0; col<spec.numCols; ++col)
352  {
353  bool entryFound=false, timeFound=true, numberFound=true;
354  for (size_t row=spec.nRowAxes(); row<parsedLines.size(); ++row)
355  if (col<parsedLines[row].size() && !parsedLines[row][col].empty())
356  {
357  entryFound=true;
358  if (numberFound && !isNumerical(parsedLines[row][col]))
359  numberFound=false;
360  static const AnyVal any(Dimension(Dimension::time,""));
361  if (timeFound)
362  try
363  {any(parsedLines[row][col]);}
364  catch (...)
365  {timeFound=false;}
366  }
367  if (entryFound && col<spec.maxColumn)
368  {
369  if (numberFound)
370  spec.dataCols.insert(col);
371  else
372  {
373  spec.dimensionCols.insert(col);
374  if (timeFound)
375  spec.dimensions[col].type=Dimension::time;
376  else
377  spec.dimensions[col].type=Dimension::string;
378  spec.dimensions[col].units.clear();
379  }
380  }
381  else if (col>=spec.nColAxes() && col<spec.maxColumn)
382  spec.dataCols.insert(col);
383  }
384 }
385 
386 std::vector<size_t> CSVDialog::correctedUniqueValues()
387 {
388  auto r=spec.uniqueValues();
389  // apply a correction by removing the values in the header rows
390  vector<set<size_t>> correction(r.size());
391  auto parsedLines=parseLines();
392  const hash<string> h;
393  for (size_t row=0; row<parsedLines.size() && row<spec.nRowAxes(); ++row)
394  for (size_t col=0; col<correction.size() && col<parsedLines[row].size(); ++col)
395  correction[col].insert(h(parsedLines[row][col]));
396  for (size_t i=0; i<r.size(); ++i)
397  if (r[i]>correction[i].size())
398  r[i]-=correction[i].size();
399  else
400  r[i]=1;
401  return r;
402 }
403 
void reportFromCSVFile(istream &input, ostream &output, const DataSpec &spec, uintmax_t fileSize)
creates a report CSV file from input, with errors sorted at beginning of file, with a column for error...
Definition: CSVParser.cc:1161
Definition: input.py:1
iterator emplace(const string &url)
Definition: CSVDialog.cc:78
STL namespace.
CroppedPango(cairo_t *cairo, double width)
Definition: CSVDialog.cc:157
bool isNumerical(const std::string &s)
Definition: CSVParser.cc:338
void remove(std::vector< T > &x, const V &v)
remove an element from a vector. V must be comparable to a T
Definition: str.h:89
CLASSDESC_ACCESS_EXPLICIT_INSTANTIATION(minsky::CSVDialog)
chrono::time_point< chrono::system_clock > timestamp
Definition: CSVDialog.cc:64
vector< vector< string > > parseLines(const Parser &parser, const vector< string > &lines, size_t maxColumn)
Definition: CSVDialog.cc:130
bool operator<(const CacheEntry &x) const
Definition: CSVDialog.cc:69
void stripByteOrderingMarker(std::istream &s)
checks if the input stream has the UTF-8 byte ordering marker, and removes it if present ...
Definition: str.h:147