FOSSology  4.4.0
Open Source License Compliance by Open Source Software
copyrightUtils.cc
Go to the documentation of this file.
1 /*
2  SPDX-FileCopyrightText: © 2014-2018,2022, Siemens AG
3  Author: Daniele Fognini, Johannes Najjar
4 
5  SPDX-License-Identifier: GPL-2.0-only
6 */
12 #include "copyrightUtils.hpp"
13 #include <boost/program_options.hpp>
14 
15 #include <iostream>
16 #include <sstream>
17 
18 using namespace std;
19 
25 int queryAgentId(PGconn* dbConn)
26 {
27  char* COMMIT_HASH = fo_sysconfig(AGENT_NAME, "COMMIT_HASH");
28  char* VERSION = fo_sysconfig(AGENT_NAME, "VERSION");
29  char* agentRevision;
30  if (!asprintf(&agentRevision, "%s.%s", VERSION, COMMIT_HASH))
31  {
32  exit(-1);
33  };
34 
35  int agentId = fo_GetAgentKey(dbConn,
36  AGENT_NAME, 0, agentRevision, AGENT_DESC);
37  free(agentRevision);
38 
39  if (agentId > 0)
40  {
41  return agentId;
42  }
43  else
44  {
45  exit(1);
46  }
47 }
48 
53 int writeARS(int agentId, int arsId, int uploadId, int success, const fo::DbManager& dbManager)
54 {
55  return fo_WriteARS(dbManager.getConnection(), arsId, uploadId, agentId, AGENT_ARS, NULL, success);
56 }
57 
62 void bail(int exitval)
63 {
64  fo_scheduler_disconnect(exitval);
65  exit(exitval);
66 }
67 
78 bool parseCliOptions(int argc, char** argv, CliOptions& dest,
79  std::vector<std::string>& fileNames, std::string& directoryToScan)
80 {
81  unsigned type = 0;
82 
83  boost::program_options::options_description desc(IDENTITY ": recognized options");
84  desc.add_options()
85  ("help,h", "shows help")
86  (
87  "type,T",
88  boost::program_options::value<unsigned>(&type)
89  ->default_value(ALL_TYPES),
90  "type of regex to try"
91  ) // TODO change and add help based on IDENTITY
92  (
93  "verbose,v", "increase verbosity"
94  )
95  (
96  "regex",
97  boost::program_options::value<vector<string> >(),
98  "user defined Regex to search: [{name=cli}@@][{matchingGroup=0}@@]{regex} e.g. 'linux@@1@@(linus) torvalds'"
99  )
100  (
101  "files",
102  boost::program_options::value< vector<string> >(),
103  "files to scan"
104  )
105  (
106  "json,J", "output JSON"
107  )
108  (
109  "ignoreFilesWithMimeType,I", "ignoreFilesWithMimeType"
110  )
111  (
112  "config,c", boost::program_options::value<string>(), "path to the sysconfigdir"
113  )
114  (
115  "scheduler_start", "specifies, that the command was called by the scheduler"
116  )
117  (
118  "userID", boost::program_options::value<int>(), "the id of the user that created the job (only in combination with --scheduler_start)"
119  )
120  (
121  "groupID", boost::program_options::value<int>(), "the id of the group of the user that created the job (only in combination with --scheduler_start)"
122  )
123  (
124  "jobId", boost::program_options::value<int>(), "the id of the job (only in combination with --scheduler_start)"
125  )
126  (
127  "directory,d", boost::program_options::value<string>(), "directory to scan (recursive)"
128  )
129  ;
130 
131  boost::program_options::positional_options_description p;
132  p.add("files", -1);
133 
134  boost::program_options::variables_map vm;
135 
136  try
137  {
138  boost::program_options::store(
139  boost::program_options::command_line_parser(argc, argv).options(desc).positional(p).run(), vm);
140 
141  type = vm["type"].as<unsigned>();
142 
143  if ((vm.count("help") > 0) || (type > ALL_TYPES))
144  {
145  cout << desc << endl;
146  exit(0);
147  }
148 
149  if (vm.count("files"))
150  {
151  fileNames = vm["files"].as<std::vector<string> >();
152  }
153 
154  unsigned long verbosity = vm.count("verbose");
155  bool json = vm.count("json") > 0 ? true : false;
156  bool ignoreFilesWithMimeType = vm.count("ignoreFilesWithMimeType") > 0 ? true : false;
157 
158  dest = CliOptions(verbosity, type, json, ignoreFilesWithMimeType);
159 
160  if (vm.count("regex"))
161  {
162  const std::vector<std::string>& userRegexesFmts = vm["regex"].as<vector<std::string> >();
163  for (auto it = userRegexesFmts.begin(); it != userRegexesFmts.end(); ++it) {
164  scanner* sc = makeRegexScanner(*it, "cli");
165  if (!(sc))
166  {
167  cout << "cannot parse regex format : " << *it << endl;
168  return false;
169  }
170  else
171  {
172  dest.addScanner(sc);
173  }
174  }
175  }
176 
177  if (vm.count("directory"))
178  {
179  if (vm.count("files"))
180  {
181  cout << "cannot pass files and directory at the same time" << endl;
182  cout << desc << endl;
183  fileNames.clear();
184  return false;
185  }
186  directoryToScan = vm["directory"].as<std::string>();
187  }
188 
189  return true;
190  }
191  catch (boost::bad_any_cast&) {
192  cout << "wrong parameter type" << endl;
193  cout << desc << endl;
194  return false;
195  }
196  catch (boost::program_options::error&)
197  {
198  cout << "wrong command line arguments" << endl;
199  cout << desc << endl;
200  return false;
201  }
202 }
203 
209 {
210  unsigned types = state.getCliOptions().getOptType();
211 #ifdef IDENTITY_COPYRIGHT
212  if (types & 1<<0)
213  //state.addMatcher(RegexMatcher(regCopyright::getType(), regCopyright::getRegex()));
214  state.addScanner(new hCopyrightScanner());
215 
216  if (types & 1<<1)
217  state.addScanner(new regexScanner("url", "copyright"));
218 
219  if (types & 1<<2)
220  state.addScanner(new regexScanner("email", "copyright", 1));
221 
222  if (types & 1<<3)
223  state.addScanner(new regexScanner("author", "copyright"));
224 #endif
225 
226 #ifdef IDENTITY_IPRA
227  if (types & 1<<0)
228  state.addScanner(new regexScanner("ipra", "ipra"));
229 #endif
230 
231 #ifdef IDENTITY_ECC
232  if (types & 1<<0)
233  state.addScanner(new regexScanner("ecc", "ecc"));
234 #endif
235 
236 #ifdef IDENTITY_KW
237  if (types & 1<<0)
238  state.addScanner(new regexScanner("keyword", "keyword"));
239 #endif
240 }
241 
248 scanner* makeRegexScanner(const std::string& regexDesc, const std::string& defaultType) {
249  #define RGX_FMT_SEPARATOR "@@"
250  auto fmtRegex = rx::regex(
251  "(?:([[:alpha:]]+)" RGX_FMT_SEPARATOR ")?(?:([[:digit:]]+)" RGX_FMT_SEPARATOR ")?(.*)",
252  rx::regex_constants::icase
253  );
254 
255  rx::match_results<std::string::const_iterator> match;
256  if (rx::regex_match(regexDesc.begin(), regexDesc.end(), match, fmtRegex))
257  {
258  std::string type(match.length(1) > 0 ? match.str(1) : defaultType.c_str());
259 
260  int regId = match.length(2) > 0 ? std::stoi(std::string(match.str(2))) : 0;
261 
262  if (match.length(3) == 0)
263  return 0; // nullptr
264 
265  std::istringstream stream;
266  stream.str(type + "=" + match.str(3));
267  return new regexScanner(type, stream, regId);
268  }
269  return 0; // nullptr
270 }
271 
280 {
281  CopyrightState state(std::move(cliOptions));
282  addDefaultScanners(state);
283 
284  return state;
285 }
286 
296 bool saveToDatabase(const string& s, const list<match>& matches, unsigned long pFileId, int agentId, const CopyrightDatabaseHandler& copyrightDatabaseHandler)
297 {
298  if (!copyrightDatabaseHandler.begin())
299  {
300  return false;
301  }
302 
303  size_t count = 0;
304  for (auto m = matches.begin(); m != matches.end(); ++m)
305  {
306 
307  DatabaseEntry entry;
308  entry.agent_fk = agentId;
309  entry.content = cleanMatch(s, *m);
310  entry.copy_endbyte = m->end;
311  entry.copy_startbyte = m->start;
312  entry.pfile_fk = pFileId;
313  entry.type = m->type;
314 
315  if (entry.content.length() != 0)
316  {
317  ++count;
318  if (!copyrightDatabaseHandler.insertInDatabase(entry))
319  {
320  copyrightDatabaseHandler.rollback();
321  return false;
322  };
323  }
324  }
325 
326  return copyrightDatabaseHandler.commit();
327 }
328 
337 void matchFileWithLicenses(const string& sContent, unsigned long pFileId, CopyrightState const& state, int agentId, CopyrightDatabaseHandler& databaseHandler)
338 {
339  list<match> l;
340  const list<unptr::shared_ptr<scanner>>& scanners = state.getScanners();
341  for (auto sc = scanners.begin(); sc != scanners.end(); ++sc)
342  {
343  (*sc)->ScanString(sContent, l);
344  }
345  saveToDatabase(sContent, l, pFileId, agentId, databaseHandler);
346 }
347 
361 void matchPFileWithLicenses(CopyrightState const& state, int agentId, unsigned long pFileId, CopyrightDatabaseHandler& databaseHandler)
362 {
363  char* pFile = databaseHandler.getPFileNameForFileId(pFileId);
364 
365  if (!pFile)
366  {
367  cout << "File not found " << pFileId << endl;
368  bail(8);
369  }
370 
371  char* fileName = NULL;
372  {
373 #pragma omp critical (repo_mk_path)
374  fileName = fo_RepMkPath("files", pFile);
375  }
376  if (fileName)
377  {
378  string s;
379  ReadFileToString(fileName, s);
380 
381  matchFileWithLicenses(s, pFileId, state, agentId, databaseHandler);
382 
383  free(fileName);
384  free(pFile);
385  }
386  else
387  {
388  cout << "PFile not found in repo " << pFileId << endl;
389  bail(7);
390  }
391 }
392 
405 bool processUploadId(const CopyrightState& state, int agentId, int uploadId, CopyrightDatabaseHandler& databaseHandler, bool ignoreFilesWithMimeType)
406 {
407  vector<unsigned long> fileIds = databaseHandler.queryFileIdsForUpload(agentId, uploadId, ignoreFilesWithMimeType);
408 
409 #pragma omp parallel num_threads(THREADS)
410  {
411  CopyrightDatabaseHandler threadLocalDatabaseHandler(databaseHandler.spawn());
412 
413  size_t pFileCount = fileIds.size();
414 #pragma omp for
415  for (size_t it = 0; it < pFileCount; ++it)
416  {
417  unsigned long pFileId = fileIds[it];
418 
419  if (pFileId == 0)
420  {
421  continue;
422  }
423 
424  matchPFileWithLicenses(state, agentId, pFileId, threadLocalDatabaseHandler);
425 
427  }
428  }
429 
430  return true;
431 }
432 
439 pair<string, list<match>> processSingleFile(const CopyrightState& state,
440  const string fileName)
441 {
442  const list<unptr::shared_ptr<scanner>>& scanners = state.getScanners();
443  list<match> matchList;
444 
445  // Read file into one string
446  string s;
447  if (!ReadFileToString(fileName, s))
448  {
449  // File error
450  s = "";
451  }
452  else
453  {
454  for (auto sc = scanners.begin(); sc != scanners.end(); ++sc)
455  {
456  (*sc)->ScanString(s, matchList);
457  }
458  }
459  return make_pair(s, matchList);
460 }
461 
469 void appendToJson(const std::string fileName,
470  const std::pair<string, list<match>> resultPair, bool &printComma)
471 {
472  Json::Value result;
473 #if JSONCPP_VERSION_HEXA < ((1 << 24) | (4 << 16))
474  // Use FastWriter for versions below 1.4.0
475  Json::FastWriter jsonWriter;
476 #else
477  // Since version 1.4.0, FastWriter is deprecated and replaced with
478  // StreamWriterBuilder
479  Json::StreamWriterBuilder jsonWriter;
480  jsonWriter["commentStyle"] = "None";
481  jsonWriter["indentation"] = "";
482 #endif
483 
484  if (resultPair.first.empty())
485  {
486  result["file"] = fileName;
487  result["results"] = "Unable to read file";
488  }
489  else
490  {
491  list<match> resultList = resultPair.second;
492  Json::Value results;
493  for (auto m : resultList)
494  {
495  Json::Value j;
496  j["start"] = m.start;
497  j["end"] = m.end;
498  j["type"] = m.type;
499  j["content"] = cleanMatch(resultPair.first, m);
500  results.append(j);
501  }
502  result["file"] = fileName;
503  result["results"] = results;
504  }
505  // Thread-Safety: output all matches JSON at once to STDOUT
506 #pragma omp critical (jsonPrinter)
507  {
508  if (printComma)
509  {
510  cout << "," << endl;
511  }
512  else
513  {
514  printComma = true;
515  }
516  string jsonString;
517 #if JSONCPP_VERSION_HEXA < ((1 << 24) | (4 << 16))
518  // For version below 1.4.0, every writer append `\n` at end.
519  // Find and replace it.
520  jsonString = jsonWriter.write(result);
521  jsonString.replace(jsonString.find("\n"), string("\n").length(), "");
522 #else
523  // For version >= 1.4.0, \n is not appended.
524  jsonString = Json::writeString(jsonWriter, result);
525 #endif
526  cout << " " << jsonString << flush;
527  }
528 }
529 
535 void printResultToStdout(const std::string fileName,
536  const std::pair<string, list<match>> resultPair)
537 {
538  if (resultPair.first.empty())
539  {
540  cout << fileName << " :: Unable to read file" << endl;
541  return;
542  }
543  stringstream ss;
544  ss << fileName << " ::" << endl;
545  // Output matches
546  list<match> resultList = resultPair.second;
547  for (auto m = resultList.begin(); m != resultList.end(); ++m)
548  {
549  ss << "\t[" << m->start << ':' << m->end << ':' << m->type << "] '"
550  << cleanMatch(resultPair.first, *m)
551  << "'" << endl;
552  }
553  // Thread-Safety: output all matches (collected in ss) at once to cout
554  cout << ss.str();
555 }
Store the options sent through the CLI.
unsigned int getOptType() const
Get the opt type set by CliOptions.
Manages database related requests for agent.
Definition: database.hpp:53
std::vector< unsigned long > queryFileIdsForUpload(int agentId, int uploadId, bool ignoreFilesWithMimeType)
Get the list of pfile ids on which the given agent has no findings for a given upload.
Definition: database.cc:306
bool insertInDatabase(DatabaseEntry &entry) const
Insert a finding in database.
Definition: database.cc:353
CopyrightDatabaseHandler spawn() const
Spawn/fork a new database handler and return it.
Definition: database.cc:41
Holds information about state of one agent.
const CliOptions & getCliOptions() const
Get the CliOptions set by user.
void addScanner(scanner *scanner)
Add scanner to state.
const std::list< unptr::shared_ptr< scanner > > & getScanners() const
Get available scanners.
Maps agent data to database schema.
Definition: database.hpp:25
int copy_startbyte
Definition: database.hpp:44
std::string content
Definition: database.hpp:31
std::string type
Type of statement found.
Definition: database.hpp:43
int copy_endbyte
Definition: database.hpp:45
bool commit() const
COMMIT a transaction block in DB.
bool begin() const
BEGIN a transaction block in DB.
char * getPFileNameForFileId(unsigned long pfileId) const
Get the file name of a given pfile id.
bool rollback() const
ROLLBACK a transaction block in DB.
DB wrapper for agents.
Implementation of scanner class for copyright.
Definition: copyscan.hpp:18
Provides a regex scanner using predefined regexs.
Definition: regscan.hpp:21
Abstract class to provide interface to scanners.
Definition: scanners.hpp:52
void matchPFileWithLicenses(CopyrightState const &state, int agentId, unsigned long pFileId, CopyrightDatabaseHandler &databaseHandler)
Get the file contents, scan for statements and save findings to database.
scanner * makeRegexScanner(const std::string &regexDesc, const std::string &defaultType)
Make a boost regex scanner object based on regex desc and type.
bool parseCliOptions(int argc, char **argv, CliOptions &dest, std::vector< std::string > &fileNames, std::string &directoryToScan)
Parse the options sent by CLI to CliOptions object.
pair< string, list< match > > processSingleFile(const CopyrightState &state, const string fileName)
int queryAgentId(PGconn *dbConn)
Get agent id, exit if agent id is incorrect.
static void addDefaultScanners(CopyrightState &state)
Add default scanners to the agent state.
void matchFileWithLicenses(const string &sContent, unsigned long pFileId, CopyrightState const &state, int agentId, CopyrightDatabaseHandler &databaseHandler)
Scan a given file with all available scanners and save findings to database.
int writeARS(int agentId, int arsId, int uploadId, int success, const fo::DbManager &dbManager)
Call C function fo_WriteARS() and translate the arguments.
CopyrightState getState(CliOptions &&cliOptions)
Create a new state for the current agent based on CliOptions.
bool processUploadId(const CopyrightState &state, int agentId, int uploadId, CopyrightDatabaseHandler &databaseHandler, bool ignoreFilesWithMimeType)
Process a given upload id, scan from statements and add to database.
void appendToJson(const std::string fileName, const std::pair< string, list< match >> resultPair, bool &printComma)
void printResultToStdout(const std::string fileName, const std::pair< string, list< match >> resultPair)
void bail(int exitval)
Disconnect with scheduler returning an error code and exit.
bool saveToDatabase(const string &s, const list< match > &matches, unsigned long pFileId, int agentId, const CopyrightDatabaseHandler &copyrightDatabaseHandler)
Save findings to the database if agent was called by scheduler.
int s
The socket that the CLI will use to communicate.
Definition: fo_cli.c:37
FUNCTION int fo_WriteARS(PGconn *pgConn, int ars_pk, int upload_pk, int agent_pk, const char *tableName, const char *ars_status, int ars_success)
Write ars record.
Definition: libfossagent.c:214
FUNCTION int fo_GetAgentKey(PGconn *pgConn, const char *agent_name, long Upload_pk, const char *rev, const char *agent_desc)
Get the latest enabled agent key (agent_pk) from the database.
Definition: libfossagent.c:158
char * fo_RepMkPath(const char *Type, char *Filename)
Given a filename, construct the full path to the file.
Definition: libfossrepo.c:352
void fo_scheduler_disconnect(int retcode)
Disconnect the scheduler connection.
void fo_scheduler_heart(int i)
This function must be called by agents to let the scheduler know they are alive and how many items th...
char * fo_sysconfig(const char *sectionname, const char *variablename)
gets a system configuration variable from the configuration data.
fo_dbManager * dbManager
fo_dbManager object
Definition: process.c:16
bool ReadFileToString(const string &fileName, string &out)
Utility: read file to string from scanners.h.
Definition: scanners.cc:21
Store the results of a regex match.
Definition: scanners.hpp:28