FOSSology  4.4.0
Open Source License Compliance by Open Source Software
regexscan.c
Go to the documentation of this file.
1 /*
2  regexscan: Scan file(s) for regular expression(s)
3 
4  SPDX-FileCopyrightText: © 2013 Hewlett-Packard Development Company, L.P.
5 
6  SPDX-License-Identifier: GPL-2.0-only
7 */
43 #include <stdlib.h>
44 #include <stdio.h>
45 #include <unistd.h>
46 #include <string.h>
47 #include <ctype.h>
48 #include <signal.h>
49 #include <libgen.h>
50 
51 #include <regex.h>
52 #include <stdbool.h>
53 
54 #include "libfossology.h"
55 
/* Presumably a maximum command-line length; no function in this listing
 * references it - TODO confirm it is still needed. */
56 #define MAXCMD 4096
/* File-scope SQL text buffer. The functions in this listing build their
 * queries in local sqlSelect buffers and do not reference this one. */
57 char SQL[256];
58 
/* Size of the agent_rev version-string buffer declared in main(). */
59 #define myBUFSIZ 2048
60 
61 /*
62 #ifdef COMMIT_HASH
63 char BuildVersion[]="Build version: " COMMIT_HASH ".\n";
64 #endif
65 */
66 
/* Database connection shared by every function in this file: main() hands its
 * address to fo_scheduler_connect(), the query helpers pass it to PQexec(),
 * and main() releases it with PQfinish(). */
67 PGconn *pgConn = NULL;
68 
/**
 * \brief Scan an already-open file, line by line, for a compiled regex.
 *
 * Each match is reported on stdout together with its line number and its
 * 1-based position inside that line. If the file contains no match at all,
 * a single "not found" line is printed instead. Reports are written straight
 * to stdout, so the length of \p fileName and \p regexStr is not limited by
 * any intermediate buffer.
 *
 * Lines longer than the internal read buffer are read in several chunks; the
 * chunks are attributed to the same line number and the reported position is
 * relative to the start of the line. A match that straddles two chunks is not
 * detected.
 *
 * \param regex       regular expression already compiled by the caller
 * \param regexStr    source text of the regex, only used in the "not found" report
 * \param scanFilePtr stream to scan; it is closed before this function returns,
 *                    on every path
 * \param fileName    name printed in front of every report line
 * \return 0 when the whole file was read (with or without matches),
 *         3 when regexec() failed with something other than REG_NOMATCH
 */
int regexScan(regex_t *regex, char *regexStr, FILE *scanFilePtr, char *fileName)
{
  char errBuff[512];      /* regerror() text */
  char textBuff[2000];    /* chunk buffer for regex match processing */
  regmatch_t rm[1];
  bool match = false;     /* at least one match was reported */
  bool continued = false; /* current chunk continues the previous line */
  int lineCount = 0;
  long lineOffset = 0;    /* offset of the current chunk inside its line */

  /* Now scan the file for regex line by line */
  while (fgets(textBuff, sizeof(textBuff), scanFilePtr) != NULL)
  {
    size_t chunkLen = strlen(textBuff);
    int retCode;

    if (!continued)
    {
      lineCount++;        /* Another line read */
      lineOffset = 0;
    }

    retCode = regexec(regex, textBuff, 1, rm, 0); /* nmatch = 1, matchptr = rm */
    if (retCode == 0)
    {
      /* The trailing blank line keeps the spacing between report lines. */
      printf("%s: regex found at line %d at position %ld. -> %.*s \n\n",
             fileName, lineCount, lineOffset + (long)rm[0].rm_so + 1,
             (int)(rm[0].rm_eo - rm[0].rm_so), textBuff + rm[0].rm_so);
      match = true;
    }
    else if (retCode != REG_NOMATCH)
    {
      regerror(retCode, regex, errBuff, sizeof(errBuff));
      fprintf(stderr, "Out of memory? - regex match failure: %s\n", errBuff);
      fclose(scanFilePtr);
      return 3;
    }

    /* Only a completely filled buffer without a newline means the line goes
       on; a short chunk (EOF, or text cut by an embedded NUL) ends the line. */
    continued = (chunkLen == sizeof(textBuff) - 1 && textBuff[chunkLen - 1] != '\n');
    lineOffset += (long)chunkLen;
  }

  /* Report if no matches found */
  if (!match)
  {
    printf("%s: %s not found\n\n", fileName, regexStr);
  }

  /* clean up and exit; the regex belongs to the caller and is not freed here */
  fclose(scanFilePtr);
  return 0;
}
132 
141 int pfileNumToNames(char *pfileNum, char *pfileRepoName, char *pfileRealName)
142 {
143  char sqlSelect[256];
144  PGresult *result;
145 
146  /* Attempt to locate the appropriate pFile_pk record */
147  sprintf(sqlSelect, "SELECT pfile_sha1, pfile_md5, pfile_size, ufile_name FROM"
148  " pfile, uploadtree WHERE pfile_fk = pfile_pk and pfile_pk = '%s'", pfileNum);
149  result = PQexec(pgConn, sqlSelect);
150 
151  if (fo_checkPQresult(pgConn, result, sqlSelect, __FILE__, __LINE__)) return 0;
152 
153  /* confirm a sane result set */
154  if (PQntuples(result) == 0)
155  {
156  PQclear(result);
157 
158  /* Not found */
159  fprintf(stderr, "Database does not contain pfile_pk: %s\n", pfileNum);
160  return 1;
161  }
162  else if (PQntuples(result) != 1)
163  {
164  PQclear(result);
165 
166  /* Not found */
167  fprintf(stderr, "Database contains multiple pfile_pk: %s\n", pfileNum);
168  return 2;
169  }
170  /* We've managed to locate the one and only pfile_pk record. Build the filePath string */
171  /* Concatenate first row fields 0, 1 and 2 */
172  sprintf(pfileRepoName, "%s.%s.%s", PQgetvalue(result, 0, 0), PQgetvalue(result, 0, 1), PQgetvalue(result, 0, 2));
173  /* and extract the actual filename from field 4 - uploadtree.ufile_name */
174  sprintf(pfileRealName, "%s", PQgetvalue(result, 0, 3));
175 
176 // fprintf(stderr, "fileName is:%s\n", pFileName);
177  PQclear(result);
178  return 0;
179 }
180 
181 
190 int regexScanUpload(char *uploadNum, char *regexStr)
191 {
192  char sqlSelect[256];
193  PGresult *result, *pfileResult;
194 
195  int fileCount, i, retCode;
196 
197  char fileRealName[1000];
198  char fileRepoName[1000];
199 
200  FILE *scanFilePtr;
201 
202  regex_t regex;
203 
204  /* Ensure uploadNum is "valid" then obtain a list of pfile entries and scan them */
205  sprintf(sqlSelect, "SELECT upload_pk, upload_mode, upload_filename from upload where upload_pk = '%s'", uploadNum);
206  result = PQexec(pgConn, sqlSelect);
207 
208  if (fo_checkPQresult(pgConn, result, sqlSelect, __FILE__, __LINE__)) return 0;
209 
210  /* confirm a sane result set */
211  if (PQntuples(result) == 0)
212  {
213  fprintf(stderr, "No uploads appear to be available here!\n");
214  PQclear(result);
215  return 0; /* nothing found to scan */
216  }
217 
218  /* Next ensure that uploadNum was successfully uploaded */
219  /* We'll only look at upload_pk entries that have successfully run ununpack (64) and adj2nest (32) */
220  if ((atoi(PQgetvalue(result, 0, 1)) & 96) != 96)
221  {
222  fprintf(stderr, "Upload %s was not successfully processed after upload!\n", uploadNum);
223  PQclear(result);
224  return 0; /* nothing found to scan */
225  }
226 
227  /* Now get our list of required pfile entries for this upload */
228  sprintf(sqlSelect, "SELECT uploadtree.pfile_fk, ufile_name from uploadtree, upload"
229  " where upload_fk = upload_pk and uploadtree.pfile_fk <> 0 and ufile_mode = 32768 and upload_pk = '%s'", uploadNum);
230  result = PQexec(pgConn, sqlSelect);
231 
232  if (fo_checkPQresult(pgConn, result, sqlSelect, __FILE__, __LINE__)) return 0;
233 
234  fileCount = PQntuples(result);
235 // fprintf(stderr, "Located %d files to process.\n", fileCount);
236 
237  /* Compile the regex for improved performance */
238  retCode = regcomp(&regex, regexStr, REG_ICASE+REG_EXTENDED);
239  if (retCode)
240  {
241  fprintf(stderr, "regex %s failed to compile\n", regexStr);
242  return 1;
243  }
244 
245  /* Scan the files we've found for this upload */
246  for (i=0; i<fileCount; i++)
247  {
248  /* Attempt to locate the appropriate pFile_pk record */
249  sprintf(sqlSelect, "SELECT pfile_sha1, pfile_md5, pfile_size, ufile_name"
250  " FROM pfile, uploadtree WHERE pfile_fk = pfile_pk and pfile_pk = '%s'", PQgetvalue(result, i, 0));
251  pfileResult = PQexec(pgConn, sqlSelect);
252 
253  if (fo_checkPQresult(pgConn, pfileResult, sqlSelect, __FILE__, __LINE__)) return 0;
254 
255  /* confirm a sane result set */
256  if (PQntuples(pfileResult) == 1)
257  {
258  /* For each pfile value grind through the regex scan process */
259 
260  /* Locate and construct the appropriate full name from pfile table based upon pfile_pk value */
261  if (pfileNumToNames(PQgetvalue(result, i, 0), fileRepoName, fileRealName) != 0)
262  {
263  fprintf(stderr, "ERROR: Unable to locate pfile_pk '%s'\n", PQgetvalue(result, i, 0));
264  return 0;
265  }
266 
267  /* Use fo_RepFread() for access. It uses fo_RepMkPath() to map name to full path. */
268  scanFilePtr = fo_RepFread("files", fileRepoName);
269  if (!scanFilePtr)
270  {
271  fprintf(stderr, "ERROR: Unable to open '%s/%s'\n", "files", fileRepoName);
272  return 0;
273  }
274 
275  /* Call scan function. Note that we'll need to "Humanize" the fileName at some point. */
276  regexScan(&regex, regexStr, scanFilePtr, fileRealName);
277  }
278  else
279  {
280  fprintf(stderr, "WARNING: File: %s - Located %d instances of pfile_pk %s ! Size = %s bytes!\n",
281  PQgetvalue(result, i, 1), PQntuples(pfileResult), PQgetvalue(result, i, 0), PQgetvalue(pfileResult, i, 2));
282  }
283  }
284  /* return the number of scanned files */
285  return i;
286 }
287 
288 
/**
 * \brief Print the command-line help for this agent on stdout.
 * \param Name program name shown in the first line (normally argv[0])
 */
void Usage (char *Name)
{
  /* One entry per help line, printed in this order after the synopsis. */
  static const char *const helpLines[] = {
    " -i :: initialize the database, then exit.\n",
    " -c SYSCONFDIR :: FOSSology configuration directory.\n",
    " -h :: show available command line options.\n",
    " -v :: increase agent logging verbosity.\n",
    " -r :: regex expression to load from command line.\n",
    " filename :: filename to process with regex.\n"
  };
  size_t lineIdx;

  printf("Usage: %s [options] [id [id ...]]\n", Name);
  for (lineIdx = 0; lineIdx < sizeof(helpLines) / sizeof(helpLines[0]); lineIdx++)
  {
    fputs(helpLines[lineIdx], stdout);
  }
} /* Usage() */
304 
305 /*********************************************************/
306 int main (int argc, char *argv[])
307 {
308  int nonoptargs;
309  int c, retCode;
310 
311  regex_t regex;
312 
313  char regexStr[1024]; /* string storage for the regex expression */
314  bool regexSet = false;
315 
316  char fileName[1000];
317  FILE *scanFilePtr;
318 
319  char uploadNum[10];
320 
321  int scannedCount = 0;
322 
323  int user_pk;
324  long UploadPK=-1;
325 
326  char *COMMIT_HASH;
327  char *VERSION;
328  char agent_rev[myBUFSIZ];
329 
330  /* connect to scheduler. Noop if not run from scheduler. */
331  fo_scheduler_connect(&argc, argv, &pgConn);
332 
333 /*
334  Version reporting.
335 */
336  COMMIT_HASH = fo_sysconfig("regexscan", "COMMIT_HASH");
337  VERSION = fo_sysconfig("regexscan", "VERSION");
338  sprintf(agent_rev, "%s.%s", VERSION, COMMIT_HASH);
339 #ifdef REGEX_DEBUG
340  fprintf(stdout, "regexscan reports version info as '%s.%s'.\n", VERSION, COMMIT_HASH);
341 #endif
342 
343  /* Process command-line */
344  while((c = getopt(argc,argv,"chir:v")) != -1)
345  {
346  switch(c)
347  {
348  case 'c':
349  break; /* handled by fo_scheduler_connect() */
350  case 'i':
351  PQfinish(pgConn);
352  return(0);
353  case 'r':
354  sprintf(regexStr, "%s", optarg);
355  regexSet = true;
356  break;
357  case 'v':
358  agent_verbose++;
359  break;
360  case 'h':
361  default:
362  Usage(argv[0]);
363  fflush(stdout);
364  PQfinish(pgConn);
365  exit(-1);
366  }
367  }
368 
369  /* Sanity check for regex value required here. */
370  if (!regexSet)
371  {
372  fprintf (stderr, "No regex value has been requested!\n");
373  PQfinish(pgConn);
375  return 1;
376  }
377 
378  /* process filename after switches. How many non-option arguments are there ? */
379  nonoptargs = argc - optind; /* total argument count minus the option count */
380 
381  if (nonoptargs == 0)
382  {
383  /* Assume it was a scheduler call */
384  user_pk = fo_scheduler_userID();
385 
386  while(fo_scheduler_next())
387  {
388  UploadPK = atol(fo_scheduler_current());
389 
390  printf("UploadPK is: %ld\n", UploadPK);
391  sprintf(uploadNum, "%ld", UploadPK);
392  scannedCount = regexScanUpload(uploadNum, regexStr);
393  if (scannedCount == 0)
394  {
395  fprintf(stderr, "Failed to successfully scan: upload - %s!\n", uploadNum);
396  }
397  }
398  }
399  else
400  {
401  /* File access initialization - For Stage 3 use first arg as fileName */
402  sprintf(fileName, "%s", argv[optind]); /* Grab first non-switch argument as filename */
403 
404  scanFilePtr = fopen(fileName, "r");
405  if (!scanFilePtr)
406  {
407  fprintf(stderr, "ERROR: Unable to open '%s'\n", fileName);
408  PQfinish(pgConn);
410  }
411 
412  /* Compile the regex for improved performance */
413  retCode = regcomp(&regex, regexStr, REG_ICASE+REG_EXTENDED);
414  if (retCode)
415  {
416  fprintf(stderr, "regex %s failed to compile\n", regexStr);
417  PQfinish(pgConn);
419  }
420 
421  /* Now call the function that scans a file for a regex */
422  retCode = regexScan(&regex, (char *)regexStr, scanFilePtr, (char *)fileName);
423 // retCode = regexScan(uploadNum, regexStr);
424  if (retCode != 0)
425  {
426  fprintf(stderr, "Failed to successfully scan: %s!\n", fileName);
427  }
428 
429  }
430 
431  PQfinish(pgConn);
433 
434  return 0;
435 } /* main() */
436 
int fo_checkPQresult(PGconn *pgConn, PGresult *result, char *sql, char *FileID, int LineNumb)
Check the result status of a postgres SELECT.
Definition: libfossdb.c:170
The main FOSSology C library.
FILE * fo_RepFread(char *Type, char *Filename)
Perform an fopen for reading only.
Definition: libfossrepo.c:613
void fo_scheduler_disconnect(int retcode)
Disconnect the scheduler connection.
char * fo_sysconfig(const char *sectionname, const char *variablename)
gets a system configuration variable from the configuration data.
int agent_verbose
Common verbose flags for the agents, this is used so that the scheduler can change the verbose level ...
int fo_scheduler_userID()
Gets the id of the user that created the job that the agent is running.
char * fo_scheduler_current()
Get the last read string from the scheduler.
char * fo_scheduler_next()
Get the next data to process from the scheduler.
void fo_scheduler_connect(int *argc, char **argv, PGconn **db_conn)
Establish a connection between an agent and the scheduler.
void Usage(char *Name)
Usage description for this regexscan agent.
Definition: regexscan.c:294
int regexScanUpload(char *uploadNum, char *regexStr)
Scan an Upload for a regex - regular expression. Gets a list of files in an upload and calls regexScan() on each of them.
Definition: regexscan.c:190
int regexScan(regex_t *regex, char *regexStr, FILE *scanFilePtr, char *fileName)
Scan a file for a regex - regular expression. The regex is compiled by the caller and passed in, for performance.
Definition: regexscan.c:79
char SQL[256]
For DB.
Definition: regexscan.c:57
PGconn * pgConn
Database connection.
Definition: regexscan.c:67
int pfileNumToNames(char *pfileNum, char *pfileRepoName, char *pfileRealName)
Creates filenames from pfile_pk value.
Definition: regexscan.c:141
Store the results of a regex match.
Definition: scanners.hpp:28