FOSSology  4.4.0
Open Source License Compliance by Open Source Software
get-projects.php
1 #!/usr/bin/php
2 <?php
3 /*
4  get-projects.php
5  SPDX-FileCopyrightText: © 2007 Hewlett-Packard Development Company, L.P.
6 
7  SPDX-License-Identifier: GPL-2.0-only
8 */
9 
43 /*
44  * Defects:
45  * 1. If no parameters are passed in, the script behaves oddly (the output
46  * directories are created before the missing input file is reported); need to check for that case.
47  */
48 // pathinclude below is dependent on having fossology installed.
49 require_once "FIXMETOBERELATIVE/pathinclude.php"; // brings in global $PROJECTSTATEDIR +
50 global $LIBDIR;
51 global $INCLUDEDIR;
52 require_once("$LIBDIR/lib_projxml.h.php");
53 require_once("$INCLUDEDIR/fm-paths.php");
54 
55 $usage = <<< USAGE
56 Usage: get-projects [-h] -f <file>
57  Where <file> is an uncompressed XML file, fully qualified
58  -h displays this usage.
59 
60 USAGE;
61 
62 $XML_input_file = NULL;
63 
// Parse the command line.
//   -f <file>  the uncompressed XML input file
//   -h         print the usage text and exit successfully
// Any argument error prints a message and exits with status 1, matching the
// error handling used by the rest of this script (a bare die("...") would
// exit with status 0 and hide the failure from calling shell scripts).
for ($i = 1; $i < $argc; $i++) {
  switch ($argv[$i]) {
    case '-f':
      // the file name is the argument following -f
      $i++;
      if (!isset($argv[$i])) {
        echo "ERROR: Must specify an uncompressed filename after -f\n";
        exit(1);
      }
      $XML_input_file = $argv[$i];
      break;
    case '-h':
      echo $usage;
      exit(0);
    default:
      echo "ERROR: Unknown argument: {$argv[$i]}\n$usage";
      exit(1);
  }
}
82 
83 // convention is to put the trailing / at the end of the dir so everyone else
84 // doesn't have to worry about it.
85 // FIX THIS: need to have env file created by install process.
86 
87 // set the destination directory, use /tmp if none supplied
// Set the destination directory; fall back to /tmp/ when $FMDIR (expected to
// come from fm-paths.php) is not set or is empty.
$dest_dir = empty($FMDIR) ? '/tmp/' : $FMDIR;

// The output tree is named after today's date: <base>/golden.YYYY-MM-DD/
$yyyymmdd = date('Y-m-d');
$golden = '/golden.' . $yyyymmdd . '/';
$dest_dir .= $golden;
$wget_logs = $dest_dir . 'wget-logs/';
$log_data = $dest_dir . 'Logs-Data/';
$input_files = $dest_dir . 'Input-files/';

// Create any output directory that does not exist yet. Native recursive
// mkdir() is used instead of shelling out to "mkdir -p", so a path containing
// spaces or shell metacharacters cannot break or alter the command.
foreach (array($dest_dir, $wget_logs, $log_data, $input_files) as $needed_dir) {
  if (is_dir($needed_dir)) {
    continue;
  }
  // The second is_dir() covers the case where the directory appeared between
  // the check above and the mkdir() call.
  if (!mkdir($needed_dir, 0777, true) && !is_dir($needed_dir)) {
    echo "ERROR: can't create output directory: $needed_dir\n";
    exit(1);
  }
}
unset($needed_dir);
134 
135 // make sure we have some sort of valid input (e.g. gp -f)
// Make sure we were given an input file at all (e.g. the script was run
// without -f).
if (is_null($XML_input_file)) {
  echo "Error: null input file\n";
  echo $usage;
  exit(1);
}

// simplexml can't deal with a compressed file, so reject the common compressed
// suffixes. This is only a name check: a compressed file with an unusual name
// will still get through. Possible enhancement: uncompress the file instead.
$last = strrchr($XML_input_file, ".");
if ($last !== false
    && in_array(strtolower($last), array('.gz', '.bz2', '.zip'), true)) {
  // Say why we are bailing out; the usage text alone does not explain it.
  echo "Error: $XML_input_file appears to be compressed, uncompress it first\n";
  echo $usage;
  exit(1);
}
158 
echo "Processing Xml file $XML_input_file\n";

// Parse the xml file and build the data structure. Per the original notes,
// read_pfile returns the data structure sorted (ascending).
$fm_projects = read_pfile($XML_input_file);

// Look for projects without any of the 3 archives (.gz, .bz2, .zip). Each one
// found is logged to the skipped_fmprojects file and removed from the
// fm_projects array.
//
// The log is opened once, on the first skipped project, and kept open until
// the loop ends. Opening it with mode 'w' for every skipped project truncated
// the file each time, so only the last skipped project was ever recorded.
$projects_skipped = 0;
$NoUrls = NULL;
foreach ($fm_projects as $rank => $key) {
  foreach ($key as $name => $values) {
    list(
      $url_tgz,
      $url_bz2,
      $url_zip,
      $homepage,
      $short_desc,
      $release_version,
      $release_version_id,
      $release_version_date
    ) = $values;
  }
  if (($url_tgz == "") && ($url_bz2 == "") && ($url_zip == "")) {
    if ($NoUrls === NULL) {
      $NoUrls = fopen("{$log_data}skipped_fmprojects", 'w');
      if ($NoUrls === false) {
        $err = error_get_last();
        echo "Can't open: " . ($err === NULL ? 'unknown error' : $err['message']) . "\n";
        exit(1);
      }
    }
    // fwrite() reports failure with false, not -1
    if (fwrite($NoUrls, "$rank $name $homepage $release_version\n") === false) {
      $err = error_get_last();
      echo "Can't write: " . ($err === NULL ? 'unknown error' : $err['message']) . "\n";
      exit(1);
    }
    $projects_skipped++;
    // foreach iterates over a copy, so removing the entry here is safe
    unset($fm_projects[$rank]);
  }
}
if ($NoUrls !== NULL) {
  fclose($NoUrls);
}
197 
198 /*
199  * At this point the array should only have the
200  * (fm_projects - skipped projects). The working list will have AT LEAST
201  * 1 archive. Go get it.
202  * wget_url is called synchonisly(sp) since we only get 1 package and
203  * we need to know if what wget return status and what it got us.
204  */
205 
// Download one archive per remaining project. wget_url is run in synchronous
// mode ('s') so its result can be inspected right away.
//   $uploads          descriptions of downloads that are compressed archives
//   $skipped_uploads  descriptions of downloads that are something else
$skipped_uploads = array();
$uploads = array();
$mode = 's';
$uploads_scheduled = 0;
foreach ($fm_projects as $pkg_rank => $nkey) {
  foreach ($nkey as $pkg_name => $pkg_data) {
    // unpack the project record
    list(
      $tgz_url,
      $bz2_url,
      $zip_url,
      $homepg,
      $short_desc,
      $ver,
      $ver_id,
      $ver_date
    ) = $pkg_data;

    // data every archive type shares; the chosen url is pushed on the front
    $common_data = array($short_desc, $ver, $ver_id, $ver_date);

    // archive types in order of preference: .gz, then .bz2, then .zip
    $candidates = array(
      '.gz' => $tgz_url,
      '.bz2' => $bz2_url,
      '.zip' => $zip_url
    );

    echo "Trying project #$pkg_rank $pkg_name at:\n";
    foreach ($candidates as $ark_suffix => $ark_url) {
      if ($ark_url != "") {
        array_unshift($common_data, $ark_url);
        $tupload = wget_url($pkg_rank, $pkg_name, $ark_suffix, $common_data, $mode);
        break; // only the first available archive is fetched
      }
    }

    if ($tupload['Null'] === NULL) {
      echo "Warning! There may have been an undetected error in the wget of $pkg_name\n";
      echo "Check the wget logs in $wget_logs\n";
    }

    if ($tupload['Compressed'] !== NULL) {
      $uploads[] = $tupload['Compressed'];
      $uploads_scheduled++;
      echo "#$pkg_rank $pkg_name was downloaded and can be scheduled for an upload\n";
    }
    elseif ($tupload['Uncompressed'] !== NULL) {
      echo "WARNING! did not get a compressed archive from wget\n";
      echo "Will Not upload $pkg_name\n";
      $skipped_uploads[] = $tupload['Uncompressed'];
    }
    // eye-candy, separates packages in the output
    echo "\n-----\n";
  }
}
271 
272 // save the skipped uploads in a file (if any)
273 
// Save the skipped uploads (downloads that turned out not to be compressed
// archives) in a file, one entry per line, if there were any.
$skipped_up = count($skipped_uploads);
if ($skipped_up != 0) {
  // $log_data already ends in '/', so no separator is added here
  $skipped_log = "{$log_data}skipped_uploads";

  echo "Saving skipped uploads (downloaded files that were not compressed)\n";
  echo "There were $skipped_up skipped uploads, see $skipped_log for details\n";

  $SUP = fopen($skipped_log, 'w');
  if ($SUP === false) {
    $err = error_get_last();
    die("Can't open $skipped_log, " . ($err === NULL ? 'unknown error' : $err['message']) . "\n");
  }
  foreach ($skipped_uploads as $skipped) {
    if (fwrite($SUP, "$skipped\n") === false) {
      $err = error_get_last();
      die("Can't write to $skipped_log, " . ($err === NULL ? 'unknown error' : $err['message']) . "\n");
    }
  }
  fclose($SUP);
}

// At this point the wgets are done and $uploads lists the ones that produced
// a compressed archive. Turn that list into an input file for cp2foss, which
// does the actual upload.
create_cp2foss_ifile($uploads, "{$input_files}Freshmeat_to_Upload");

/* Report results */
report($log_data);
297 
298 // end of Main....
299 
/**
 * Write the input file that cp2foss reads to perform the uploads.
 *
 * Each element of $uploads is handed to parse_fm_input() (defined outside
 * this file), which is expected to return, in order: rank, name, archive
 * path, description, version, version id and version date. One line of
 * cp2foss arguments is written per entry that has an archive path.
 *
 * The script is terminated with die() if the file cannot be opened or
 * written.
 *
 * @param array  $uploads  upload descriptions collected from wget_url()
 * @param string $filename path of the file to create (truncated if it exists)
 * @return void
 */
function create_cp2foss_ifile($uploads, $filename)
{
  $UPLOAD = fopen($filename, 'w');
  if ($UPLOAD === false) {
    $err = error_get_last();
    die("ERROR: can't open $filename, " . ($err === NULL ? 'unknown error' : $err['message']) . "\n");
  }

  foreach ($uploads as $upload_entry) {
    $parms = parse_fm_input($upload_entry);

    list (
      $rank,
      $name,
      $archive_path,
      $description,
      $version,
      $version_id,
      $version_date
    ) = $parms;

    // don't write an entry that has no archive path (wget either returned
    // an error or a file that was not a compressed archive).
    if (!isset($archive_path)) {
      continue;
    }

    $folder_path = '-p Freshmeat';
    $alpha = '-A';
    $name = "-n '$name-$version'";
    // For now the -A goes at the end to work around a defect in cp2foss.
    $cp2foss_input = "$folder_path $name -a $archive_path -d '$description' $alpha\n";

    if (fwrite($UPLOAD, $cp2foss_input) === false) {
      $err = error_get_last();
      die("Errors: can't write " . ($err === NULL ? 'unknown error' : $err['message']) . "\n");
    }
  }
  fclose($UPLOAD);
  return;
}
/**
 * Print a summary of the run to standard output.
 *
 * Reads the globals $uploads_scheduled, $projects_skipped and $input_files.
 * Paths are passed to printf() as %s arguments rather than being interpolated
 * into the format string, so a '%' in a directory name cannot be
 * misinterpreted as a conversion specifier.
 *
 * @param string $output_dir directory holding the log data; a trailing '/'
 *                           is expected because file names are appended
 *                           to it directly
 * @return void
 */
function report($output_dir)
{
  global $projects_skipped;
  global $uploads_scheduled;
  global $input_files;

  if ($uploads_scheduled) {
    printf(
      "There were %d projects scheduled for uploading\nSee the %sFreshmeat_to_Upload\nfile for details\n\n",
      $uploads_scheduled, $input_files);
  }
  // this doesn't make sense, fix later...
  else {
    printf("There were %d projects downloaded\nSee the %s for details\n\n",
      $uploads_scheduled, $output_dir);
  }

  if ($projects_skipped != 0) {
    printf(
      "There were %d skipped projects for this run\nSee the %sskipped_fmprojects file for details\n",
      $projects_skipped, $output_dir);
  }
  else {
    printf("There were %d skipped projects for this run\n", $projects_skipped);
    echo ("Skipped projects are projects that had no compressed downloadable archives\n");
  }
  echo "To upload the files into the data-base run cp2foss using the Freshmeat_to_Upload file\n";
  return;
}
388 
389 
390 
/**
 * Download one project archive with wget and classify what was downloaded.
 *
 * NOTE: quite a few of the urls that are supposed to point to an archive
 * really end up just depositing a file in various forms:
 * *.html, *.cgi, showfiles.php?xxxxxx, etc.
 *
 * Reads the globals $wget_logs, $log_data and $dest_dir (each is used as a
 * directory prefix, so a trailing '/' is expected). wget writes its log to
 * "{$wget_logs}log.<project_name>-<project_rank>".
 *
 * Every value placed on a command line is passed through escapeshellarg(),
 * so urls containing '&', '?' or spaces no longer break the command.
 *
 * @param mixed  $project_rank rank of the project, used in logs and results
 * @param string $project_name name of the project
 * @param string $ark_type     archive suffix chosen by the caller (not used
 *                             by this function)
 * @param array  $proj_data    url, short description, version, version id
 *                             and version date, in that order
 * @param string $mode         's' runs wget and waits for it; 'a' starts wget
 *                             in the background and does not inspect the
 *                             result; any other value does nothing
 * @return array keys 'Compressed', 'Null' and 'Uncompressed', all NULL unless
 *               set: 'Null' becomes true when wget succeeded and a downloaded
 *               path was found in its log; then exactly one of 'Compressed'
 *               or 'Uncompressed' holds a quoted description of the download.
 *               For modes other than 's' all three stay NULL.
 */
function wget_url($project_rank, $project_name, $ark_type, $proj_data, $mode)
{
  global $wget_logs;
  global $log_data;
  global $dest_dir;

  list($url,
    $short_desc,
    $ver,
    $ver_id,
    $ver_date
  ) = $proj_data;

  // All NULL, so the caller knows which one (if any) got set.
  $upload = array(
    'Compressed' => NULL,
    'Null' => NULL,
    'Uncompressed' => NULL
  );

  if ($mode != 'a' && $mode != 's') {
    return($upload);
  }

  $log_path = "$wget_logs" . "log.$project_name-" . "$project_rank";

  // The original code also prepended an undefined local $proxy, which always
  // expanded to an empty string; it is omitted here.
  $wCmd = 'wget -P ' . escapeshellarg((string) $dest_dir)
    . ' -o ' . escapeshellarg($log_path)
    . ' ' . escapeshellarg((string) $url);

  echo "$url\n";

  if ($mode == 'a') {
    // fire and forget: the shell backgrounds wget, nothing to inspect
    system($wCmd . ' &', $retval);
    return($upload);
  }

  // $mode == 's'
  exec($wCmd, $wget_output, $retval);
  if ($retval != 0) {
    // record the failed download and report "nothing downloaded"
    $WGF = fopen("{$log_data}failed-wgets", 'a');
    if ($WGF === false) {
      $err = error_get_last();
      die("Can't open: " . ($err === NULL ? 'unknown error' : $err['message']) . "\n");
    }
    if (fwrite($WGF, "$project_rank $project_name $url\n") === false) {
      $err = error_get_last();
      die("Can't write: " . ($err === NULL ? 'unknown error' : $err['message']));
    }
    fclose($WGF);
    return($upload);
  }

  // wget can return a 0 (zero) exit status with 404 type errors, which
  // _getfmpath detects in the log. A NULL path means a failed wget.
  $archive_path = _getfmpath($log_path);
  if (is_null($archive_path)) {
    echo "Warning! returning NULL for an archive path\n";
    return($upload);
  }

  // wget appears to have worked, now what type of file got downloaded?
  // Only archives are processed; anything else is usually a download of the
  // project's front page, which is useless to upload.
  $type = (string) exec('file -b ' . escapeshellarg($archive_path), $file_output, $ret_val);

  // gzip and bzip2 are reported by file(1) as "... compressed data", zip
  // files as "Zip archive data"; without the second test a .zip download
  // was always classified as not being an archive.
  $is_archive = strpos($type, 'compressed data') !== false
    || strpos($type, 'Zip archive data') !== false;

  if ($is_archive) {
    $upload['Compressed'] =
      "'$project_rank' '$project_name' '$archive_path' '$short_desc' '$ver' '$ver_id' '$ver_date'";
  }
  else {
    $upload['Uncompressed'] = "'$project_name' '$archive_path'";
  }
  $upload['Null'] = true;

  return($upload);
}
/**
 * Find the path of the file wget saved by reading the wget log file.
 *
 * The Freshmeat rdf uses a fake url and archive name, so the real path of
 * the downloaded archive has to be taken from the log. The status line is
 * normally the next-to-last line of the log; when that line starts with
 * "Removed " the last line is used instead. The path is the fifth
 * space-separated field of the status line, with the surrounding ` and '
 * quotes stripped.
 *
 * An explanatory message is echoed when the status line shows an HTTP error
 * (404, 502, 503, 400) or a request for --no-check-certificate.
 *
 * @param string $path path of the wget log file
 * @return string|null the saved file's path, or NULL when the log cannot be
 *                     read, is too short, reports an error, or has no
 *                     fifth field
 */
function _getfmpath($path)
{
  $path_wanted = NULL;

  if (!is_readable($path)) {
    return($path_wanted);
  }
  $contents = file($path);
  if ($contents === false) {
    return($path_wanted);
  }
  $size = count($contents);
  if ($size < 2) {
    // not enough lines to hold a status line
    return($path_wanted);
  }

  $stat_line = $contents[$size - 2];
  if (strncmp($stat_line, 'Removed ', 8) === 0) {
    // adjust for a different case: if wget downloads a .listing file
    // it turns it into an index.html file instead.
    $stat_line = $contents[$size - 1];
  }

  // We shouldn't find errors like these in the file, since wget is supposed
  // to have returned with 0 status, but it happens.
  foreach (array('404', '502', '503', '400') as $http_code) {
    if (strpos($stat_line, "ERROR $http_code:") !== false) {
      echo "ERROR $http_code found in file $path\n";
      echo "Line was:\n$stat_line\n";
      return($path_wanted);
    }
  }
  if (strpos($stat_line, '--no-check-certificate') !== false) {
    echo "ERROR Secure connect to sourceforge.net needed: in file $path\n";
    echo "Line was:\n$stat_line\n";
    return($path_wanted);
  }

  $chunks = explode(' ', $stat_line);
  if (!isset($chunks[4])) {
    // status line is not in the expected "... - `path' saved ..." form
    return($path_wanted);
  }
  // Strip the ` off the front and the ' off the end
  $path_wanted = rtrim(ltrim($chunks[4], '`'), '\'');

  return($path_wanted);
}
Usage()
Print Usage statement.
Definition: fo_dbcheck.php:63
#define ERROR(...)
Definition: logging.h:79
FUNCTION void usage(char *name)
Definition: usage.c:18
if(!preg_match("/\s$projectGroup\s/", $groups) &&(posix_getgid() !=$gInfo[ 'gid']))
get monk license list of one specified uploadtree_id
Definition: migratetest.php:33
list_t type structure used to keep various lists. (e.g. there are multiple lists).
Definition: nomos.h:308