FOSSology  4.4.0
Open Source License Compliance by Open Source Software
doctorBuffer_utils.c
Go to the documentation of this file.
1 /*
2  SPDX-FileCopyrightText: © 2006-2014 Hewlett-Packard Development Company, L.P.
3  SPDX-FileCopyrightText: © 2014 Siemens AG
4 
5  SPDX-License-Identifier: GPL-2.0-only
6 */
7 
8 #include "doctorBuffer_utils.h"
/**
 * Marker byte written over characters that should disappear from the text.
 * Bytes carrying this value are dropped when the buffer is garbage collected
 * (see compressDoctoredBuffer()).
 *
 * The expansion is fully parenthesized so that it stays a single operand in
 * any expression it is pasted into.
 */
#define INVISIBLE ((int) '\377')
10 
11 #include "nomos.h"
12 #include "list.h"
13 #include "util.h"
14 #include "nomos_regex.h"
15 
25 int compressDoctoredBuffer( char* textBuffer)
26 {
27  int previous=strlen(textBuffer);
28 
29  if(cur.docBufferPositionsAndOffsets)
30  g_array_free(cur.docBufferPositionsAndOffsets, TRUE);
31  cur.docBufferPositionsAndOffsets=collapseInvisible(textBuffer,INVISIBLE);
32 // cur.docBufferPositionsAndOffsets=collapseSpaces(textBuffer);
33 
34  int after = strlen(textBuffer);
35 
36  return previous - after;
37 }
38 
/**
 * \brief Remove HTML comments from buffer without removing comment text.
 *
 * Walks the buffer once and overwrites markup delimiters with blanks:
 *  - a '<' that opens a tag (not followed by '<' or ' ') unless the tag is
 *    "<string" (case-insensitive), and the matching '>';
 *  - a '&' that opens an entity unless the entity is "&copy;", and the
 *    matching ';';
 *  - the '!' directly after a blanked '<', which also ends the "inside a
 *    tag" state so that the text of an HTML comment is left in place;
 *  - any other '<' or '>' met outside a tag/entity.
 *
 * \param[in,out] buf NUL-terminated text, edited in place; NULL is tolerated
 */
void removeHtmlComments(char* buf)
{
  int inTag = 0;     /* a tag was opened and its '>' has not been seen yet */
  int inEntity = 0;  /* an '&' was blanked; cleared by ';' or end of line */
  char* p;

  for (p = buf; p && *p; p++)
  {
    if (*p == '<' && p[1] != '<' && p[1] != ' ')
    {
#if (DEBUG>5) && defined(DOCTOR_DEBUG)
      int x = strncasecmp(p, "<string", 7);
      printf("CHECK: %c%c%c%c%c%c%c == %d\n", *p,
          *(p+1), *(p+2), *(p+3), *(p+4),
          *(p+5), *(p+6), x);
#endif /* DEBUG>5 && DOCTOR_DEBUG */
      if (strncasecmp(p, "<string", 7) != 0)
      {
        *p = ' ';
        /* "<--" does not enter the tag state; everything else does */
        if (p[1] != '-' || p[2] != '-')
        {
          inTag = 1;
        }
      }
      continue;
    }

    if (*p == '&')
    {
#if (DEBUG>5) && defined(DOCTOR_DEBUG)
      int x = strncasecmp(p, "&copy;", 6);
      printf("CHECK: %c%c%c%c%c%c == %d\n", *p,
          *(p+1), *(p+2), *(p+3), *(p+4),
          *(p+5), x);
#endif /* DEBUG>5 && DOCTOR_DEBUG */
      if (strncasecmp(p, "&copy;", 6) != 0)
      {
        *p = ' ';
        inEntity = 1;
      }
      continue;
    }

    if (inTag && *p == '>')
    {
      *p = ' ';
      inTag = 0;
      continue;
    }

    if (inEntity && *p == ';')
    {
      *p = ' ';
      inEntity = 0;
      continue;
    }

    if (isEOL(*p))
    {
      inEntity = 0;
      continue;
    }

    /* Don't remove text in an HTML comment (e.g., turn the flag off) */
    if (*p == '!' && inTag && p != buf && p[-1] == ' ')
    {
      *p = ' ';
      inTag = 0;
      continue;
    }

    if (inTag || inEntity)
    {
      /* text inside a tag or entity is deliberately left untouched */
      continue;
    }

    if (*p == '<' || *p == '>')
    {
      *p = ' ';
    }
  }
}
113 
120 void removeLineComments(char* buf)
121 {
122  char* cp;
123  char* MODULE_LICENSE = "MODULE_LICENSE";
124  cp = buf;
125  while (idxGrep(_UTIL_BOL_MAGIC, cp, REG_ICASE | REG_NEWLINE | REG_EXTENDED))
126  {
127 #ifdef DOCTOR_DEBUG
128  dumpMatch(cp, "Found \"comment\"-text");
129 #endif /* DOCTOR_DEBUG */
130  cp += cur.regm.rm_so;
131  switch (*cp)
132  {
133  case '>':
134  *cp++ = ' ';
135  break;
136  case '@': /* texi special processing */
137  *cp++ = INVISIBLE;
138  if (strncasecmp(cp, "author", 6) == 0)
139  {
140  (void) memset(cp, ' ', 6);
141  cp += 6;
142  }
143  else if (strncasecmp(cp, "comment", 7) == 0)
144  {
145  (void) memset(cp, ' ', 7);
146  cp += 7;
147  }
148  else if (strncasecmp(cp, "center", 6) == 0)
149  {
150  (void) memset(cp, ' ', 6);
151  cp += 6;
152  }
153  else if (strncasecmp(cp, "rem", 3) == 0)
154  {
155  (void) memset(cp, ' ', 3);
156  cp += 3;
157  }
158  else if (*cp == 'c')
159  {
160  *cp++ = INVISIBLE;
161  if (strncasecmp(cp, " essay", 6) == 0)
162  {
163  (void) memset(cp, ' ', 6);
164  cp += 6;
165  }
166  }
167  break;
168  case '/': /* c++ style comment // */
169  if (cp && cp[0])
170  {
172  if (strstr(cp, MODULE_LICENSE) && '/' == cp[0])
173  {
174  (void) memset(cp, INVISIBLE, strlen(cp));
175  cp += strlen(cp);
176  }
177  else
178  {
179  (void) memset(cp, INVISIBLE, 2);
180  cp += 2;
181  }
182  }
183  break;
184  case '\\': /* c++ style comment // */
185  if (strncasecmp(cp + 1, "par ", 3) == 0)
186  {
187  (void) memset(cp, ' ', 4);
188  }
189  cp += 4;
190  break;
191  case 'r':
192  case 'R': /* rem */
193  case 'd':
194  case 'D': /* dnl */
195  (void) memset(cp, INVISIBLE, 3);
196  cp += 3;
197  break;
198  case 'x':
199  case 'X': /* xcomm */
200  (void) memset(cp, INVISIBLE, 5);
201  cp += 5;
202  break;
203  case 'c':
204  case 'C': /* comment */
205  (void) memset(cp, INVISIBLE, 7);
206  cp += 7;
207  break;
208  case '%': /* %%copyright: */
209  (void) memset(cp, INVISIBLE, 12);
210  cp += 12;
211  break;
212  }
213  }
214 }
215 
220 void cleanUpPostscript(char* buf)
221 {
222  char* cp;
223  char* x;
224  cp = buf;
225  while (idxGrep(_UTIL_POSTSCR, cp, REG_EXTENDED | REG_NEWLINE))
226  {
227 #ifdef DOCTOR_DEBUG
228  dumpMatch(cp, "FOUND postscript-thingy");
229 #endif /* DOCTOR_DEBUG */
230  x = cp + cur.regm.rm_so;
231  cp += cur.regm.rm_eo;
232  while (x < cp)
233  {
234  *x++ = ' '/*INVISIBLE*/;
235  }
236  }
237 }
238 
/**
 * \brief Remove groff/troff font-size indicators, the literal string
 *        backslash-n and all backslashes.
 *
 * Every backslash is replaced by a space. When the backslash introduces
 * "\s", an optional sign and any following digits are blanked as well; when
 * it introduces "\n", the 'n' is blanked too. Any other character after a
 * backslash is kept.
 *
 * \param[in,out] buf NUL-terminated text, edited in place
 */
void removeBackslashesAndGTroffIndicators(char* buf)
{
  char* cp;

  for (cp = buf; *cp; cp++)
  {
    char* end;

    if (*cp != '\\')
    {
      continue;
    }

    end = cp + 1; /* one past the last byte to blank */
    if (*end == 's')
    {
      end++;
      if (*end == '+' || *end == '-')
      {
        end++;
      }
      /* <ctype.h> functions need an unsigned char value; plain char may be
       * negative for bytes >= 0x80 */
      while (isdigit((unsigned char) *end))
      {
        end++;
      }
    }
    else if (*end == 'n')
    {
      end++;
    }
    (void) memset(cp, /*INVISIBLE*/' ', (size_t) (end - cp));
  }
}
275 
286 {
287  char* cp;
288  for (cp = buf; /*cp < end &&*/*cp; cp++)
289  {
290  if ((*cp == '\302') && (*(cp + 1) == '\251'))
291  {
292  cp += 1;
293  continue;
294  }
295  if (*cp & (char) 0x80)
296  {
297  *cp = INVISIBLE;
298  continue;
299  }
300  switch (*cp)
301  {
302  /*
303  Convert eol-characters AND some other miscellaneous
304  characters into spaces (due to comment-styles, etc.)
305  */
306  case '\a':
307  case '\t':
308  case '\n':
309  case '\r':
310  case '\v':
311  case '\f':
312  case '[':
313  case ']':
314  case '{':
315  case '}':
316  case '*':
317  case '=':
318  case '#':
319  case '$':
320  case '|':
321  case '%':
322  case '!':
323  case '?':
324  case '`':
325  case '"':
326  case '\'':
327  *cp = ' ';
328  break;
329  /* allow + only within the regex " [Mm]\+ " */
330  case '+':
331  if (*(cp + 1) == 0 || *(cp + 1) == ' ' || *(cp + 1) == '\t' || *(cp + 1) == '\n' || *(cp + 1) == '\r')
332  break;
333  else if (cp > buf + 1 && (*(cp - 1) == 'M' || *(cp - 1) == 'm') && *(cp - 2) == ' ' && *(cp + 1) == ' ')
334  {
335  /* no-op */
336  }
337  else
338  {
339  *cp = ' ';
340  }
341  break;
342  case '(':
343  if ((*(cp + 1) == 'C' || *(cp + 1) == 'c') && *(cp + 2) == ')')
344  {
345  cp += 2;
346  continue;
347  }
348  else
349  {
350  *cp = ' ';
351  }
352  break;
353  case ')':
354  case ',':
355  case ':':
356  case ';':
357  if (!isCR)
358  {
359  *cp = ' ';
360  }
361  break;
362  case '.':
363  if (!isCR)
364  {
365  *cp = INVISIBLE;
366  }
367  break;
368  case '<':
369  if (strncasecmp(cp, "<string", 7) == 0)
370  {
371  (void) memcpy(cp, " ", 7 * sizeof(char));
372  }
373  break;
374  /* CDB - Big #ifdef 0 left out */
375  case '\001':
376  case '\002':
377  case '\003':
378  case '\004':
379  case '\005':
380  case '\006':
381  case '\016':
382  case '\017':
383  case '\020':
384  case '\021':
385  case '\022':
386  case '\023':
387  case '\024':
388  case '\025':
389  case '\026':
390  case '\027':
391  case '\030':
392  case '\031':
393  case '\032':
394  case '\033':
395  case '\034':
396  case '\035':
397  case '\036':
398  case '\037':
399  case '~':
400  *cp = INVISIBLE;
401  break;
402 #ifdef DOCTOR_DEBUG
403  case ' ': case '/': case '-': case '@': case '&':
404  case '>': case '^': case '_':
405  case INVISIBLE:
406  break;
407  default:
408  if (!isalpha(*cp) && !isdigit(*cp))
409  {
410  printf("DEBUG: \\0%o @ %ld\n",
411  *cp & 0xff, cp-buf);
412  }
413  break;
414 #endif /* DOCTOR_DEBUG */
415  }
416  }
417 }
418 
426 void dehyphen(char* buf)
427 {
428  char* cp;
429 
430  for (cp = buf; idxGrep(_UTIL_HYPHEN, cp, REG_ICASE); /*nada*/)
431  {
432 #ifdef DOCTOR_DEBUG
433  char* x;
434  x = cp + cur.regm.rm_so;
435  while ((x > cp) && !isspace(*x))
436  {
437  x--;
438  }
439  printf("Hey! hyphenated-word [");
440  for (++x; x <= (cp + cur.regm.rm_eo); x++)
441  {
442  printf("%c", *x);
443  }
444  while (!isspace(*x))
445  {
446  printf("%c", *x++);
447  }
448  printf("]\n");
449 
450 #endif /* DOCTOR_DEBUG */
451  cp += cur.regm.rm_so + 1;
452  *cp++ = INVISIBLE;
453  while (isspace(*cp))
454  {
455  *cp++ = INVISIBLE;
456  }
457  }
458 
459 }
460 
467 void removePunctuation(char* buf)
468 {
469  char* cp;
470  char* x;
471  for (cp = buf; idxGrep(_UTIL_MISCPUNCT, cp, REG_EXTENDED); /*nada*/)
472  {
473  x = cp + cur.regm.rm_so;
474  cp += cur.regm.rm_eo - 1; /* leave ' ' alone */
475  while (x < cp)
476  {
477  *x++ = ' ';
478  }
479  cp++;
480  }
481  for (cp = buf; idxGrep(_UTIL_LATEX, cp, REG_ICASE); /*nada*/)
482  {
483  x = cp + cur.regm.rm_so;
484  cp += cur.regm.rm_eo;
485  while (x <= cp)
486  {
487  *x++ = ' ';
488  }
489  cp++;
490  }
491 }
492 
505 void ignoreFunctionCalls(char* buf)
506 {
507  char* cp;
508  char* x;
509  for (cp = buf; idxGrep(_UTIL_PRINT, cp, REG_ICASE); /*nada*/)
510  {
511  x = cp + cur.regm.rm_so;
512  cp += (cur.regm.rm_eo - 1);
513  if ((x > buf) && ((*(x - 1) == 'r') || (*(x - 1) == 't')))
514  {
515  continue;
516  }
517  while (x < cp)
518  {
519  *x++ = ' ';
520  }
521  cp++;
522  }
523 }
524 
530 void convertSpaceToInvisible(char* buf)
531 {
532  char* cp;
533  for (cp = buf; *cp; /*nada*/)
534  {
535  if (*cp++ == ' ')
536  {
537  while (*cp)
538  {
539  if (*cp == ' ')
540  {
541  *cp++ = INVISIBLE;
542  }
543  else if (*cp == INVISIBLE)
544  {
545  cp++;
546  }
547  else
548  {
549  break;
550  }
551  }
552  }
553  }
554 }
555 
/**
 * \brief Convert a buffer of multiple stuff to text-only, separated by
 *        spaces.
 *
 * We really only care about text "in a license" here, so comments and other
 * unwanted punctuation are stripped. The work is done by a fixed sequence of
 * helper functions, each editing the buffer in place; the final step removes
 * all INVISIBLE bytes.
 *
 * \param[in,out] buf  NUL-terminated text, edited in place
 * \param[in]     isML non-zero: also run the HTML/XML markup step
 * \param[in]     isPS non-zero: also run the postscript clean-up step
 * \param[in]     isCR passed on to
 *                     convertWhitespaceToSpaceAndRemoveSpecialChars()
 */
void doctorBuffer(char *buf, int isML, int isPS, int isCR)
{
#if defined(PROC_TRACE) || defined(DOCTOR_DEBUG)
  traceFunc("== doctorBuffer(%p, %d, %d, %d)\n", buf, isML, isPS, isCR);
#endif /* PROC_TRACE || DOCTOR_DEBUG */

#ifdef DOCTOR_DEBUG
  printf("***** Processing %p (%d data bytes)\n", buf, (int)strlen(buf));
  printf("----- [Dr-BEFORE:] -----\n%s\n[==END==]\n", buf);
#endif /* DOCTOR_DEBUG */
  /*
   * step 1: take care of embedded HTML/XML and special HTML-chars like
   * &quot; and &nbsp; -- but DON'T remove the text in an HTML comment.
   * There might be licensing text/information in the comment!
   *****
   * Later on (in parseLicenses()) we search for URLs in the raw-text
   */
  if (isML)
  {
#ifdef DOCTOR_DEBUG
    printf("DEBUG: markup-languange directives found!\n");
#endif /* DOCTOR_DEBUG */
    removeHtmlComments(buf);
  }
  /*
   * step 2: remove comments that start at the beginning of a line, like
   * ^dnl, ^xcomm, ^comment, and //
   */
  removeLineComments(buf);
  /*
   * step 3: strip out crap at end-of-line on postscript documents
   */
  if (isPS)
  {
    cleanUpPostscript(buf);
#ifdef DOCTOR_DEBUG
    printf("DEBUG: postscript stuff detected!\n");
#endif /* DOCTOR_DEBUG */
  }
  /*
   * step 4: remove groff/troff font-size indicators, the literal
   * string backslash-n and all backslashes, ala:
   *==> perl -pe 's,\\s[+-][0-9]*,,g;s,\\s[0-9]*,,g;s/\\n//g;' |
   */
  removeBackslashesAndGTroffIndicators(buf);
  /*
   * step 5: convert white-space to real spaces, and remove
   * unnecessary punctuation, ala:
   *==> tr -d '*=+#$|%.,:;!?()\\][\140\047\042' | tr '\011\012\015' ' '
   *****
   * NOTE: we purposely do NOT process backspace-characters here. Perhaps
   * there's an improvement in the wings for this?
   */
  convertWhitespaceToSpaceAndRemoveSpecialChars(buf, isCR);
  /*
   * Look for hyphenations of words, to compress both halves into a sin-
   * gle (sic) word. Regex == "[a-z]- [a-z]".
   *****
   * NOTE: not sure this will work based on the way we strip punctuation
   * out of the buffer above -- work on this later.
   */
  dehyphen(buf);
  /*
   * step 6: clean up miscellaneous punctuation, ala:
   *==> perl -pe 's,[-_/]+ , ,g;s/print[_a-zA-Z]* //g;s/ / /g;'
   */
  removePunctuation(buf);
  /*
   * Ignore function calls to print routines: only concentrate on what's being
   * printed (sometimes programs do print licensing information) -- but don't
   * ignore real words that END in 'print', like footprint and fingerprint.
   * Here, we take a risk and just look for a 't' (in "footprint"), or for an
   * 'r' (in "fingerprint"). If someone has ever coded a print routine that
   * is named 'rprint' or 'tprint', we're spoofed.
   */
  ignoreFunctionCalls(buf);
  /*
   * Convert the regex ' [X ]+' (where X is really the character #defined as
   * INVISIBLE) to a single space (and a string of INVISIBLE characters).
   */
  convertSpaceToInvisible(buf);
  /*
   * garbage collect: eliminate all INVISIBLE characters in the buffer
   */
  const int clipped = compressDoctoredBuffer(buf);

#ifdef DOCTOR_DEBUG
  printf("***** Now buffer %p contains %d bytes (%d clipped)\n", buf,
      (int)strlen(buf), clipped);
  printf("+++++ [Dr-AFTER] +++++:\n%s\n[==END==]\n", buf);
#else
  (void) clipped; /* only reported in debug builds */
#endif /* DOCTOR_DEBUG */
  return;
}
684 
#ifdef DOCTORBUFFER_OLD
/**
 * \brief Older variant of doctorBuffer(), only built with DOCTORBUFFER_OLD.
 *
 * Converts a buffer of multiple *stuff* to text-only, separated by spaces.
 * The cleaning steps are carried out by the helper functions of this file
 * (removeHtmlComments(), removeLineComments(), cleanUpPostscript(),
 * removeBackslashesAndGTroffIndicators(),
 * convertWhitespaceToSpaceAndRemoveSpecialChars(), dehyphen(),
 * removePunctuation(), ignoreFunctionCalls(), convertSpaceToInvisible())
 * instead of private copies of the same loops. One consequence: after the
 * byte pair 0xC2 0xA9 scanning resumes with the very next byte, so a text
 * ending in that pair no longer makes the scan step over the terminator.
 *
 * Unlike compressDoctoredBuffer(), the final garbage collection here squeezes
 * the INVISIBLE bytes out directly and does not touch
 * cur.docBufferPositionsAndOffsets.
 *
 * \param[in,out] buf  NUL-terminated text, edited in place
 * \param[in]     isML non-zero: also run the HTML/XML markup step
 * \param[in]     isPS non-zero: also run the postscript clean-up step
 * \param[in]     isCR passed on to
 *                     convertWhitespaceToSpaceAndRemoveSpecialChars()
 */
void doctorBuffer_old(char *buf, int isML, int isPS, int isCR)
{
  printf("Doctor Buffer old \n");
  char *src;
  char *dst;
  int n;

#if defined(PROC_TRACE) || defined(DOCTOR_DEBUG)
  traceFunc("== doctorBuffer(%p, %d, %d, %d)\n", buf, isML, isPS, isCR);
#endif /* PROC_TRACE || DOCTOR_DEBUG */

#ifdef DOCTOR_DEBUG
  printf("***** Processing %p (%d data bytes)\n", buf, (int)strlen(buf));
  printf("----- [Dr-BEFORE:] -----\n%s\n[==END==]\n", buf);
#endif /* DOCTOR_DEBUG */
  /*
   * step 1: take care of embedded HTML/XML and special HTML-chars like
   * &quot; and &nbsp; -- but DON'T remove the text in an HTML comment.
   * There might be licensing text/information in the comment!
   */
  if (isML) {
#ifdef DOCTOR_DEBUG
    printf("DEBUG: markup-languange directives found!\n");
#endif /* DOCTOR_DEBUG */
    removeHtmlComments(buf);
  }
  /*
   * step 2: remove comments that start at the beginning of a line, like
   * ^dnl, ^xcomm, ^comment, and //
   */
  removeLineComments(buf);
  /*
   * step 3: strip out crap at end-of-line on postscript documents
   */
  if (isPS) {
#ifdef DOCTOR_DEBUG
    printf("DEBUG: postscript stuff detected!\n");
#endif /* DOCTOR_DEBUG */
    cleanUpPostscript(buf);
  }
  /*
   * step 4: remove groff/troff font-size indicators, the literal
   * string backslash-n and all backslashes
   */
  removeBackslashesAndGTroffIndicators(buf);
  /*
   * step 5: convert white-space to real spaces, and remove
   * unnecessary punctuation
   */
  convertWhitespaceToSpaceAndRemoveSpecialChars(buf, isCR);
  /*
   * Look for hyphenations of words, to compress both halves into a
   * single word.
   */
  dehyphen(buf);
  /*
   * step 6: clean up miscellaneous punctuation
   */
  removePunctuation(buf);
  /*
   * Ignore function calls to print routines, but keep real words that END
   * in 'print', like footprint and fingerprint.
   */
  ignoreFunctionCalls(buf);
  /*
   * Convert the regex ' [X ]+' (where X is really the character #defined as
   * INVISIBLE) to a single space (and a string of INVISIBLE characters).
   */
  convertSpaceToInvisible(buf);
  /*
   * garbage collect: eliminate all INVISIBLE characters in the buffer by
   * copying every other byte down over them; n counts the dropped bytes
   */
  dst = src = buf;
  n = 0;
  while (/*src < end &&*/ *src) {
    while (/*src < end &&*/ *src == INVISIBLE) {
      n++;
      src++;
    }
    if (*src) {
      *dst++ = *src++;
    }
  }
  *dst = NULL_CHAR;
#ifdef DOCTOR_DEBUG
  printf("***** Now buffer %p contains %d bytes (%d clipped)\n", buf,
      (int)strlen(buf), n);
  printf("+++++ [Dr-AFTER] +++++:\n%s\n[==END==]\n", buf);
#else
  (void) n; /* only reported in debug builds */
#endif /* DOCTOR_DEBUG */
  return;
}
#endif
1092 
1093 
1094 
void dehyphen(char *buf)
void removeBackslashesAndGTroffIndicators(char *buf)
Remove groff/troff font-size indicators, the literal string backslash-n and all backslashes,...
void doctorBuffer(char *buf, int isML, int isPS, int isCR)
Convert a buffer of multiple stuff to text-only, separated by spaces.
int compressDoctoredBuffer(char *textBuffer)
garbage collect: eliminate all INVISIBLE characters in the buffer
void convertSpaceToInvisible(char *buf)
void removePunctuation(char *buf)
Clean up miscellaneous punctuation.
void removeLineComments(char *buf)
Remove comments that start at the beginning of a line.
void cleanUpPostscript(char *buf)
Remove newlines from buffer.
void convertWhitespaceToSpaceAndRemoveSpecialChars(char *buf, int isCR)
Convert white-space to real spaces, and remove unnecessary punctuation.
void ignoreFunctionCalls(char *buf)
Ignore function calls to print routines.
void removeHtmlComments(char *buf)
Remove HTML comments from buffer without removing comment text.
Nomos header file.
#define isEOL(x)
Check if x points to a EOL character.
Definition: nomos.h:240
#define NULL_CHAR
NULL character.
Definition: nomos.h:234
GArray * collapseInvisible(char *text, char invisible)
Definition: nomos_gap.c:19
int idxGrep(int index, char *data, int flags)
compile a regex, and perform the search (on data?)
Definition: nomos_regex.c:205