d4/d7c/doctorBuffer__utils_8c_source.html

 /*

  SPDX-FileCopyrightText: © 2006-2014 Hewlett-Packard Development Company, L.P.

  SPDX-FileCopyrightText: © 2014 Siemens AG


  SPDX-License-Identifier: GPL-2.0-only

 */


 #include "doctorBuffer_utils.h"

 #define INVISIBLE       (int) '\377'


 #include "nomos.h"

 #include "list.h"

 #include "util.h"

 #include "nomos_regex.h"


 /**
  * Garbage-collect a doctored buffer by handing it to collapseInvisible()
  * with INVISIBLE as the marker byte.
  *
  * The GArray returned by collapseInvisible() is stored in
  * cur.docBufferPositionsAndOffsets; an array left there by an earlier call
  * is released (including its element data) first.
  *
  * @param textBuffer NUL-terminated text; its length is measured before and
  *                   after the collapse
  * @return strlen(textBuffer) before the call minus strlen(textBuffer) after
  */
 int compressDoctoredBuffer( char* textBuffer)
 {
   const int lengthBefore = strlen(textBuffer);

   if (cur.docBufferPositionsAndOffsets != NULL)
   {
     g_array_free(cur.docBufferPositionsAndOffsets, TRUE);
   }
   cur.docBufferPositionsAndOffsets = collapseInvisible(textBuffer, INVISIBLE);

   return lengthBefore - (int) strlen(textBuffer);
 }


 /**
  * Blank out markup delimiters and character-entity delimiters in place.
  *
  * Only the delimiter bytes are replaced by spaces; the text between them is
  * left in the buffer (the statement that used to hide it is disabled).
  *
  *  - '<' not followed by '<' or ' ' opens a tag, unless it starts "<string"
  *    (case-insensitive), which is left alone here.  The '<' becomes a space
  *    and, unless the next two bytes are "--", the in-tag flag is raised.
  *  - '&' that does not start "&copy;" becomes a space and raises the
  *    in-entity flag.
  *  - While in a tag, the next '>' becomes a space and ends the tag; a '!'
  *    directly after a blanked '<' does the same, so comment text survives.
  *  - While in an entity, the next ';' becomes a space and ends the entity;
  *    an end-of-line character also ends it.
  *  - Outside tags and entities, any remaining '<' or '>' becomes a space.
  *
  * @param buf NUL-terminated buffer, modified in place (NULL is tolerated)
  */
 void removeHtmlComments(char* buf)
 {
   int inTag = 0;
   int inEntity = 0;
   char* cp;

   /* The tests below are order-sensitive: the first one that applies wins. */
   for (cp = buf; cp && *cp; cp++)
   {
     if ((*cp == '<') && (cp[1] != '<') && (cp[1] != ' '))
     {
 #if     (DEBUG>5) && defined(DOCTOR_DEBUG)
       int x = strncasecmp(cp, "<string", 7);
       printf("CHECK: %c%c%c%c%c%c%c == %d\n", cp[0],
           cp[1], cp[2], cp[3], cp[4],
           cp[5], cp[6], x);
 #endif  /* DEBUG>5 && DOCTOR_DEBUG */
       if (strncasecmp(cp, "<string", 7) != 0)
       {
         *cp = ' ';
         /* "<--" does not open a tag; everything else does */
         if (!(cp[1] == '-' && cp[2] == '-'))
         {
           inTag = 1;
         }
       }
       continue;
     }

     if (*cp == '&')
     {
 #if     (DEBUG>5) && defined(DOCTOR_DEBUG)
       int x = strncasecmp(cp, "&copy;", 6);
       printf("CHECK: %c%c%c%c%c%c == %d\n", cp[0],
           cp[1], cp[2], cp[3], cp[4],
           cp[5], x);
 #endif  /* DEBUG>5 && DOCTOR_DEBUG */
       if (strncasecmp(cp, "&copy;", 6) != 0)
       {
         *cp = ' ';
         inEntity = 1;
       }
       continue;
     }

     if (inTag && (*cp == '>'))
     {
       *cp = ' ';
       inTag = 0;
       continue;
     }

     if (inEntity && (*cp == ';'))
     {
       *cp = ' ';
       inEntity = 0;
       continue;
     }

     if (isEOL(*cp))
     {
       /* an entity never spans lines */
       inEntity = 0;
       continue;
     }

     /* Don't remove text in an HTML comment (e.g., turn the flag off) */
     if ((*cp == '!') && inTag && (cp != buf) && (cp[-1] == ' '))
     {
       *cp = ' ';
       inTag = 0;
       continue;
     }

     /* Inside a tag or entity nothing else is touched; outside, stray angle
      * brackets are blanked. */
     if (!inTag && !inEntity && ((*cp == '<') || (*cp == '>')))
     {
       *cp = ' ';
     }
   }
 }


 /**
  * Hide or blank the comment leader at each position matched by
  * _UTIL_BOL_MAGIC (searched with REG_ICASE | REG_NEWLINE | REG_EXTENDED).
  *
  * The first byte of the match selects the treatment:
  *  - '>'        : replaced by a space
  *  - '@'        : made INVISIBLE; a following "author", "comment", "center"
  *                 or "rem" is blanked, otherwise a following 'c' is made
  *                 INVISIBLE and a following " essay" is blanked
  *  - '/'        : two bytes made INVISIBLE, or, when "MODULE_LICENSE" occurs
  *                 anywhere in the rest of the buffer, the whole rest
  *  - '\\'       : four bytes are skipped; they are blanked first when the
  *                 backslash is followed by "par"
  *  - r/R, d/D   : 3 bytes made INVISIBLE (rem, dnl)
  *  - x/X        : 5 bytes made INVISIBLE (xcomm)
  *  - c/C        : 7 bytes made INVISIBLE (comment)
  *  - '%'        : 12 bytes made INVISIBLE (%%copyright:)
  *
  * NOTE(review): the fixed widths presume the regex only matches those exact
  * keywords, and any other first byte leaves the scan position on the match
  * start -- confirm against the _UTIL_BOL_MAGIC pattern.
  *
  * @param buf NUL-terminated buffer, modified in place
  */
 void removeLineComments(char* buf)
 {
   /* texinfo directives blanked after '@'; tried in this order */
   static const char* const texiWords[] = { "author", "comment", "center", "rem" };
   char* cp = buf;

   while (idxGrep(_UTIL_BOL_MAGIC, cp, REG_ICASE | REG_NEWLINE | REG_EXTENDED))
   {
     size_t hide = 0; /* number of bytes to turn INVISIBLE after the switch */

 #ifdef  DOCTOR_DEBUG
     dumpMatch(cp, "Found \"comment\"-text");
 #endif  /* DOCTOR_DEBUG */
     cp += cur.regm.rm_so;

     switch (*cp)
     {
     case '>':
       *cp = ' ';
       cp++;
       break;

     case '@': /* texi special processing */
     {
       size_t blank = 0;
       size_t i;

       *cp = INVISIBLE;
       cp++;
       for (i = 0; i < sizeof(texiWords) / sizeof(texiWords[0]); i++)
       {
         const size_t len = strlen(texiWords[i]);

         if (strncasecmp(cp, texiWords[i], len) == 0)
         {
           blank = len;
           break;
         }
       }
       if (blank > 0)
       {
         (void) memset(cp, ' ', blank);
         cp += blank;
       }
       else if (*cp == 'c')
       {
         *cp = INVISIBLE;
         cp++;
         if (strncasecmp(cp, " essay", 6) == 0)
         {
           (void) memset(cp, ' ', 6);
           cp += 6;
         }
       }
       break;
     }

     case '/': /* c++ style comment // */
       /* strstr() scans to the end of the buffer, not just to end of line */
       if (strstr(cp, "MODULE_LICENSE") != NULL)
       {
         hide = strlen(cp);
       }
       else
       {
         hide = 2;
       }
       break;

     case '\\': /* "\par"-style directive */
       if (strncasecmp(cp + 1, "par ", 3) == 0)
       {
         (void) memset(cp, ' ', 4);
       }
       cp += 4;
       break;

     case 'r': case 'R': /* rem */
     case 'd': case 'D': /* dnl */
       hide = 3;
       break;

     case 'x': case 'X': /* xcomm */
       hide = 5;
       break;

     case 'c': case 'C': /* comment */
       hide = 7;
       break;

     case '%': /* %%copyright: */
       hide = 12;
       break;
     }

     if (hide > 0)
     {
       (void) memset(cp, INVISIBLE, hide);
       cp += hide;
     }
   }
 }


 /**
  * Overwrite every match of _UTIL_POSTSCR with spaces.
  *
  * The search (REG_EXTENDED | REG_NEWLINE) restarts right after the previous
  * match until idxGrep() reports no further hit.
  *
  * @param buf NUL-terminated buffer, modified in place
  */
 void cleanUpPostscript(char* buf)
 {
   char* scan = buf;

   while (idxGrep(_UTIL_POSTSCR, scan, REG_EXTENDED | REG_NEWLINE))
   {
     char* fill;

 #ifdef  DOCTOR_DEBUG
     dumpMatch(scan, "FOUND postscript-thingy");
 #endif  /* DOCTOR_DEBUG */
     fill = scan + cur.regm.rm_so;
     scan += cur.regm.rm_eo;   /* next search starts after this match */
     for (; fill < scan; fill++)
     {
       *fill = ' ';
     }
   }
 }


 /**
  * Blank out backslashes and groff/troff escapes in place.
  *
  * Each backslash is replaced by a space together with what follows it when
  * that is
  *  - a font-size escape: 's', an optional '+' or '-', then any digits, or
  *  - the letter 'n' (a literal backslash-n pair).
  * A backslash followed by anything else is blanked on its own.
  *
  * @param buf NUL-terminated buffer, modified in place
  */
 void removeBackslashesAndGTroffIndicators(char* buf)
 {
   char* cp;
   char* x;

   for (cp = buf; *cp; cp++)
   {
     if (*cp != '\\')
     {
       continue;
     }

     /* x walks to the first byte that is NOT part of the escape */
     x = cp + 1;
     if (*x == 's')
     {
       x++;
       if ((*x == '+') || (*x == '-'))
       {
         x++;
       }
       /* <ctype.h> functions are undefined for negative values other than
        * EOF, so bytes >= 0x80 must be passed as unsigned char */
       while (isdigit((unsigned char) *x))
       {
         x++;
       }
     }
     else if (*x == 'n')
     {
       x++;
     }
     memset(cp, /*INVISIBLE*/' ', (size_t) (x - cp));
   }
 }


 /**
  * Normalise a buffer byte by byte, in place: white-space and most
  * punctuation become spaces, control characters and non-ASCII bytes become
  * INVISIBLE.
  *
  * Kept as they are: the two-byte sequence \302\251, the sequences "(C)" and
  * "(c)", and a '+' that is followed by a space, tab, newline, carriage
  * return or the end of the buffer.
  *
  * @param buf  NUL-terminated buffer, modified in place
  * @param isCR when nonzero, ')', ',', ':', ';' and '.' are left untouched;
  *             when zero, the first four become spaces and '.' becomes
  *             INVISIBLE
  */
 void convertWhitespaceToSpaceAndRemoveSpecialChars(char* buf, int isCR )
 {
   char* cp;

   for (cp = buf; /*cp < end &&*/*cp; cp++)
   {
     /* keep \302\251 intact: step over its first byte here, the loop
      * increment steps over the second */
     if ((cp[0] == '\302') && (cp[1] == '\251'))
     {
       cp++;
       continue;
     }
     /* every other byte with the high bit set is dropped */
     if (*cp & (char) 0x80)
     {
       *cp = INVISIBLE;
       continue;
     }

     switch (*cp)
     {
     /*
      Convert eol-characters AND some other miscellaneous
      characters into spaces (due to comment-styles, etc.)
      */
     case '\a': case '\t': case '\n': case '\r': case '\v': case '\f':
     case '[': case ']': case '{': case '}':
     case '*': case '=': case '#': case '$': case '|': case '%':
     case '!': case '?': case '`': case '"': case '\'':
       *cp = ' ';
       break;

     case '+':
       /* A '+' survives only in front of white-space or the terminator.
        * The " [Mm]+ " form needs a trailing space and is thereby already
        * covered by this test. */
       if (!(cp[1] == 0 || cp[1] == ' ' || cp[1] == '\t' || cp[1] == '\n' || cp[1] == '\r'))
       {
         *cp = ' ';
       }
       break;

     case '(':
       if ((cp[1] == 'C' || cp[1] == 'c') && cp[2] == ')')
       {
         /* leave "(C)" / "(c)" as written and resume after the ')' */
         cp += 2;
         continue;
       }
       *cp = ' ';
       break;

     case ')': case ',': case ':': case ';':
       if (!isCR)
       {
         *cp = ' ';
       }
       break;

     case '.':
       if (!isCR)
       {
         *cp = INVISIBLE;
       }
       break;

     case '<':
       if (strncasecmp(cp, "<string", 7) == 0)
       {
         (void) memset(cp, ' ', 7);
       }
       break;

       /* CDB - Big #ifdef 0 left out */
     case '\001': case '\002': case '\003': case '\004':
     case '\005': case '\006': case '\016': case '\017':
     case '\020': case '\021': case '\022': case '\023':
     case '\024': case '\025': case '\026': case '\027':
     case '\030': case '\031': case '\032': case '\033':
     case '\034': case '\035': case '\036': case '\037':
     case '~':
       *cp = INVISIBLE;
       break;

 #ifdef  DOCTOR_DEBUG
     case ' ': case '/': case '-': case '@': case '&':
     case '>': case '^': case '_':
     case INVISIBLE:
       break;

     default:
       /* report any byte that no rule above accounts for */
       if (!isalpha(*cp) && !isdigit(*cp))
       {
         printf("DEBUG: \\0%o @ %ld\n",
             *cp & 0xff, cp-buf);
       }
       break;
 #endif  /* DOCTOR_DEBUG */
     }
   }
 }


 /**
  * Join words that were hyphenated across a break.
  *
  * For every match of _UTIL_HYPHEN (searched with REG_ICASE) the byte right
  * after the match start, and all white-space that follows it, are made
  * INVISIBLE.
  *
  * NOTE(review): this presumes the pattern has the shape "[a-z]- [a-z]", so
  * that match start + 1 is the hyphen -- confirm against _UTIL_HYPHEN.
  *
  * @param buf NUL-terminated buffer, modified in place
  */
 void dehyphen(char* buf)
 {
   char* cp;

   for (cp = buf; idxGrep(_UTIL_HYPHEN, cp, REG_ICASE); /*nada*/)
   {
 #ifdef  DOCTOR_DEBUG
     char* x;

     /* back up to the start of the word, then print both halves */
     x = cp + cur.regm.rm_so;
     while ((x > cp) && !isspace((unsigned char) *x))
     {
       x--;
     }
     printf("Hey! hyphenated-word [");
     /* the '*x' tests keep both print loops inside the string when the
      * match reaches the end of the buffer */
     for (++x; (x <= (cp + cur.regm.rm_eo)) && *x; x++)
     {
       printf("%c", *x);
     }
     while (*x && !isspace((unsigned char) *x))
     {
       printf("%c", *x++);
     }
     printf("]\n");
 #endif  /* DOCTOR_DEBUG */

     cp += cur.regm.rm_so + 1;
     *cp++ = INVISIBLE;
     /* <ctype.h> functions are undefined for negative values other than
      * EOF, so the byte is passed as unsigned char */
     while (isspace((unsigned char) *cp))
     {
       *cp++ = INVISIBLE;
     }
   }
 }


 /**
  * Blank out miscellaneous punctuation and LaTeX directives in place.
  *
  * Pass 1: every match of _UTIL_MISCPUNCT (REG_EXTENDED) is overwritten with
  * spaces except for its last byte, which is left alone.
  *
  * Pass 2: every match of _UTIL_LATEX (REG_ICASE) is overwritten with spaces,
  * together with the byte that follows the match -- unless that byte is the
  * NUL terminator, which is preserved and ends the pass.
  *
  * @param buf NUL-terminated buffer, modified in place
  */
 void removePunctuation(char* buf)
 {
   char* cp;
   char* x;

   for (cp = buf; idxGrep(_UTIL_MISCPUNCT, cp, REG_EXTENDED); /*nada*/)
   {
     x = cp + cur.regm.rm_so;
     cp += cur.regm.rm_eo - 1; /* leave ' ' alone */
     while (x < cp)
     {
       *x++ = ' ';
     }
     cp++;
   }

   for (cp = buf; idxGrep(_UTIL_LATEX, cp, REG_ICASE); /*nada*/)
   {
     x = cp + cur.regm.rm_so;
     cp += cur.regm.rm_eo;     /* first byte after the match */
     while (x < cp)
     {
       *x++ = ' ';
     }
     if (*cp == '\0')
     {
       /* The match ran to the end of the buffer.  Blanking this byte and
        * stepping over it would destroy the terminator and send the next
        * search past the end of the string. */
       break;
     }
     *cp++ = ' ';
   }
 }


 /**
  * Blank out the names of print routines so that only what they print
  * remains.
  *
  * Every match of _UTIL_PRINT (REG_ICASE) is overwritten with spaces except
  * for its last byte.  A match whose preceding byte is 'r' or 't' is taken
  * to be the tail of an ordinary word (fingerprint, footprint) and is left
  * as it is.
  *
  * @param buf NUL-terminated buffer, modified in place
  */
 void ignoreFunctionCalls(char* buf)
 {
   char* cp = buf;

   while (idxGrep(_UTIL_PRINT, cp, REG_ICASE))
   {
     char* fill = cp + cur.regm.rm_so;
     const int endsRealWord =
         (fill > buf) && ((fill[-1] == 'r') || (fill[-1] == 't'));

     cp += (cur.regm.rm_eo - 1);
     if (!endsRealWord)
     {
       for (; fill < cp; fill++)
       {
         *fill = ' ';
       }
       cp++;
     }
     /* for a real word the search resumes on the last byte of the match */
   }
 }


 /**
  * Collapse runs of blanks: after each space, every directly following
  * space is turned into INVISIBLE.  INVISIBLE bytes already inside the run
  * are stepped over and do not end it.
  *
  * @param buf NUL-terminated buffer, modified in place
  */
 void convertSpaceToInvisible(char* buf)
 {
   char* cp = buf;

   while (*cp)
   {
     const int wasSpace = (*cp == ' ');

     cp++;
     if (!wasSpace)
     {
       continue;
     }
     /* swallow the rest of the run; storing INVISIBLE over a byte that is
      * already INVISIBLE changes nothing */
     while ((*cp == ' ') || (*cp == INVISIBLE))
     {
       *cp = INVISIBLE;
       cp++;
     }
   }
 }


 /**
  * Convert a buffer of multiple *stuff* to text-only, separated by spaces.
  *
  * We really only care about text "in a license" here, so comment leaders,
  * markup and unwanted punctuation are stripped.  The passes run in a fixed
  * order on the same buffer.
  *
  * @param buf  NUL-terminated buffer, modified in place
  * @param isML nonzero to run the markup pass, removeHtmlComments()
  * @param isPS nonzero to run the postscript pass, cleanUpPostscript()
  * @param isCR passed on to convertWhitespaceToSpaceAndRemoveSpecialChars()
  */
 void doctorBuffer(char *buf, int isML, int isPS, int isCR)
 {
 #if     defined(PROC_TRACE) || defined(DOCTOR_DEBUG)
   traceFunc("== doctorBuffer(%p, %d, %d, %d)\n", buf, isML, isPS, isCR);
 #endif  /* PROC_TRACE || DOCTOR_DEBUG */

 #ifdef  DOCTOR_DEBUG
   printf("***** Processing %p (%d data bytes)\n", buf, (int)strlen(buf));
   printf("----- [Dr-BEFORE:] -----\n%s\n[==END==]\n", buf);
 #endif  /* DOCTOR_DEBUG */

   /*
    * Step 1: embedded HTML/XML and special HTML characters such as &quot;
    * and &nbsp;.  The text inside an HTML comment is deliberately kept: it
    * may carry licensing information.  (URLs are searched for later, in the
    * raw text, by parseLicenses().)
    */
   if (isML)
   {
 #ifdef  DOCTOR_DEBUG
     printf("DEBUG: markup-languange directives found!\n");
 #endif  /* DOCTOR_DEBUG */
     removeHtmlComments(buf);
   }

   /*
    * Step 2: comment leaders at the beginning of a line, such as ^dnl,
    * ^xcomm, ^comment and //.
    */
   removeLineComments(buf);

   /*
    * Step 3: end-of-line clutter in postscript documents.
    */
   if (isPS)
   {
     cleanUpPostscript(buf);
 #ifdef  DOCTOR_DEBUG
     printf("DEBUG: postscript stuff detected!\n");
 #endif  /* DOCTOR_DEBUG */
   }

   /*
    * Step 4: groff/troff font-size indicators, the literal string
    * backslash-n and all backslashes, ala:
    *==>   perl -pe 's,\\s[+-][0-9]*,,g;s,\\s[0-9]*,,g;s/\\n//g;'
    */
   removeBackslashesAndGTroffIndicators(buf);

   /*
    * Step 5: white-space to real spaces and unnecessary punctuation, ala:
    *==>   tr -d '*=+#$|%.,:;!?()\\][\140\047\042' | tr '\011\012\015' '   '
    * Backspace characters are purposely NOT processed here.
    */
   convertWhitespaceToSpaceAndRemoveSpecialChars(buf, isCR);

   /*
    * Hyphenated words: compress both halves into a single word.
    * Regex == "[a-z]- [a-z]".  It is unclear how well this works after the
    * punctuation stripping above.
    */
   dehyphen(buf);

   /*
    * Step 6: miscellaneous punctuation, ala:
    *==>           perl -pe 's,[-_/]+ , ,g;s/print[_a-zA-Z]* //g;s/  / /g;'
    */
   removePunctuation(buf);

   /*
    * Calls to print routines: only what is being printed matters (programs
    * sometimes print licensing information).  Real words that END in
    * 'print', like footprint and fingerprint, are recognised by a preceding
    * 't' or 'r'; a routine named 'rprint' or 'tprint' would fool this.
    */
   ignoreFunctionCalls(buf);

   /*
    * Turn ' [X ]+' (X being the INVISIBLE character) into a single space
    * followed by INVISIBLE characters only.
    */
   convertSpaceToInvisible(buf);

   /*
    * Garbage collect: eliminate all INVISIBLE characters in the buffer.
    */
 #ifdef  DOCTOR_DEBUG
   int clipped = compressDoctoredBuffer(buf);

   printf("***** Now buffer %p contains %d bytes (%d clipped)\n", buf,
       (int)strlen(buf), clipped);
   printf("+++++ [Dr-AFTER] +++++:\n%s\n[==END==]\n", buf);
 #else
   (void) compressDoctoredBuffer(buf);
 #endif  /* DOCTOR_DEBUG */
 }


 #ifdef DOCTORBUFFER_OLD

 /**
  * Legacy single-function implementation of doctorBuffer(), built only when
  * DOCTORBUFFER_OLD is defined.
  *
  * Converts a buffer of multiple *stuff* to text-only, separated by spaces,
  * by running every clean-up pass inline and then squeezing the INVISIBLE
  * bytes out of the buffer itself.
  *
  * @param buf  NUL-terminated buffer, modified in place
  * @param isML nonzero to run the HTML/XML markup pass (step 1)
  * @param isPS nonzero to run the postscript pass (step 3)
  * @param isCR when nonzero, ')', ',', ':', ';' and '.' are left untouched in
  *             step 5
  */
 void doctorBuffer_old(char *buf, int isML, int isPS, int isCR)
 {
   printf("Doctor Buffer old \n");
   char *cp;
   char *x;
   int f;
   int g;
   int n;
   char *MODULE_LICENSE = "MODULE_LICENSE";

 #if     defined(PROC_TRACE) || defined(DOCTOR_DEBUG)
   traceFunc("== doctorBuffer(%p, %d, %d, %d)\n", buf, isML, isPS, isCR);
 #endif  /* PROC_TRACE || DOCTOR_DEBUG */

   /*
    * convert a buffer of multiple *stuff* to text-only, separated by spaces
    * We really only care about text "in a license" here, so strip out
    * comments and other unwanted punctuation.
    */
 #ifdef  DOCTOR_DEBUG
   printf("***** Processing %p (%d data bytes)\n", buf, (int)strlen(buf));
   printf("----- [Dr-BEFORE:] -----\n%s\n[==END==]\n", buf);
 #endif  /* DOCTOR_DEBUG */
   /*
    * step 1: take care of embedded HTML/XML and special HTML-chars like
    * &quot; and &nbsp; -- but DON'T remove the text in an HTML comment.
    * There might be licensing text/information in the comment!
    *****
    * Later on (in parseLicenses()) we search for URLs in the raw-text
    */
   if (isML) {
 #ifdef  DOCTOR_DEBUG
     printf("DEBUG: markup-languange directives found!\n");
 #endif  /* DOCTOR_DEBUG */
     f = 0;   /* inside a tag: '<' seen, waiting for '>' */
     g = 0;   /* inside an entity: '&' seen, waiting for ';' */
     for (cp = buf; cp && *cp; cp++) {
       if ((*cp == '<') &&
           (*(cp+1) != '<') &&
           (*(cp+1) != ' ')) {
 #if     (DEBUG>5) && defined(DOCTOR_DEBUG)
         int x = strncasecmp(cp, "<string", 7);
         printf("CHECK: %c%c%c%c%c%c%c == %d\n", *cp,
             *(cp+1), *(cp+2), *(cp+3), *(cp+4),
             *(cp+5), *(cp+6), x);
 #endif  /* DEBUG>5 && DOCTOR_DEBUG */
         if (strncasecmp(cp, "<string", 7)) {
           *cp = ' ';
           if (*(cp+1) != '-' || *(cp+2) != '-') {
             f = 1;
           }
         }
       } else if (*cp == '&') {
 #if     (DEBUG>5) && defined(DOCTOR_DEBUG)
         int x = strncasecmp(cp, "&copy;", 6);
         printf("CHECK: %c%c%c%c%c%c == %d\n", *cp,
             *(cp+1), *(cp+2), *(cp+3), *(cp+4),
             *(cp+5), x);
 #endif  /* DEBUG>5 && DOCTOR_DEBUG */
         if (strncasecmp(cp, "&copy;", 6)) {
           *cp = ' ';
           g = 1;
         }
       } else if (f && (*cp == '>')) {
         *cp = ' ';
         f = 0;
       } else if (g && (*cp == ';')) {
         *cp = ' ';
         g = 0;
       } else if (isEOL(*cp)) {
         g = 0;
       }
       /* Don't remove text in an HTML comment (e.g., turn the flag off) */
       else if ((*cp == '!') &&
           f &&
           (cp != buf) &&
           (*(cp-1) == ' ')) {
         *cp = ' ';
         f = 0;
       } else if (f || g) {
         /* text inside a tag or entity is deliberately left in place */
       } else if ((*cp == '<') || (*cp == '>')) {
         *cp = ' ';
       }
     }
   }
   /*
    * step 2: remove comments that start at the beginning of a line, * like
    * ^dnl, ^xcomm, ^comment, and //
    */
   cp = buf;
   while (idxGrep(_UTIL_BOL_MAGIC, cp, REG_ICASE|REG_NEWLINE|REG_EXTENDED)) {
 #ifdef  DOCTOR_DEBUG
     dumpMatch(cp, "Found \"comment\"-text");
 #endif  /* DOCTOR_DEBUG */
     cp += cur.regm.rm_so;
     switch (*cp) {
       case '>':
         *cp++ = ' ';
         break;
       case '@':       /* texi special processing */
         *cp++ = INVISIBLE;
         if (strncasecmp(cp, "author", 6) == 0) {
           (void) memset(cp, ' ', 6);
           cp += 6;
         } else if (strncasecmp(cp, "comment", 7) == 0) {
           (void) memset(cp, ' ', 7);
           cp += 7;
         } else if (strncasecmp(cp, "center", 6) == 0) {
           (void) memset(cp, ' ', 6);
           cp += 6;
         }
         else if (strncasecmp(cp, "rem", 3) == 0) {
           (void) memset(cp, ' ', 3);
           cp += 3;
         } else if (*cp == 'c') {
           *cp++ = INVISIBLE;
           if (strncasecmp(cp, " essay", 6) == 0) {
             (void) memset(cp, ' ', 6);
             cp += 6;
           }
         }
         break;
       case '/':       /* c++ style comment // */
         if(cp && cp[0])
         {
           /* strstr() looks through the whole rest of the buffer */
           if (strstr(cp, MODULE_LICENSE) && '/' == cp[0])
           {
             (void) memset(cp, INVISIBLE, strlen(cp));
             cp += strlen(cp);
           }
           else {
             (void) memset(cp, INVISIBLE, 2);
             cp += 2;
           }
         }
         break;
       case '\\':      /* "\par"-style directive */
         if (strncasecmp(cp+1, "par ", 3) == 0) {
           (void) memset(cp, ' ', 4);
         }
         cp += 4;
         break;
       case 'r':
       case 'R':       /* rem */
       case 'd':
       case 'D':       /* dnl */
         (void) memset(cp, INVISIBLE, 3);
         cp += 3;
         break;
       case 'x':
       case 'X':       /* xcomm */
         (void) memset(cp, INVISIBLE, 5);
         cp += 5;
         break;
       case 'c':
       case 'C':       /* comment */
         (void) memset(cp, INVISIBLE, 7);
         cp += 7;
         break;
       case '%':       /* %%copyright: */
         (void) memset(cp, INVISIBLE, 12);
         cp += 12;
         break;
     }
   }
   /*
    * Step 3 - strip out crap at end-of-line on postscript documents
    */
   if (isPS) {
 #ifdef  DOCTOR_DEBUG
     printf("DEBUG: postscript stuff detected!\n");
 #endif  /* DOCTOR_DEBUG */
     cp = buf;
     while (idxGrep(_UTIL_POSTSCR, cp, REG_EXTENDED|REG_NEWLINE)) {
 #ifdef  DOCTOR_DEBUG
       dumpMatch(cp, "FOUND postscript-thingy");
 #endif  /* DOCTOR_DEBUG */
       x = cp + cur.regm.rm_so;
       cp += cur.regm.rm_eo;
       while (x < cp) {
         *x++ = ' '/*INVISIBLE*/;
       }
     }
   }
   /*
    *      - step 4: remove groff/troff font-size indicators, the literal
    *              string backslash-n and all backslashes, ala:
    *==>   perl -pe 's,\\s[+-][0-9]*,,g;s,\\s[0-9]*,,g;s/\\n//g;' |
    */
   for (cp = buf; *cp; cp++) {
     if (*cp == '\\') {
       x = cp + 1;
       if (*x && (*x == 's')) {
         x++;
         if (*x && ((*x == '+') || (*x == '-'))) {
           x++;
         }
         while (*x && isdigit(*x)) {
           x++;
         }
       } else if (*x && *x == 'n') {
         x++;
       }
       memset(cp, /*INVISIBLE*/' ', (size_t) (x-cp));
     }
   }
   /*
    *      - step 5: convert white-space to real spaces, and remove
    *              unnecessary punctuation, ala:
    *==>   tr -d '*=+#$|%.,:;!?()\\][\140\047\042' | tr '\011\012\015' '   '
    *****
    * NOTE: we purposely do NOT process backspace-characters here.  Perhaps
    * there's an improvement in the wings for this?
    */
   for (cp = buf; /*cp < end &&*/ *cp; cp++) {
     if ((*cp == '\302') && (*(cp+1) == '\251')) {
       /*
        * Keep the two-byte sequence.  Advance by ONE here: the loop
        * increment supplies the second step.  Advancing by two would skip
        * the byte after the sequence and, when the sequence ends the
        * buffer, step over the terminator.
        */
       cp += 1;
       continue;
     }
     if (*cp & (char) 0x80) {
       *cp = INVISIBLE;
       continue;
     }
     switch (*cp) {
       /*
         Convert eol-characters AND some other miscellaneous
         characters into spaces (due to comment-styles, etc.)
        */
       case '\a': case '\t': case '\n': case '\r':
       case '\v': case '\f': case '[': case ']':
       case '{': case '}': case '*': case '=':
       case '#': case '$': case '|': case '%': case '!':
       case '?': case '`': case '"': case '\'':
         *cp = ' ';
         break;
         /* allow + only within the regex " [Mm]\+ " */
       case '+':
         if (*(cp+1) == 0 || *(cp+1) == ' ' || *(cp+1) == '\t' || *(cp+1) == '\n' || *(cp+1) == '\r') break;
         else if (cp > buf+1 && (*(cp-1) == 'M' ||
             *(cp-1) == 'm') && *(cp-2) == ' ' &&
             *(cp+1) == ' ') {
           /* no-op */
         }
         else {
           *cp = ' ';
         }
         break;
       case '(':
         if ((*(cp+1) == 'C' || *(cp+1) == 'c') &&
             *(cp+2) == ')') {
           cp += 2;
           continue;
         }
         else {
           *cp = ' ';
         }
         break;
       case ')': case ',': case ':': case ';':
         if (!isCR) {
           *cp = ' ';
         }
         break;
       case '.':
         if (!isCR) {
           *cp = INVISIBLE;
         }
         break;
       case '<':
         if (strncasecmp(cp, "<string", 7) == 0) {
           /* exactly seven bytes were matched, so exactly seven are blanked */
           (void) memcpy(cp, "       ", 7 * sizeof(char));
         }
         break;
         /* CDB - Big #ifdef 0 left out */
       case '\001': case '\002': case '\003': case '\004':
       case '\005': case '\006': case '\016': case '\017':
       case '\020': case '\021': case '\022': case '\023':
       case '\024': case '\025': case '\026': case '\027':
       case '\030': case '\031': case '\032': case '\033':
       case '\034': case '\035': case '\036': case '\037':
       case '~':
         *cp = INVISIBLE;
         break;
 #ifdef  DOCTOR_DEBUG
       case ' ': case '/': case '-': case '@': case '&':
       case '>': case '^': case '_':
       case INVISIBLE:
         break;
       default:
         if (!isalpha(*cp) && !isdigit(*cp)) {
           printf("DEBUG: \\0%o @ %ld\n",
               *cp & 0xff, cp-buf);
         }
         break;
 #endif  /* DOCTOR_DEBUG */
     }
   }
   /*
    * Look for hyphenations of words, to compress both halves into a sin-
    * gle (sic) word.  Regex == "[a-z]- [a-z]".
    *****
    * NOTE: not sure this will work based on the way we strip punctuation
    * out of the buffer above -- work on this later.
    */
   for (cp = buf; idxGrep(_UTIL_HYPHEN, cp, REG_ICASE); /*nada*/) {
 #ifdef  DOCTOR_DEBUG
     x = cp + cur.regm.rm_so;
     while ((x > cp) && !isspace(*x)) {
       x--;
     }
     printf("Hey! hyphenated-word [");
     for (++x; x <= (cp + cur.regm.rm_eo); x++) {
       printf("%c", *x);
     }
     while (!isspace(*x)) {
       printf("%c", *x++);
     }
     printf("]\n");

 #endif  /* DOCTOR_DEBUG */
     cp += cur.regm.rm_so + 1;
     *cp++ = INVISIBLE;
     while (isspace(*cp)) {
       *cp++ = INVISIBLE;
     }
   }
   /*
    *      - step 6: clean up miscellaneous punctuation, ala:
    *==>           perl -pe 's,[-_/]+ , ,g;s/print[_a-zA-Z]* //g;s/  / /g;'
    */
   for (cp = buf; idxGrep(_UTIL_MISCPUNCT, cp, REG_EXTENDED); /*nada*/) {
     x = cp + cur.regm.rm_so;
     cp += cur.regm.rm_eo - 1;  /* leave ' ' alone */
     while (x < cp) {
       *x++ = ' ';
     }
     cp++;
   }
   for (cp = buf; idxGrep(_UTIL_LATEX, cp, REG_ICASE); /*nada*/) {
     x = cp + cur.regm.rm_so;
     cp += cur.regm.rm_eo;
     while (x <= cp) {
       *x++ = ' ';
     }
     cp++;
   }
   /*
    * Ignore function calls to print routines: only concentrate on what's being
    * printed (sometimes programs do print licensing information) -- but don't
    * ignore real words that END in 'print', like footprint and fingerprint.
    * Here, we take a risk and just look for a 't' (in "footprint"), or for an
    * 'r' (in "fingerprint").  If someone has ever coded a print routine that
    * is named 'rprint' or tprint', we're spoofed.
    */
   for (cp = buf; idxGrep(_UTIL_PRINT, cp, REG_ICASE); /*nada*/) {
     x = cp + cur.regm.rm_so;
     cp += (cur.regm.rm_eo - 1);
     if ((x > buf) && ((*(x-1) == 'r') || (*(x-1) == 't'))) {
       continue;
     }
     while (x < cp) {
       *x++ = ' ';
     }
     cp++;
   }
   /*
    * Convert the regex ' [X ]+' (where X is really the character #defined as
    * INVISIBLE) to a single space (and a string of INVISIBLE characters).
    */
   for (cp = buf; *cp; /*nada*/) {
     if (*cp++ == ' ') {
       while (*cp) {
         if (*cp == ' ') {
           *cp++ = INVISIBLE;
         } else if (*cp == INVISIBLE) {
           cp++;
         } else {
           break;
         }
       }
     }
   }
   /*
    * garbage collect: eliminate all INVISIBLE characters in the buffer
    * (x is the write position, cp the read position, n the bytes dropped)
    */
   x = cp = buf;
   n = 0;
   while (/*cp < end &&*/ *cp) {
     while (/*cp < end &&*/ *cp == INVISIBLE) {
       n++;
       cp++;
     }
     if (*cp) {
       *x++ = *cp++;
     }
   }
   *x = NULL_CHAR;
 #ifdef  DOCTOR_DEBUG
   printf("***** Now buffer %p contains %d bytes (%d clipped)\n", buf,
       (int)strlen(buf), n);
   printf("+++++ [Dr-AFTER] +++++:\n%s\n[==END==]\n", buf);
 #endif  /* DOCTOR_DEBUG */
   return;
 }

 #endif


dehyphen
void dehyphen(char *buf)
Definition: doctorBuffer_utils.c:426

removeBackslashesAndGTroffIndicators
void removeBackslashesAndGTroffIndicators(char *buf)
Remove groff/troff font-size indicators, the literal string backslash-n and all backslashes,...
Definition: doctorBuffer_utils.c:246

doctorBuffer
void doctorBuffer(char *buf, int isML, int isPS, int isCR)
Convert a buffer of multiple stuff to text-only, separated by spaces.
Definition: doctorBuffer_utils.c:575

compressDoctoredBuffer
int compressDoctoredBuffer(char *textBuffer)
garbage collect: eliminate all INVISIBLE characters in the buffer
Definition: doctorBuffer_utils.c:25

convertSpaceToInvisible
void convertSpaceToInvisible(char *buf)
Definition: doctorBuffer_utils.c:530

removePunctuation
void removePunctuation(char *buf)
Clean up miscellaneous punctuation.
Definition: doctorBuffer_utils.c:467

removeLineComments
void removeLineComments(char *buf)
Remove comments that start at the beginning of a line.
Definition: doctorBuffer_utils.c:120

cleanUpPostscript
void cleanUpPostscript(char *buf)
Remove newlines from buffer.
Definition: doctorBuffer_utils.c:220

convertWhitespaceToSpaceAndRemoveSpecialChars
void convertWhitespaceToSpaceAndRemoveSpecialChars(char *buf, int isCR)
Convert white-space to real spaces, and remove unnecessary punctuation.
Definition: doctorBuffer_utils.c:285

ignoreFunctionCalls
void ignoreFunctionCalls(char *buf)
Ignore function calls to print routines.
Definition: doctorBuffer_utils.c:505

removeHtmlComments
void removeHtmlComments(char *buf)
Remove HTML comments from buffer without removing comment text.
Definition: doctorBuffer_utils.c:43

nomos.h
Nomos header file.

isEOL
#define isEOL(x)
Check if x points to a EOL character.
Definition: nomos.h:240

NULL_CHAR
#define NULL_CHAR
NULL character.
Definition: nomos.h:234

collapseInvisible
GArray * collapseInvisible(char *text, char invisible)
Definition: nomos_gap.c:19

idxGrep
int idxGrep(int index, char *data, int flags)
compile a regex, and perform the search (on data?)
Definition: nomos_regex.c:205