8 #include "doctorBuffer_utils.h"
9 #define INVISIBLE (int) '\377'
14 #include "nomos_regex.h"
27 int previous=strlen(textBuffer);
29 if(cur.docBufferPositionsAndOffsets)
30 g_array_free(cur.docBufferPositionsAndOffsets, TRUE);
34 int after = strlen(textBuffer);
36 return previous - after;
50 for (cp = buf; cp && *cp; cp++)
52 if ((*cp ==
'<') && (*(cp + 1) !=
'<') && (*(cp + 1) !=
' '))
54 #if (DEBUG>5) && defined(DOCTOR_DEBUG)
55 int x = strncasecmp(cp,
"<string", 7);
56 printf(
"CHECK: %c%c%c%c%c%c%c == %d\n", *cp,
57 *(cp+1), *(cp+2), *(cp+3), *(cp+4),
60 if (strncasecmp(cp,
"<string", 7))
63 if (*(cp + 1) !=
'-' || *(cp + 2) !=
'-')
71 #if (DEBUG>5) && defined(DOCTOR_DEBUG)
72 int x = strncasecmp(cp,
"©", 6);
73 printf(
"CHECK: %c%c%c%c%c%c == %d\n", *cp,
74 *(cp+1), *(cp+2), *(cp+3), *(cp+4),
77 if (strncasecmp(cp,
"©", 6))
83 else if (f && (*cp ==
'>'))
88 else if (g && (*cp ==
';'))
98 else if ((*cp ==
'!') && f && (cp != buf) && (*(cp - 1) ==
' '))
107 else if ((*cp ==
'<') || (*cp ==
'>'))
123 char* MODULE_LICENSE =
"MODULE_LICENSE";
125 while (
idxGrep(_UTIL_BOL_MAGIC, cp, REG_ICASE | REG_NEWLINE | REG_EXTENDED))
128 dumpMatch(cp,
"Found \"comment\"-text");
130 cp += cur.regm.rm_so;
138 if (strncasecmp(cp,
"author", 6) == 0)
140 (void) memset(cp,
' ', 6);
143 else if (strncasecmp(cp,
"comment", 7) == 0)
145 (void) memset(cp,
' ', 7);
148 else if (strncasecmp(cp,
"center", 6) == 0)
150 (void) memset(cp,
' ', 6);
153 else if (strncasecmp(cp,
"rem", 3) == 0)
155 (void) memset(cp,
' ', 3);
161 if (strncasecmp(cp,
" essay", 6) == 0)
163 (void) memset(cp,
' ', 6);
172 if (strstr(cp, MODULE_LICENSE) &&
'/' == cp[0])
174 (void) memset(cp, INVISIBLE, strlen(cp));
179 (void) memset(cp, INVISIBLE, 2);
185 if (strncasecmp(cp + 1,
"par ", 3) == 0)
187 (void) memset(cp,
' ', 4);
195 (void) memset(cp, INVISIBLE, 3);
200 (void) memset(cp, INVISIBLE, 5);
205 (void) memset(cp, INVISIBLE, 7);
209 (void) memset(cp, INVISIBLE, 12);
225 while (
idxGrep(_UTIL_POSTSCR, cp, REG_EXTENDED | REG_NEWLINE))
228 dumpMatch(cp,
"FOUND postscript-thingy");
230 x = cp + cur.regm.rm_so;
231 cp += cur.regm.rm_eo;
250 for (cp = buf; *cp; cp++)
255 if (*x && (*x ==
's'))
258 if (*x && ((*x ==
'+') || (*x ==
'-')))
262 while (*x && isdigit(*x))
267 else if (*x && *x ==
'n')
271 memset(cp,
' ', (
size_t) (x - cp));
288 for (cp = buf; *cp; cp++)
290 if ((*cp ==
'\302') && (*(cp + 1) ==
'\251'))
295 if (*cp & (
char) 0x80)
331 if (*(cp + 1) == 0 || *(cp + 1) ==
' ' || *(cp + 1) ==
'\t' || *(cp + 1) ==
'\n' || *(cp + 1) ==
'\r')
333 else if (cp > buf + 1 && (*(cp - 1) ==
'M' || *(cp - 1) ==
'm') && *(cp - 2) ==
' ' && *(cp + 1) ==
' ')
343 if ((*(cp + 1) ==
'C' || *(cp + 1) ==
'c') && *(cp + 2) ==
')')
369 if (strncasecmp(cp,
"<string", 7) == 0)
371 (void) memcpy(cp,
" ", 7 *
sizeof(
char));
403 case ' ':
case '/':
case '-':
case '@':
case '&':
404 case '>':
case '^':
case '_':
408 if (!isalpha(*cp) && !isdigit(*cp))
410 printf(
"DEBUG: \\0%o @ %ld\n",
430 for (cp = buf;
idxGrep(_UTIL_HYPHEN, cp, REG_ICASE); )
434 x = cp + cur.regm.rm_so;
435 while ((x > cp) && !isspace(*x))
439 printf(
"Hey! hyphenated-word [");
440 for (++x; x <= (cp + cur.regm.rm_eo); x++)
451 cp += cur.regm.rm_so + 1;
471 for (cp = buf;
idxGrep(_UTIL_MISCPUNCT, cp, REG_EXTENDED); )
473 x = cp + cur.regm.rm_so;
474 cp += cur.regm.rm_eo - 1;
481 for (cp = buf;
idxGrep(_UTIL_LATEX, cp, REG_ICASE); )
483 x = cp + cur.regm.rm_so;
484 cp += cur.regm.rm_eo;
509 for (cp = buf;
idxGrep(_UTIL_PRINT, cp, REG_ICASE); )
511 x = cp + cur.regm.rm_so;
512 cp += (cur.regm.rm_eo - 1);
513 if ((x > buf) && ((*(x - 1) ==
'r') || (*(x - 1) ==
't')))
533 for (cp = buf; *cp; )
543 else if (*cp == INVISIBLE)
582 #if defined(PROC_TRACE) || defined(DOCTOR_DEBUG)
583 traceFunc(
"== doctorBuffer(%p, %d, %d, %d)\n", buf, isML, isPS, isCR);
592 printf(
"***** Processing %p (%d data bytes)\n", buf, (
int)strlen(buf));
593 printf(
"----- [Dr-BEFORE:] -----\n%s\n[==END==]\n", buf);
605 printf(
"DEBUG: markup-languange directives found!\n");
622 printf(
"DEBUG: postscript stuff detected!\n");
678 printf(
"***** Now buffer %p contains %d bytes (%d clipped)\n", buf,
679 (
int)strlen(buf), n);
680 printf(
"+++++ [Dr-AFTER] +++++:\n%s\n[==END==]\n", buf);
685 #ifdef DOCTORBUFFER_OLD
686 void doctorBuffer_old(
char *buf,
int isML,
int isPS,
int isCR)
688 printf(
"Doctor Buffer old \n");
694 char *MODULE_LICENSE =
"MODULE_LICENSE";
696 #if defined(PROC_TRACE) || defined(DOCTOR_DEBUG)
697 traceFunc(
"== doctorBuffer(%p, %d, %d, %d)\n", buf, isML, isPS, isCR);
706 printf(
"***** Processing %p (%d data bytes)\n", buf, (
int)strlen(buf));
707 printf(
"----- [Dr-BEFORE:] -----\n%s\n[==END==]\n", buf);
718 printf(
"DEBUG: markup-languange directives found!\n");
722 for (cp = buf; cp && *cp; cp++) {
726 #if (DEBUG>5) && defined(DOCTOR_DEBUG)
727 int x = strncasecmp(cp,
"<string", 7);
728 printf(
"CHECK: %c%c%c%c%c%c%c == %d\n", *cp,
729 *(cp+1), *(cp+2), *(cp+3), *(cp+4),
730 *(cp+5), *(cp+6), x);
732 if (strncasecmp(cp,
"<string", 7)) {
734 if (*(cp+1) !=
'-' || *(cp+2) !=
'-') {
738 }
else if (*cp ==
'&') {
739 #if (DEBUG>5) && defined(DOCTOR_DEBUG)
740 int x = strncasecmp(cp,
"©", 6);
741 printf(
"CHECK: %c%c%c%c%c%c == %d\n", *cp,
742 *(cp+1), *(cp+2), *(cp+3), *(cp+4),
745 if (strncasecmp(cp,
"©", 6)) {
749 }
else if (f && (*cp ==
'>')) {
752 }
else if (g && (*cp ==
';')) {
755 }
else if (
isEOL(*cp)) {
759 else if ((*cp ==
'!') &&
767 }
else if ((*cp ==
'<') || (*cp ==
'>')) {
777 while (
idxGrep(_UTIL_BOL_MAGIC, cp, REG_ICASE|REG_NEWLINE|REG_EXTENDED)) {
779 dumpMatch(cp,
"Found \"comment\"-text");
781 cp += cur.regm.rm_so;
788 if (strncasecmp(cp,
"author", 6) == 0) {
789 (void) memset(cp,
' ', 6);
791 }
else if (strncasecmp(cp,
"comment", 7) == 0) {
792 (void) memset(cp,
' ', 7);
794 }
else if (strncasecmp(cp,
"center", 6) == 0) {
795 (void) memset(cp,
' ', 6);
798 else if (strncasecmp(cp,
"rem", 3) == 0) {
799 (void) memset(cp,
' ', 3);
801 }
else if (*cp ==
'c') {
803 if (strncasecmp(cp,
" essay", 6) == 0) {
804 (void) memset(cp,
' ', 6);
813 if (strstr(cp, MODULE_LICENSE) &&
'/' == cp[0])
815 (void) memset(cp, INVISIBLE, strlen(cp));
819 (void) memset(cp, INVISIBLE, 2);
825 if (strncasecmp(cp+1,
"par ", 3) == 0) {
826 (void) memset(cp,
' ', 4);
834 (void) memset(cp, INVISIBLE, 3);
839 (void) memset(cp, INVISIBLE, 5);
844 (void) memset(cp, INVISIBLE, 7);
848 (void) memset(cp, INVISIBLE, 12);
858 printf(
"DEBUG: postscript stuff detected!\n");
861 while (
idxGrep(_UTIL_POSTSCR, cp, REG_EXTENDED|REG_NEWLINE)) {
863 dumpMatch(cp,
"FOUND postscript-thingy");
865 x = cp + cur.regm.rm_so;
866 cp += cur.regm.rm_eo;
877 for (cp = buf; *cp; cp++) {
880 if (*x && (*x ==
's')) {
882 if (*x && ((*x ==
'+') || (*x ==
'-'))) {
885 while (*x && isdigit(*x)) {
888 }
else if (*x && *x ==
'n') {
891 memset(cp,
' ', (
size_t) (x-cp));
902 for (cp = buf; *cp; cp++) {
903 if ((*cp ==
'\302') && (*(cp+1) ==
'\251')) {
907 if (*cp & (
char) 0x80) {
916 case '\a':
case '\t':
case '\n':
case '\r':
917 case '\v':
case '\f':
case '[':
case ']':
918 case '{':
case '}':
case '*':
case '=':
919 case '#':
case '$':
case '|':
case '%':
case '!':
920 case '?':
case '`':
case '"':
case '\'':
925 if (*(cp+1) == 0 || *(cp+1) ==
' ' || *(cp+1) ==
'\t' || *(cp+1) ==
'\n' || *(cp+1) ==
'\r')
break;
926 else if (cp > buf+1 && (*(cp-1) ==
'M' ||
927 *(cp-1) ==
'm') && *(cp-2) ==
' ' &&
936 if ((*(cp+1) ==
'C' || *(cp+1) ==
'c') &&
945 case ')':
case ',':
case ':':
case ';':
956 if (strncasecmp(cp,
"<string", 7) == 0) {
957 (void) strncpy(cp,
" ", 7);
961 case '\001':
case '\002':
case '\003':
case '\004':
962 case '\005':
case '\006':
case '\016':
case '\017':
963 case '\020':
case '\021':
case '\022':
case '\023':
964 case '\024':
case '\025':
case '\026':
case '\027':
965 case '\030':
case '\031':
case '\032':
case '\033':
966 case '\034':
case '\035':
case '\036':
case '\037':
971 case ' ':
case '/':
case '-':
case '@':
case '&':
972 case '>':
case '^':
case '_':
976 if (!isalpha(*cp) && !isdigit(*cp)) {
977 printf(
"DEBUG: \\0%o @ %ld\n",
991 for (cp = buf;
idxGrep(_UTIL_HYPHEN, cp, REG_ICASE); ) {
993 x = cp + cur.regm.rm_so;
994 while ((x > cp) && !isspace(*x)) {
997 printf(
"Hey! hyphenated-word [");
998 for (++x; x <= (cp + cur.regm.rm_eo); x++) {
1001 while (!isspace(*x)) {
1007 cp += cur.regm.rm_so + 1;
1009 while (isspace(*cp)) {
1017 for (cp = buf;
idxGrep(_UTIL_MISCPUNCT, cp, REG_EXTENDED); ) {
1018 x = cp + cur.regm.rm_so;
1019 cp += cur.regm.rm_eo - 1;
1025 for (cp = buf;
idxGrep(_UTIL_LATEX, cp, REG_ICASE); ) {
1026 x = cp + cur.regm.rm_so;
1027 cp += cur.regm.rm_eo;
1041 for (cp = buf;
idxGrep(_UTIL_PRINT, cp, REG_ICASE); ) {
1042 x = cp + cur.regm.rm_so;
1043 cp += (cur.regm.rm_eo - 1);
1044 if ((x > buf) && ((*(x-1) ==
'r') || (*(x-1) ==
't'))) {
1056 for (cp = buf; *cp; ) {
1061 }
else if (*cp == INVISIBLE) {
1075 while ( *cp == INVISIBLE) {
1085 printf(
"***** Now buffer %p contains %d bytes (%d clipped)\n", buf,
1086 (
int)strlen(buf), n);
1087 printf(
"+++++ [Dr-AFTER] +++++:\n%s\n[==END==]\n", buf);
void removeBackslashesAndGTroffIndicators(char *buf)
Remove groff/troff font-size indicators, the literal string backslash-n and all backslashes,...
void doctorBuffer(char *buf, int isML, int isPS, int isCR)
Convert a buffer of multiple stuff to text-only, separated by spaces.
int compressDoctoredBuffer(char *textBuffer)
garbage collect: eliminate all INVISIBLE characters in the buffer
void convertSpaceToInvisible(char *buf)
void removePunctuation(char *buf)
Clean up miscellaneous punctuation.
void removeLineComments(char *buf)
Remove comments that start at the beginning of a line.
void cleanUpPostscript(char *buf)
Remove newlines from buffer.
void convertWhitespaceToSpaceAndRemoveSpecialChars(char *buf, int isCR)
Convert white-space to real spaces, and remove unnecessary punctuation.
void ignoreFunctionCalls(char *buf)
Ignore function calls to print routines.
void removeHtmlComments(char *buf)
Remove HTML comments from buffer without removing comment text.
#define isEOL(x)
Check if x points to an EOL character.
#define NULL_CHAR
NULL character.
GArray * collapseInvisible(char *text, char invisible)
int idxGrep(int index, char *data, int flags)
compile a regex, and perform the search (on data?)