#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "string_operations.h"

#define MAX_TOKENS_ARRAY_SIZE 4194304
#define MAX_DELIMIT_LEN 255

/* Min of two. */
#ifndef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#endif
/* Return 1 if 'a' is NUL or one of the single-character delimiters, 0 otherwise. */
unsigned splittingDelim(char a, const char* delimiters) {
  if (a == '\0')
    return 1;
  for (const char* ptr = delimiters; *ptr != '\0'; ptr++) {
    if (*ptr == a)
      return 1;
  }
  return 0;
}
/* Detect a multi-character delimiter at 'z' and return its length in bytes,
 * or 0 if none starts here. The caller guarantees at least two readable bytes. */
unsigned specialDelim(const char* z) {
  char a = z[0];
  char b = z[1];
  char c = b ? z[2] : '\0';

  if (a == ':' && b == ':') {                                /* "::" scope operator */
    return 2;
  }
  else if ((a == b && b == c) && (a == '"' || a == '\'')) {  /* tripled quote */
    return 3;
  }
  else if (a == 'd' && b == 'n' && c == 'l') {               /* m4 "dnl" keyword */
    return 3;
  }

  return 0;
}
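/*
 * Example behavior (sketch, based only on the cases handled above):
 *
 *   specialDelim("::type")  == 2   -- scope operator
 *   specialDelim("'''doc")  == 3   -- tripled quote
 *   specialDelim("dnl x")   == 3   -- m4 comment keyword
 *   specialDelim("plain")   == 0   -- no multi-character delimiter here
 */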
/* Reset a carry-over token to the empty state. */
static inline void initStateToken(Token* stateToken) {
  stateToken->hashedContent = hash_init();
  stateToken->length = 0;
  stateToken->removedBefore = 0;
}
/* A token is ignored if its content is "REM" ("rem" in case-insensitive builds). */
static int isIgnoredToken(Token* token) {
  Token remToken;

#ifndef MONK_CASE_INSENSITIVE
  remToken.hashedContent = hash("REM");
#else
  remToken.hashedContent = hash("rem");
#endif
  remToken.length = 3;
  remToken.removedBefore = 0;

  return tokenEquals(token, &remToken);
}
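/*
 * Effect in context (sketch): in the default, case-sensitive build the token
 * "REM" is dropped rather than emitted, so tokenize("REM foo", " ") yields a
 * single token for "foo" whose removedBefore covers the skipped "REM " prefix.
 */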
/* Tokenize one chunk of a stream. Passing inputChunk == NULL flushes the token
 * still held in *remainder. Returns the number of tokens added, or -1 on error. */
int streamTokenize(const char* inputChunk, size_t inputSize, const char* delimiters,
                   GArray** output, Token** remainder) {
  GArray* tokens = *output;
  Token* stateToken;

  unsigned int initialTokenCount = tokens->len;

  if (!inputChunk) {
    /* end of stream: emit the pending token (unless empty or ignored) and clean up */
    if ((stateToken = *remainder)) {
      if ((stateToken->length > 0) && !isIgnoredToken(stateToken)) {
        g_array_append_val(tokens, *stateToken);
      }
      free(stateToken);
      *remainder = NULL;
    }
    return 0;
  }

  if (!*remainder) {
    /* first chunk: allocate and initialize the carry-over token state */
    stateToken = malloc(sizeof(Token));
    *remainder = stateToken;
    initStateToken(stateToken);
  } else {
    stateToken = *remainder;
  }
  if (tokens->len >= MAX_TOKENS_ARRAY_SIZE) {
    printf("WARNING: stream has more tokens than maximum allowed\n");
    return -1;
  }
  const char* ptr = inputChunk;

  size_t readBytes = 0;
  while (readBytes < inputSize) {
    /* try multi-character delimiters first, then the single-character set */
    unsigned delimLen = 0;
    if (inputSize - readBytes >= 2) {
      delimLen = specialDelim(ptr);
    }
    if (delimLen == 0) {
      delimLen = splittingDelim(*ptr, delimiters);
    }

    if (delimLen > 0) {
      /* a delimiter closes the current token; ignored tokens are folded into removedBefore */
      if (stateToken->length > 0) {
        if (isIgnoredToken(stateToken)) {
          stateToken->removedBefore += stateToken->length;
          stateToken->length = 0;
          stateToken->hashedContent = hash_init();
        } else {
          g_array_append_val(tokens, *stateToken);
          initStateToken(stateToken);
        }
      }

      stateToken->removedBefore += delimLen;

      ptr += delimLen;
      readBytes += delimLen;
    } else {
      /* ordinary character: fold it into the running hash of the current token */
#ifndef MONK_CASE_INSENSITIVE
      const char* newCharPtr = ptr;
#else
      char newChar = g_ascii_tolower(*ptr);
      const char* newCharPtr = &newChar;
#endif
      hash_add(newCharPtr, &(stateToken->hashedContent));
      stateToken->length++;

      ptr++;
      readBytes++;
    }
  }

  return tokens->len - initialTokenCount;
}
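/*
 * Usage sketch for the streaming interface (identifiers as in this file; the
 * chunk boundaries and the "()" delimiter set are arbitrary choices):
 *
 *   GArray* out = tokens_new();
 *   Token* rem = NULL;
 *   streamTokenize("foo(ba", 6, "()", &out, &rem);
 *   streamTokenize("r)baz", 5, "()", &out, &rem);  // "bar" spans both chunks
 *   streamTokenize(NULL, 0, NULL, &out, &rem);     // flush: emits "baz", frees rem
 *   // out now holds the tokens for "foo", "bar" and "baz"
 */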
/* Tokenize a whole NUL-terminated string by streaming it through streamTokenize. */
GArray* tokenize(const char* inputString, const char* delimiters) {
  GArray* tokenArray = tokens_new();

  Token* remainder = NULL;

  size_t inputLength = strlen(inputString);

#define CHUNKS 4096 /* streaming chunk size; exact value assumed */
  size_t chunksCount = inputLength / CHUNKS;
  for (size_t i = 0; i < chunksCount; i++) {
    int addedTokens = streamTokenize(inputString + i * CHUNKS, CHUNKS, delimiters, &tokenArray, &remainder);
    if (addedTokens < 0) {
      printf("WARNING: can not complete tokenizing of '%.30s...'\n", inputString);
      break;
    }
  }
  /* tokenize the final partial chunk, then flush the remainder */
  streamTokenize(inputString + chunksCount * CHUNKS,
                 MIN(CHUNKS, inputLength - chunksCount * CHUNKS),
                 delimiters, &tokenArray, &remainder);
  streamTokenize(NULL, 0, NULL, &tokenArray, &remainder);

  return tokenArray;
}
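/*
 * Usage sketch: tokenize a short string and release the result. g_array_free
 * with TRUE (a GLib call) also frees the Token storage held by the array.
 *
 *   GArray* tokens = tokenize("a bb ccc", " ");
 *   // tokens->len == 3; tokens_index(tokens, 1) refers to the token for "bb"
 *   g_array_free(tokens, TRUE);
 */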
/* Return 1 if both token arrays have the same length and pairwise-equal tokens. */
int tokensEquals(const GArray* a, const GArray* b) {
  if (b->len != a->len)
    return 0;

  for (size_t i = 0; i < a->len; i++) {
    Token* aToken = tokens_index(a, i);
    Token* bToken = tokens_index(b, i);

    if (!tokenEquals(aToken, bToken))
      return 0;
  }

  return 1;
}
/* Return the byte offset in the original input at which token 'index' starts
 * (for index == tokens->len, the offset just past the last token). */
size_t token_position_of(size_t index, const GArray* tokens) {
  size_t result = 0;
  size_t previousLength = 0;

  size_t limit = MIN(index + 1, tokens->len);

  for (size_t i = 0; i < limit; i++) {
    Token* token = tokens_index(tokens, i);
    result += token->removedBefore + previousLength;
    previousLength = token_length(*token);
  }

  if (index == tokens->len) {
    result += previousLength;
  }

  if (index > tokens->len) {
    result += previousLength;
    printf("WARNING: requested calculation of token index after the END token\n");
  }

  return result;
}
inline char* normalize_escape_string(char* input)
{
  char ret[MAX_DELIMIT_LEN];

  /* ... (normalization of escape sequences from 'input' into 'ret' elided) ... */

  return g_strdup(ret);
}