FOSSology  4.4.0
Open Source License Compliance by Open Source Software
file_operations.c
1 /*
2  Author: Daniele Fognini, Andreas Wuerl
3  SPDX-FileCopyrightText: © 2013-2014 Siemens AG
4 
5  SPDX-License-Identifier: GPL-2.0-only
6 */
7 
8 #include "file_operations.h"
9 #include <sys/stat.h>
10 #include <unistd.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 
14 #include <fcntl.h>
15 
16 #include "hash.h"
17 #include "string_operations.h"
18 #include "encoding.h"
19 
20 #define BUFFSIZE 4096
21 
22 int readTokensFromFile(const char* fileName, GArray** tokens, const char* delimiters)
23 {
24  int fd = open(fileName, O_RDONLY);
25  if (fd < 0)
26  {
27  printf("FATAL: can not open %s\n", fileName);
28  return 0;
29  }
30 
31  *tokens = tokens_new();
32 
33  int needConverter = 1;
34  iconv_t converter = NULL;
35 
36  Token* remainder = NULL;
37 
38  char buffer[BUFFSIZE];
39  char convertedBuffer[BUFFSIZE];
40 
41  ssize_t n;
42  size_t leftFromLast = 0;
43  while ((n = read(fd, buffer + leftFromLast, sizeof(buffer) - leftFromLast)) > 0)
44  {
45  size_t len = (size_t) n + leftFromLast;
46  char* chunk = buffer;
47  leftFromLast = 0;
48 
49  if (needConverter)
50  {
51  needConverter = 0;
52  converter = guessConverter(buffer, len);
53  }
54 
55  if (converter)
56  {
57  char* input = buffer;
58  size_t inputLeft = len;
59 
60  char* output = convertedBuffer;
61  size_t outputLength = sizeof(convertedBuffer);
62  iconv(converter, &input, &inputLeft, &output, &outputLength);
63 
64  if (outputLength != sizeof(convertedBuffer)) {
65  chunk = convertedBuffer;
66  len = sizeof(convertedBuffer) - outputLength;
67 
68  leftFromLast = inputLeft;
69  for (size_t i = 0; i < leftFromLast; i++)
70  {
71  buffer[i] = *input++;
72  }
73  } else {
74  // the raw buffer is full and we could not write to the converted buffer
75  printf("WARNING: cannot re-encode '%s', going binary from now on\n", fileName);
76  iconv_close(converter);
77  converter = NULL;
78  }
79  }
80 
81  /* N.B. this tokenizes inside the re-encoded buffer:
82  * the offsets found are byte positions in the UTF-8 stream, not file positions
83  **/
84  int addedTokens = streamTokenize(chunk, len, delimiters, tokens, &remainder);
85  if (addedTokens < 0)
86  {
87  printf("WARNING: can not complete tokenizing of '%s'\n", fileName);
88  break;
89  }
90  }
91 
92  streamTokenize(buffer, leftFromLast, delimiters, tokens, &remainder);
93  streamTokenize(NULL, 0, NULL, tokens, &remainder);
94 
95  close(fd);
96 
97  if (converter)
98  {
99  iconv_close(converter);
100  }
101 
102  return 1;
103 }
char buffer[2048]
The last thing received from the scheduler.