root/src/noit_tokenizer.re

Revision 88a71780101cbf23034aa0cb840f9f0368fda2dd, 5.7 kB (checked in by Theo Schlossnagle <jesus@omniti.com>, 6 years ago)

fixes #126

  • Property mode set to 100644
Line 
1 /*
2  * Copyright (c) 2007, OmniTI Computer Consulting, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are
7  * met:
8  *
9  *     * Redistributions of source code must retain the above copyright
10  *       notice, this list of conditions and the following disclaimer.
11  *     * Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *     * Neither the name OmniTI Computer Consulting, Inc. nor the names
16  *       of its contributors may be used to endorse or promote products
17  *       derived from this software without specific prior written
18  *       permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32
33 #include "noit_defines.h"
34 #include <stdlib.h>
35 #include <string.h>
36
/*
 * Scanner state plus the result of the most recent token_scan() call.
 */
struct token {
  /* Heap-allocated copy of the token text for NT_IDENT and NT_DQSTRING
   * tokens; set to NULL for NT_SPACE and NT_EOF. */
  char *token;
  /* Both are reset to `next` at the start of every scan.  For quoted tokens
   * `start` is moved past the opening quote and `end` is pulled back onto
   * the closing quote, so [start, end) is the token's source text. */
  const char *start;
  const char *end;
  /* Scan cursor: where the next token_scan() call will begin reading. */
  const char *next;
  /* Classification of the token produced by the last scan. */
  enum { NT_IDENT, NT_DQSTRING, NT_SPACE, NT_UNKNOWN, NT_EOF } type;
};

/*
 * Point the scanner `t` at input `a`.  The expansion is fully parenthesized
 * so the macro behaves as a single expression wherever it is used.
 */
#define SET_TOKEN(t,a) ((t)->next = (a))
45
/*
 * Collapse C-style backslash escapes in the NUL-terminated string `p`,
 * rewriting it in place (the result is never longer than the input).
 *
 * only: when NULL, every recognised escape is translated.  When non-NULL,
 *       a backslash is treated as an escape only if the character after it
 *       equals *only; every other backslash is copied through untouched.
 *
 * A backslash followed by an unrecognised character is kept verbatim, as is
 * a lone backslash at the very end of the string.
 */
static void c_unescape(char *p, char *only) {
  const char *rd = p;   /* read cursor */
  char *wr = p;         /* write cursor; never gets ahead of rd */

  while(*rd != '\0') {
    char repl;
    int is_escape = (rd[0] == '\\' && rd[1] != '\0');

    /* Restricted mode: honour only the one escape the caller asked for. */
    if(is_escape && only != NULL && rd[1] != *only) is_escape = 0;

    if(!is_escape) {
      *wr++ = *rd++;
      continue;
    }

    switch(rd[1]) {
      case 'n': repl = '\n'; break;
      case 'r': repl = '\r'; break;
      case 't': repl = '\t'; break;
      case 'a': repl = '\a'; break;
      case 'b': repl = '\b'; break;
      case 'v': repl = '\v'; break;
      case 'f': repl = '\f'; break;
      case '0': repl = '\0'; break;
      case ' ':
      case '"':
      case '\\':
        /* These escapes stand for the escaped character itself. */
        repl = rd[1];
        break;
      default:
        /* Unknown escape: emit the backslash too, leaving the pair as is. */
        repl = rd[1];
        *wr++ = '\\';
        break;
    }
    *wr++ = repl;
    rd += 2;
  }
  *wr = '\0';
}
72
73 #define BAIL_UNKNOWN do { t->type = NT_UNKNOWN; return -1; } while(0)
74 static int token_scan(struct token *t)
75 {
76   t->start = t->end = t->next;
77
78  mainpattern:
79 /*!re2c
80     re2c:define:YYCTYPE  = "unsigned char";
81     re2c:define:YYCURSOR = t->next;
82     re2c:yyfill:enable   = 0;
83     re2c:yych:conversion = 1;
84     re2c:indent:top      = 1;
85
86     [ \t\r\n]+      { t->token = NULL;
87                       t->end = t->next;
88                       t->type = NT_SPACE;
89                       return 1; }
90     ["]             { t->type = NT_DQSTRING;
91                       if(t->start != t->end) {
92                         t->start++;
93                         t->end = t->next - 1;
94                         t->token = malloc(t->end-t->start + 1);
95                         strlcpy(t->token, t->start, t->end-t->start + 1);
96                         c_unescape(t->token, NULL);
97                         return 1;
98                       }
99                       else
100                         goto dqstring;
101                     }
102     "'"             { t->type = NT_IDENT;
103                       if(t->start != t->end) {
104                         t->start++;
105                         t->end = t->next - 1;
106                         t->token = malloc(t->end-t->start + 1);
107                         strlcpy(t->token, t->start, t->end-t->start + 1);
108                         return 1;
109                       }
110                       else
111                         goto sqstring;
112                     }
113     [^\000'" \t\r\n] ([^\000 \t\r\n]|[\\][ ])*
114                     { char only = ' ';
115                       t->end = t->next;
116                       t->type = NT_IDENT;
117                       t->token = malloc(t->end-t->start + 1);
118                       strlcpy(t->token, t->start, t->end-t->start + 1);
119                       c_unescape(t->token, &only);
120                       return 1;
121                     }
122     [\000]          { t->token = NULL;
123                       t->type = NT_EOF;
124                       return 0;
125                     }
126     [\000-\377]     { BAIL_UNKNOWN; }
127 */
128
129  sqstring:
130 /*!re2c
131     [^'\000]*       { t->end = t->next;
132                       goto mainpattern; }
133     [\000]          { BAIL_UNKNOWN; }
134 */
135
136  dqstring:
137 /*!re2c
138     [\\][nrtabvf0"\\]
139                     { goto dqstring; }
140     "\\" ( [^\000] \ [nrtabvf0"\\] )
141                     { goto dqstring; }
142     ["]             { t->end = t->next--;
143                       goto mainpattern;
144                     }
145     [^"\000]\[\\"]  { goto dqstring; }
146     [\000]          { BAIL_UNKNOWN; }
147 */
148 }
149
150 int noit_tokenize(const char *input, char **vector, int *cnt) {
151   struct token t;
152   int i = 0;
153
154   SET_TOKEN(&t, input);
155   while(token_scan(&t) != -1) {
156     switch(t.type) {
157       case NT_IDENT:
158       case NT_DQSTRING:
159         if(i<*cnt) vector[i] = t.token;
160         i++;
161         break;
162       case NT_SPACE:
163         break;
164       case NT_EOF:
165         if(i<*cnt) *cnt = i;
166         return i;
167       case NT_UNKNOWN:
168         /* UNREACHED */
169         goto failure;
170     }
171   }
172  failure:
173   if(i<*cnt) *cnt = i;
174   return input - t.next;
175 }
Note: See TracBrowser for help on using the browser.