summaryrefslogtreecommitdiffstats
path: root/Src/nu/regexp.cpp
blob: 6908a49a5096510842805246cb7864d5c1bbc295 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#include "regexp.h"
// TODO: make a little more multi-byte safe



// regexp match functions

// A match means the entire string TEXT is used up in matching.
// In the pattern string:
//      `*' matches any sequence of characters (zero or more)
//      `?' matches any character
//      [SET] matches any character in the specified set,
//      [!SET] or [^SET] matches any character not in the specified set.

// A set is composed of characters or ranges; a range looks like
// character hyphen character (as in 0-9 or A-Z).  [0-9a-zA-Z_] is the
// minimal set of characters allowed in the [..] pattern construct.
// Other characters are allowed (ie. 8 bit characters) if your system
// will support them.

// To suppress the special syntactic significance of any of `[]*?!^-\',
// and match the character exactly, precede it with a `\'.

enum {
    MATCH_VALID = 1,    /* valid match */
    MATCH_END,        /* premature end of pattern string */
    MATCH_ABORT,      /* premature end of text string */
    MATCH_RANGE,      /* match failure on [..] construct */
    MATCH_LITERAL,    /* match failure on literal match */
    MATCH_PATTERN,    /* bad pattern */
};

enum {
    PATTERN_VALID = 0,    /* valid pattern */
    PATTERN_ESC = -1,     /* literal escape at end of pattern */
    PATTERN_RANGE = -2,   /* malformed range in [..] construct */
    PATTERN_CLOSE = -3,   /* no end bracket in [..] construct */
    PATTERN_EMPTY = -4,   /* [..] contstruct is empty */
};

int Matche(const regchar_t *p, const regchar_t *t);

// TODO: make this multi-byte aware
int matche_after_star(const regchar_t *p, const regchar_t *t)
{
	register int match = 0;
	register regchar_t nextp;
	/* pass over existing ? and * in pattern */
	while ( *p == '?' || *p == '*' )
	{
		/* take one char for each ? and + */
		if (*p == '?')
		{
			/* if end of text then no match */
			if (!*t++) return MATCH_ABORT;
		}
		/* move to next char in pattern */
		p++;
	}
	/* if end of pattern we have matched regardless of text left */
	if (!*p) return MATCH_VALID;
	/* get the next character to match which must be a literal or '[' */
	nextp = *p;
	if (nextp == '\\')
	{
		nextp = p[1];
		/* if end of text then we have a bad pattern */
		if (!nextp) return MATCH_PATTERN;
	}
	/* Continue until we run out of text or definite result seen */
	do
	{
		/* a precondition for matching is that the next character
		   in the pattern match the next character in the text or that
		   the next pattern char is the beginning of a range.  Increment
		   text pointer as we go here */
		if (nextp == *t || nextp == '[') match = Matche(p, t);
		/* if the end of text is reached then no match */
		if (!*t++) match = MATCH_ABORT;
	}
	while ( match != MATCH_VALID && match != MATCH_ABORT && match != MATCH_PATTERN);
	/* return result */
	return match;
}


int Matche(const regchar_t *p, const regchar_t *t)
{
	regchar_t range_start, range_end;  /* start and end in range */

	bool invert;             /* is this [..] or [!..] */
	bool member_match;       /* have I matched the [..] construct? */
	bool loop;               /* should I terminate? */

	for ( ; *p; p++, t++)
	{
		/* if this is the end of the text then this is the end of the match */
		if (!*t)
		{
			return (*p == '*' && *++p == '\0') ? MATCH_VALID : MATCH_ABORT;
		}
		/* determine and react to pattern type */
		switch (*p)
		{
		case '?':  /* single any character match */
			break;
		case '*':  /* multiple any character match */
			return matche_after_star (p, t);

			/* [..] construct, single member/exclusion character match */
		case '[':
			{
				/* move to beginning of range */
				p++;
				/* check if this is a member match or exclusion match */
				invert = false;
				if (*p == '!' || *p == '^')
				{
					invert = true;
					p++;
				}
				/* if closing bracket here or at range start then we have a malformed pattern */
				if (*p == ']')
					return MATCH_PATTERN;

				member_match = false;
				loop = true;
				while (loop)
				{
					/* if end of construct then loop is done */
					if (*p == ']')
					{
						loop = false;
						continue;
					}
					/* matching a '!', '^', '-', '\' or a ']' */
					if (*p == '\\')
						range_start = range_end = *++p;
					else
						range_start = range_end = *p;
					/* if end of pattern then bad pattern (Missing ']') */
					if (!*p)
						return MATCH_PATTERN;
					/* check for range bar */
					if (*++p == '-')
					{
						/* get the range end */
						range_end = *++p;
						/* if end of pattern or construct then bad pattern */
						if (range_end == '\0' || range_end == ']') return MATCH_PATTERN;
						/* special character range end */
						if (range_end == '\\')
						{
							range_end = *++p;
							/* if end of text then we have a bad pattern */
							if (!range_end) return MATCH_PATTERN;
						}
						/* move just beyond this range */
						p++;
					}
					/* if the text character is in range then match found.
					   make sure the range letters have the proper
					   relationship to one another before comparison */
					if (range_start < range_end)
					{
						if (*t >= range_start && *t <= range_end)
						{
							member_match = true;
							loop = false;
						}
					}
					else
					{
						if (*t >= range_end && *t <= range_start)
						{
							member_match = true;
							loop = false;
						}
					}
				}
				/* if there was a match in an exclusion set then no match */
				/* if there was no match in a member set then no match */
				if ((invert && member_match) || !(invert || member_match))
					return MATCH_RANGE;
				/* if this is not an exclusion then skip the rest of the [...] construct that already matched. */
				if (member_match)
				{
					while (p && *p != ']')
					{
						/* bad pattern (Missing ']') */
						if (!*p)
							return MATCH_PATTERN;
						/* skip exact match */
						if (*p == '\\')
						{
							p++;
							/* if end of text then we have a bad pattern */
							if (!*p)
								return MATCH_PATTERN;
						}
						/* move to next pattern char */
						p++;
					}
				}
				break;
			}
		case '\\':   /* next character is quoted and must match exactly */
			/* move pattern pointer to quoted char and fall through */
			p++;
			/* if end of text then we have a bad pattern */
			if (!*p)
				return MATCH_PATTERN;
			/* must match this character exactly */
		default:
			if (*p != *t)
				return MATCH_LITERAL;
		}
	}
	/* if end of text not reached then the pattern fails */
	if (*t)
		return MATCH_END;
	else return MATCH_VALID;
}

bool Match(const regchar_t *match, const regchar_t *string)
{
	if (!match)
		return true;
	int error_type;
	
	error_type = Matche(match, string);
	return (error_type == MATCH_VALID);
}