Mercurial > embed
comparison hoedown/src/autolink.c @ 2:2dce41ab17e8
hoedown: import 3.0.7
author | David Demelier <markand@malikania.fr> |
---|---|
date | Wed, 24 Feb 2016 20:54:52 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:4d89bd8a3f7f | 2:2dce41ab17e8 |
---|---|
1 #include "autolink.h" | |
2 | |
3 #include <string.h> | |
4 #include <stdlib.h> | |
5 #include <stdio.h> | |
6 #include <ctype.h> | |
7 | |
8 #ifndef _MSC_VER | |
9 #include <strings.h> | |
10 #else | |
11 #define strncasecmp _strnicmp | |
12 #endif | |
13 | |
/*
 * Tell whether a link target begins with one of the accepted prefixes.
 *
 * The prefix comparison ignores case, and the byte that directly follows
 * the prefix must be alphanumeric for the link to be accepted.
 *
 * data: bytes of the link target (not required to be NUL-terminated)
 * size: number of bytes available in data
 *
 * Returns 1 when the target is accepted, 0 otherwise.
 */
int
hoedown_autolink_is_safe(const uint8_t *data, size_t size)
{
	static const struct {
		const char *text;
		size_t length;
	} prefixes[] = {
		{ "http://", 7 },
		{ "https://", 8 },
		{ "/", 1 },
		{ "#", 1 },
		{ "ftp://", 6 },
		{ "mailto:", 7 }
	};
	const size_t total = sizeof(prefixes) / sizeof(prefixes[0]);
	size_t k;

	for (k = 0; k < total; k++) {
		const size_t plen = prefixes[k].length;

		/* there must be at least one byte after the prefix */
		if (size <= plen)
			continue;

		if (strncasecmp((const char *)data, prefixes[k].text, plen) != 0)
			continue;

		if (isalnum(data[plen]))
			return 1;
	}

	return 0;
}
35 | |
/*
 * Trim trailing bytes that should not be considered part of an autolink.
 *
 * The candidate is first cut at the first '<'. Then trailing "?!.,:"
 * bytes are dropped, a trailing "&name;" sequence (letters only between
 * '&' and ';') is dropped as a whole, and a lone trailing ';' is dropped.
 * Finally, if the last byte is a closing quote/bracket whose count does
 * not match the count of its opening counterpart inside the candidate,
 * that last byte is dropped too.
 *
 * data:       first byte of the candidate link
 * link_end:   candidate length in bytes
 * max_rewind: not used by this function
 * size:       not used by this function
 *
 * Returns the trimmed length, or 0 when nothing remains.
 */
static size_t
autolink_delim(uint8_t *data, size_t link_end, size_t max_rewind, size_t size)
{
	uint8_t cclose, copen = 0;
	size_t i;

	(void)max_rewind;
	(void)size;

	/* the link can never extend past the first '<' */
	for (i = 0; i < link_end; ++i) {
		if (data[i] == '<') {
			link_end = i;
			break;
		}
	}

	while (link_end > 0) {
		/* NOTE: strchr also matches the terminator, so a trailing NUL
		 * byte is trimmed here as well. */
		if (strchr("?!.,:", data[link_end - 1]) != NULL) {
			link_end--;
		} else if (data[link_end - 1] == ';') {
			size_t new_end;

			/* A ';' that is the only remaining byte cannot close an
			 * entity. Stop here: computing link_end - 2 would wrap
			 * around and index far outside the buffer. */
			if (link_end < 2) {
				link_end = 0;
				break;
			}

			new_end = link_end - 2;

			while (new_end > 0 && isalpha(data[new_end]))
				new_end--;

			/* drop a whole "&name;" sequence, otherwise just the ';' */
			if (new_end < link_end - 2 && data[new_end] == '&')
				link_end = new_end;
			else
				link_end--;
		} else {
			break;
		}
	}

	if (link_end == 0)
		return 0;

	cclose = data[link_end - 1];

	switch (cclose) {
	case '"':	copen = '"'; break;
	case '\'':	copen = '\''; break;
	case ')':	copen = '('; break;
	case ']':	copen = '['; break;
	case '}':	copen = '{'; break;
	default:	break;
	}

	if (copen != 0) {
		size_t closing = 0;
		size_t opening = 0;

		/* Try to close the final punctuation sign in this same line;
		 * if we managed to close it outside of the URL, that means that it's
		 * not part of the URL. If it closes inside the URL, that means it
		 * is part of the URL.
		 *
		 * Examples:
		 *
		 *	foo http://www.pokemon.com/Pikachu_(Electric) bar
		 *		=> http://www.pokemon.com/Pikachu_(Electric)
		 *
		 *	foo (http://www.pokemon.com/Pikachu_(Electric)) bar
		 *		=> http://www.pokemon.com/Pikachu_(Electric)
		 *
		 *	foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 *		=> http://www.pokemon.com/Pikachu_(Electric))
		 *
		 *	(foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 *		=> foo http://www.pokemon.com/Pikachu_(Electric)
		 */

		for (i = 0; i < link_end; ++i) {
			if (data[i] == copen)
				opening++;
			else if (data[i] == cclose)
				closing++;
		}

		if (closing != opening)
			link_end--;
	}

	return link_end;
}
119 | |
/*
 * Measure the run of domain-like bytes at the start of data.
 *
 * The first byte must be alphanumeric. After it, alphanumerics, '-',
 * '.' and ':' are accepted; the scan stops at the first other byte and
 * never examines the final byte of the buffer (index size - 1).
 *
 * data:        bytes to inspect
 * size:        number of bytes available in data
 * allow_short: when non-zero, a run without any '.' or ':' is accepted
 *
 * Returns the index at which the scan stopped, or 0 when the input is
 * empty, does not start with an alphanumeric byte, or (with allow_short
 * == 0) contains no '.'/':' separator in the scanned range.
 */
static size_t
check_domain(uint8_t *data, size_t size, int allow_short)
{
	size_t i, np = 0;

	/* An empty buffer has no data[0], and size - 1 below would wrap. */
	if (size == 0 || !isalnum(data[0]))
		return 0;

	for (i = 1; i < size - 1; ++i) {
		/* Compare explicitly: strchr(".:", c) would also match c == 0
		 * and count a NUL byte as a separator. */
		if (data[i] == '.' || data[i] == ':')
			np++;
		else if (!isalnum(data[i]) && data[i] != '-')
			break;
	}

	if (allow_short) {
		/* We don't need a valid domain in the strict sense (with at
		 * least one dot), so just make sure it's composed of valid
		 * domain characters and return the length of the valid
		 * sequence. */
		return i;
	}

	/* A valid domain needs to have at least one dot;
	 * that's as far as we check. */
	return np ? i : 0;
}
145 | |
146 size_t | |
147 hoedown_autolink__www( | |
148 size_t *rewind_p, | |
149 hoedown_buffer *link, | |
150 uint8_t *data, | |
151 size_t max_rewind, | |
152 size_t size, | |
153 unsigned int flags) | |
154 { | |
155 size_t link_end; | |
156 | |
157 if (max_rewind > 0 && !ispunct(data[-1]) && !isspace(data[-1])) | |
158 return 0; | |
159 | |
160 if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0) | |
161 return 0; | |
162 | |
163 link_end = check_domain(data, size, 0); | |
164 | |
165 if (link_end == 0) | |
166 return 0; | |
167 | |
168 while (link_end < size && !isspace(data[link_end])) | |
169 link_end++; | |
170 | |
171 link_end = autolink_delim(data, link_end, max_rewind, size); | |
172 | |
173 if (link_end == 0) | |
174 return 0; | |
175 | |
176 hoedown_buffer_put(link, data, link_end); | |
177 *rewind_p = 0; | |
178 | |
179 return (int)link_end; | |
180 } | |
181 | |
182 size_t | |
183 hoedown_autolink__email( | |
184 size_t *rewind_p, | |
185 hoedown_buffer *link, | |
186 uint8_t *data, | |
187 size_t max_rewind, | |
188 size_t size, | |
189 unsigned int flags) | |
190 { | |
191 size_t link_end, rewind; | |
192 int nb = 0, np = 0; | |
193 | |
194 for (rewind = 0; rewind < max_rewind; ++rewind) { | |
195 uint8_t c = data[-1 - rewind]; | |
196 | |
197 if (isalnum(c)) | |
198 continue; | |
199 | |
200 if (strchr(".+-_", c) != NULL) | |
201 continue; | |
202 | |
203 break; | |
204 } | |
205 | |
206 if (rewind == 0) | |
207 return 0; | |
208 | |
209 for (link_end = 0; link_end < size; ++link_end) { | |
210 uint8_t c = data[link_end]; | |
211 | |
212 if (isalnum(c)) | |
213 continue; | |
214 | |
215 if (c == '@') | |
216 nb++; | |
217 else if (c == '.' && link_end < size - 1) | |
218 np++; | |
219 else if (c != '-' && c != '_') | |
220 break; | |
221 } | |
222 | |
223 if (link_end < 2 || nb != 1 || np == 0 || | |
224 !isalpha(data[link_end - 1])) | |
225 return 0; | |
226 | |
227 link_end = autolink_delim(data, link_end, max_rewind, size); | |
228 | |
229 if (link_end == 0) | |
230 return 0; | |
231 | |
232 hoedown_buffer_put(link, data - rewind, link_end + rewind); | |
233 *rewind_p = rewind; | |
234 | |
235 return link_end; | |
236 } | |
237 | |
238 size_t | |
239 hoedown_autolink__url( | |
240 size_t *rewind_p, | |
241 hoedown_buffer *link, | |
242 uint8_t *data, | |
243 size_t max_rewind, | |
244 size_t size, | |
245 unsigned int flags) | |
246 { | |
247 size_t link_end, rewind = 0, domain_len; | |
248 | |
249 if (size < 4 || data[1] != '/' || data[2] != '/') | |
250 return 0; | |
251 | |
252 while (rewind < max_rewind && isalpha(data[-1 - rewind])) | |
253 rewind++; | |
254 | |
255 if (!hoedown_autolink_is_safe(data - rewind, size + rewind)) | |
256 return 0; | |
257 | |
258 link_end = strlen("://"); | |
259 | |
260 domain_len = check_domain( | |
261 data + link_end, | |
262 size - link_end, | |
263 flags & HOEDOWN_AUTOLINK_SHORT_DOMAINS); | |
264 | |
265 if (domain_len == 0) | |
266 return 0; | |
267 | |
268 link_end += domain_len; | |
269 while (link_end < size && !isspace(data[link_end])) | |
270 link_end++; | |
271 | |
272 link_end = autolink_delim(data, link_end, max_rewind, size); | |
273 | |
274 if (link_end == 0) | |
275 return 0; | |
276 | |
277 hoedown_buffer_put(link, data - rewind, link_end + rewind); | |
278 *rewind_p = rewind; | |
279 | |
280 return link_end; | |
281 } |