Annotation of parser3/src/lib/cord/cordxtra.c, revision 1.8
1.2 paf 1: /*
2: * Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved.
3: *
4: * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
5: * OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
6: *
7: * Permission is hereby granted to use or copy this program
8: * for any purpose, provided the above notices are retained on all copies.
9: * Permission to modify the code and to distribute modified code is granted,
10: * provided the above notices are retained, and a notice that the code was
11: * modified is included with the above copyright notice.
12: *
13: * Author: Hans-J. Boehm (boehm@parc.xerox.com)
14: */
15: /*
16: * These are functions on cords that do not need to understand their
17: * implementation. They serve also serve as example client code for
18: * cord_basics.
19: */
20: /* Boehm, December 8, 1995 1:53 pm PST */
1.7 paf 21: #include "pa_config_includes.h"
1.2 paf 22: # include <stdio.h>
23: # include <string.h>
24: # include <stdlib.h>
25: # include <stdarg.h>
26: # include "cord.h"
27: # include "ec.h"
28: # define I_HIDE_POINTERS /* So we get access to allocation lock. */
29: /* We use this for lazy file reading, */
30: /* so that we remain independent */
31: /* of the threads primitives. */
32: # include "gc.h"
33:
34: /* For now we assume that pointer reads and writes are atomic, */
35: /* i.e. another thread always sees the state before or after */
36: /* a write. This might be false on a Motorola M68K with */
37: /* pointers that are not 32-bit aligned. But there probably */
38: /* aren't too many threads packages running on those. */
39: # define ATOMIC_WRITE(x,y) (x) = (y)
40: # define ATOMIC_READ(x) (*(x))
41:
42: /* The standard says these are in stdio.h, but they aren't always: */
43: # ifndef SEEK_SET
44: # define SEEK_SET 0
45: # endif
46: # ifndef SEEK_END
47: # define SEEK_END 2
48: # endif
49:
50: # define BUFSZ 2048 /* Size of stack allocated buffers when */
51: /* we want large buffers. */
52:
53: typedef void (* oom_fn)(void);
54:
55: # define OUT_OF_MEMORY { if (CORD_oom_fn != (oom_fn) 0) (*CORD_oom_fn)(); \
56: ABORT("Out of memory\n"); }
57: # define ABORT(msg) { fprintf(stderr, "%s\n", msg); abort(); }
58:
59: CORD CORD_cat_char(CORD x, char c)
60: {
61: register char * string;
62:
63: if (c == '\0') return(CORD_cat(x, CORD_nul(1)));
64: string = GC_MALLOC_ATOMIC(2);
65: if (string == 0) OUT_OF_MEMORY;
66: string[0] = c;
67: string[1] = '\0';
68: return(CORD_cat_char_star(x, string, 1));
69: }
70:
71: CORD CORD_catn(int nargs, ...)
72: {
73: register CORD result = CORD_EMPTY;
74: va_list args;
75: register int i;
76:
77: va_start(args, nargs);
78: for (i = 0; i < nargs; i++) {
79: register CORD next = va_arg(args, CORD);
80: result = CORD_cat(result, next);
81: }
82: va_end(args);
83: return(result);
84: }
85:
86: typedef struct {
87: size_t len;
88: size_t count;
89: char * buf;
90: } CORD_fill_data;
91:
92: int CORD_fill_proc(char c, void * client_data)
93: {
94: register CORD_fill_data * d = (CORD_fill_data *)client_data;
95: register size_t count = d -> count;
96:
97: (d -> buf)[count] = c;
98: d -> count = ++count;
99: if (count >= d -> len) {
100: return(1);
101: } else {
102: return(0);
103: }
104: }
105:
106: int CORD_batched_fill_proc(const char* s, void * client_data)
107: {
108: register CORD_fill_data * d = (CORD_fill_data *)client_data;
109: register size_t count = d -> count;
110: register size_t max = d -> len;
111: register char * buf = d -> buf;
112: register const char* t = s;
113:
114: while((buf[count] = *t++) != '\0') {
115: count++;
116: if (count >= max) {
117: d -> count = count;
118: return(1);
119: }
120: }
121: d -> count = count;
122: return(0);
123: }
124:
125: /* Fill buf with len characters starting at i. */
126: /* Assumes len characters are available. */
127: void CORD_fill_buf(CORD x, size_t i, size_t len, char * buf)
128: {
129: CORD_fill_data fd;
130:
131: fd.len = len;
132: fd.buf = buf;
133: fd.count = 0;
134: (void)CORD_iter5(x, i, CORD_fill_proc, CORD_batched_fill_proc, &fd);
135: }
136:
137: int CORD_cmp(CORD x, CORD y)
138: {
139: CORD_pos xpos;
140: CORD_pos ypos;
141: register size_t avail, yavail;
142:
143: if (y == CORD_EMPTY) return(x != CORD_EMPTY);
144: if (x == CORD_EMPTY) return(-1);
145: if (CORD_IS_STRING(y) && CORD_IS_STRING(x)) return(strcmp(x,y));
146: CORD_set_pos(xpos, x, 0);
147: CORD_set_pos(ypos, y, 0);
148: for(;;) {
149: if (!CORD_pos_valid(xpos)) {
150: if (CORD_pos_valid(ypos)) {
151: return(-1);
152: } else {
153: return(0);
154: }
155: }
156: if (!CORD_pos_valid(ypos)) {
157: return(1);
158: }
159: if ((avail = CORD_pos_chars_left(xpos)) <= 0
160: || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
161: register char xcurrent = CORD_pos_fetch(xpos);
162: register char ycurrent = CORD_pos_fetch(ypos);
163: if (xcurrent != ycurrent) return(xcurrent - ycurrent);
164: CORD_next(xpos);
165: CORD_next(ypos);
166: } else {
167: /* process as many characters as we can */
168: register int result;
169:
170: if (avail > yavail) avail = yavail;
171: result = strncmp(CORD_pos_cur_char_addr(xpos),
172: CORD_pos_cur_char_addr(ypos), avail);
173: if (result != 0) return(result);
174: CORD_pos_advance(xpos, avail);
175: CORD_pos_advance(ypos, avail);
176: }
177: }
178: }
179:
180: int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len)
181: {
182: CORD_pos xpos;
183: CORD_pos ypos;
184: register size_t count;
185: register long avail, yavail;
186:
187: CORD_set_pos(xpos, x, x_start);
188: CORD_set_pos(ypos, y, y_start);
189: for(count = 0; count < len;) {
190: if (!CORD_pos_valid(xpos)) {
191: if (CORD_pos_valid(ypos)) {
192: return(-1);
193: } else {
194: return(0);
195: }
196: }
197: if (!CORD_pos_valid(ypos)) {
198: return(1);
199: }
200: if ((avail = CORD_pos_chars_left(xpos)) <= 0
201: || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
202: register char xcurrent = CORD_pos_fetch(xpos);
203: register char ycurrent = CORD_pos_fetch(ypos);
204: if (xcurrent != ycurrent) return(xcurrent - ycurrent);
205: CORD_next(xpos);
206: CORD_next(ypos);
207: count++;
208: } else {
209: /* process as many characters as we can */
210: register int result;
211:
212: if (avail > yavail) avail = yavail;
213: count += avail;
214: if (count > len) avail -= (count - len);
215: result = strncmp(CORD_pos_cur_char_addr(xpos),
216: CORD_pos_cur_char_addr(ypos), (size_t)avail);
217: if (result != 0) return(result);
218: CORD_pos_advance(xpos, (size_t)avail);
219: CORD_pos_advance(ypos, (size_t)avail);
220: }
221: }
222: return(0);
223: }
224:
225: char * CORD_to_char_star(CORD x)
226: {
227: register size_t len = CORD_len(x);
228: char * result = GC_MALLOC_ATOMIC(len + 1);
229:
230: if (result == 0) OUT_OF_MEMORY;
231: CORD_fill_buf(x, 0, len, result);
232: result[len] = '\0';
233: return(result);
234: }
235:
236: CORD CORD_from_char_star(const char* s)
237: {
238: char * result;
239: size_t len = strlen(s);
240:
241: if (0 == len) return(CORD_EMPTY);
242: result = GC_MALLOC_ATOMIC(len + 1);
243: if (result == 0) OUT_OF_MEMORY;
244: memcpy(result, s, len+1);
245: return(result);
246: }
247:
248: const char* CORD_to_const_char_star(CORD x)
249: {
250: if (x == 0) return("");
251: if (CORD_IS_STRING(x)) return((const char* )x);
252: return(CORD_to_char_star(x));
253: }
254:
255: char CORD_fetch(CORD x, size_t i)
256: {
257: CORD_pos xpos;
258:
259: CORD_set_pos(xpos, x, i);
260: if (!CORD_pos_valid(xpos)) ABORT("bad index?");
261: return(CORD_pos_fetch(xpos));
262: }
263:
264:
265: int CORD_put_proc(char c, void * client_data)
266: {
267: register FILE * f = (FILE *)client_data;
268:
269: return(putc(c, f) == EOF);
270: }
271:
272: int CORD_batched_put_proc(const char* s, void * client_data)
273: {
274: register FILE * f = (FILE *)client_data;
275:
276: return(fputs(s, f) == EOF);
277: }
278:
279:
280: int CORD_put(CORD x, FILE * f)
281: {
282: if (CORD_iter5(x, 0, CORD_put_proc, CORD_batched_put_proc, f)) {
283: return(EOF);
284: } else {
285: return(1);
286: }
287: }
288:
289: typedef struct {
290: size_t pos; /* Current position in the cord */
291: char target; /* Character we're looking for */
292: } chr_data;
293:
294: int CORD_chr_proc(char c, void * client_data)
295: {
296: register chr_data * d = (chr_data *)client_data;
297:
298: if (c == d -> target) return(1);
299: (d -> pos) ++;
300: return(0);
301: }
1.4 paf 302:
1.2 paf 303: int CORD_rchr_proc(char c, void * client_data)
304: {
305: register chr_data * d = (chr_data *)client_data;
306:
307: if (c == d -> target) return(1);
308: (d -> pos) --;
309: return(0);
310: }
311:
312: int CORD_batched_chr_proc(const char* s, void * client_data)
313: {
314: register chr_data * d = (chr_data *)client_data;
315: register char * occ = strchr(s, d -> target);
316:
317: if (occ == 0) {
318: d -> pos += strlen(s);
319: return(0);
320: } else {
321: d -> pos += occ - s;
322: return(1);
323: }
324: }
325:
326: size_t CORD_chr(CORD x, size_t i, int c)
327: {
328: chr_data d;
329:
330: d.pos = i;
331: d.target = c;
332: if (CORD_iter5(x, i, CORD_chr_proc, CORD_batched_chr_proc, &d)) {
333: return(d.pos);
334: } else {
335: return(CORD_NOT_FOUND);
336: }
337: }
338:
339: size_t CORD_rchr(CORD x, size_t i, int c)
340: {
341: chr_data d;
342:
343: d.pos = i;
344: d.target = c;
345: if (CORD_riter4(x, i, CORD_rchr_proc, &d)) {
346: return(d.pos);
347: } else {
348: return(CORD_NOT_FOUND);
349: }
350: }
351:
352: /* Find the first occurrence of s in x at position start or later. */
353: /* This uses an asymptotically poor algorithm, which should typically */
354: /* perform acceptably. We compare the first few characters directly, */
355: /* and call CORD_ncmp whenever there is a partial match. */
356: /* This has the advantage that we allocate very little, or not at all. */
357: /* It's very fast if there are few close misses. */
358: size_t CORD_str(CORD x, size_t start, CORD s)
359: {
360: CORD_pos xpos;
361: size_t xlen = CORD_len(x);
362: size_t slen;
363: register size_t start_len;
364: const char* s_start;
365: unsigned long s_buf = 0; /* The first few characters of s */
366: unsigned long x_buf = 0; /* Start of candidate substring. */
367: /* Initialized only to make compilers */
368: /* happy. */
369: unsigned long mask = 0;
370: register size_t i;
371: register size_t match_pos;
372:
373: if (s == CORD_EMPTY) return(start);
374: if (CORD_IS_STRING(s)) {
375: s_start = s;
376: slen = strlen(s);
377: } else {
378: s_start = CORD_to_char_star(CORD_substr(s, 0, sizeof(unsigned long)));
379: slen = CORD_len(s);
380: }
381: if (xlen < start || xlen - start < slen) return(CORD_NOT_FOUND);
382: start_len = slen;
383: if (start_len > sizeof(unsigned long)) start_len = sizeof(unsigned long);
384: CORD_set_pos(xpos, x, start);
385: for (i = 0; i < start_len; i++) {
386: mask <<= 8;
387: mask |= 0xff;
388: s_buf <<= 8;
1.3 paf 389: s_buf |= (unsigned char)s_start[i];
1.2 paf 390: x_buf <<= 8;
1.5 paf 391: x_buf |= (unsigned char)CORD_pos_fetch(xpos);
1.2 paf 392: CORD_next(xpos);
393: }
394: for (match_pos = start; ; match_pos++) {
395: if ((x_buf & mask) == s_buf) {
396: if (slen == start_len ||
397: CORD_ncmp(x, match_pos + start_len,
398: s, start_len, slen - start_len) == 0) {
399: return(match_pos);
400: }
401: }
402: if ( match_pos == xlen - slen ) {
403: return(CORD_NOT_FOUND);
404: }
405: x_buf <<= 8;
1.5 paf 406: x_buf |= (unsigned char)CORD_pos_fetch(xpos);
1.2 paf 407: CORD_next(xpos);
408: }
409: }
410:
411: void CORD_ec_flush_buf(CORD_ec x)
412: {
413: register size_t len = x[0].ec_bufptr - x[0].ec_buf;
414: char * s;
415:
416: if (len == 0) return;
417: s = GC_MALLOC_ATOMIC(len+1);
418: memcpy(s, x[0].ec_buf, len);
419: s[len] = '\0';
420: x[0].ec_cord = CORD_cat_char_star(x[0].ec_cord, s, len);
421: x[0].ec_bufptr = x[0].ec_buf;
422: }
423:
424: void CORD_ec_append_cord(CORD_ec x, CORD s)
425: {
426: CORD_ec_flush_buf(x);
427: x[0].ec_cord = CORD_cat(x[0].ec_cord, s);
428: }
429:
430: /*ARGSUSED*/
431: char CORD_nul_func(size_t i, void * client_data)
432: {
433: return((char)(unsigned long)client_data);
1.4 paf 434: }
435:
1.2 paf 436: CORD CORD_chars(char c, size_t i)
437: {
1.8 ! misha 438: if (i>0 && i<16 /* SHORT_LIMIT */) {
! 439: register char* result;
! 440: result=GC_MALLOC_ATOMIC(i+1);
! 441: if(result==0) OUT_OF_MEMORY;
! 442: memset(result, c, i);
! 443: result[i] = '\0';
! 444: return((CORD) result);
! 445: } else {
! 446: return(CORD_from_fn(CORD_nul_func, (void *)(unsigned long)c, i));
! 447: }
1.2 paf 448: }
449:
450: CORD CORD_from_file_eager(FILE * f)
451: {
452: register int c;
453: CORD_ec ecord;
454:
455: CORD_ec_init(ecord);
456: for(;;) {
457: c = getc(f);
458: if (c == 0) {
459: /* Append the right number of NULs */
460: /* Note that any string of NULs is rpresented in 4 words, */
461: /* independent of its length. */
462: register size_t count = 1;
463:
464: CORD_ec_flush_buf(ecord);
465: while ((c = getc(f)) == 0) count++;
466: ecord[0].ec_cord = CORD_cat(ecord[0].ec_cord, CORD_nul(count));
467: }
468: if (c == EOF) break;
469: CORD_ec_append(ecord, c);
470: }
471: (void) fclose(f);
472: return(CORD_balance(CORD_ec_to_cord(ecord)));
473: }
474:
475: /* The state maintained for a lazily read file consists primarily */
476: /* of a large direct-mapped cache of previously read values. */
477: /* We could rely more on stdio buffering. That would have 2 */
478: /* disadvantages: */
479: /* 1) Empirically, not all fseek implementations preserve the */
480: /* buffer whenever they could. */
481: /* 2) It would fail if 2 different sections of a long cord */
482: /* were being read alternately. */
483: /* We do use the stdio buffer for read ahead. */
484: /* To guarantee thread safety in the presence of atomic pointer */
485: /* writes, cache lines are always replaced, and never modified in */
486: /* place. */
487:
488: # define LOG_CACHE_SZ 14
489: # define CACHE_SZ (1 << LOG_CACHE_SZ)
490: # define LOG_LINE_SZ 9
491: # define LINE_SZ (1 << LOG_LINE_SZ)
492:
493: typedef struct {
494: size_t tag;
495: char data[LINE_SZ];
496: /* data[i%LINE_SZ] = ith char in file if tag = i/LINE_SZ */
497: } cache_line;
498:
499: typedef struct {
500: FILE * lf_file;
501: size_t lf_current; /* Current file pointer value */
502: cache_line * volatile lf_cache[CACHE_SZ/LINE_SZ];
503: } lf_state;
504:
505: # define MOD_CACHE_SZ(n) ((n) & (CACHE_SZ - 1))
506: # define DIV_CACHE_SZ(n) ((n) >> LOG_CACHE_SZ)
507: # define MOD_LINE_SZ(n) ((n) & (LINE_SZ - 1))
508: # define DIV_LINE_SZ(n) ((n) >> LOG_LINE_SZ)
509: # define LINE_START(n) ((n) & ~(LINE_SZ - 1))
510:
511: typedef struct {
512: lf_state * state;
513: size_t file_pos; /* Position of needed character. */
514: cache_line * new_cache;
515: } refill_data;
516:
517: /* Executed with allocation lock. */
518: static char refill_cache(client_data)
519: refill_data * client_data;
520: {
521: register lf_state * state = client_data -> state;
522: register size_t file_pos = client_data -> file_pos;
523: FILE *f = state -> lf_file;
524: size_t line_start = LINE_START(file_pos);
525: size_t line_no = DIV_LINE_SZ(MOD_CACHE_SZ(file_pos));
526: cache_line * new_cache = client_data -> new_cache;
527:
528: if (line_start != state -> lf_current
529: && fseek(f, line_start, SEEK_SET) != 0) {
530: ABORT("fseek failed");
531: }
532: if (fread(new_cache -> data, sizeof(char), LINE_SZ, f)
533: <= file_pos - line_start) {
534: ABORT("fread failed");
535: }
536: new_cache -> tag = DIV_LINE_SZ(file_pos);
537: /* Store barrier goes here. */
538: ATOMIC_WRITE(state -> lf_cache[line_no], new_cache);
539: state -> lf_current = line_start + LINE_SZ;
540: return(new_cache->data[MOD_LINE_SZ(file_pos)]);
541: }
542:
543: char CORD_lf_func(size_t i, void * client_data)
544: {
545: register lf_state * state = (lf_state *)client_data;
546: register cache_line * volatile * cl_addr =
547: &(state -> lf_cache[DIV_LINE_SZ(MOD_CACHE_SZ(i))]);
548: register cache_line * cl = (cache_line *)ATOMIC_READ(cl_addr);
549:
550: if (cl == 0 || cl -> tag != DIV_LINE_SZ(i)) {
551: /* Cache miss */
552: refill_data rd;
553:
554: rd.state = state;
555: rd.file_pos = i;
556: rd.new_cache = GC_NEW_ATOMIC(cache_line);
557: if (rd.new_cache == 0) OUT_OF_MEMORY;
558: return((char)(GC_word)
559: GC_call_with_alloc_lock((GC_fn_type) refill_cache, &rd));
560: }
561: return(cl -> data[MOD_LINE_SZ(i)]);
562: }
563:
564: /*ARGSUSED*/
565: void CORD_lf_close_proc(void * obj, void * client_data)
566: {
567: if (fclose(((lf_state *)obj) -> lf_file) != 0) {
568: ABORT("CORD_lf_close_proc: fclose failed");
569: }
570: }
571:
572: #ifndef PA_DEBUG_DISABLE_GC
573: CORD CORD_from_file_lazy_inner(FILE * f, size_t len)
574: {
575: register lf_state * state = GC_NEW(lf_state);
576: register int i;
577:
578: if (state == 0) OUT_OF_MEMORY;
579: if (len != 0) {
580: /* Dummy read to force buffer allocation. */
581: /* This greatly increases the probability */
582: /* of avoiding deadlock if buffer allocation */
583: /* is redirected to GC_malloc and the */
584: /* world is multithreaded. */
585: char buf[1];
586:
587: (void) fread(buf, 1, 1, f);
588: rewind(f);
589: }
590: state -> lf_file = f;
591: for (i = 0; i < CACHE_SZ/LINE_SZ; i++) {
592: state -> lf_cache[i] = 0;
593: }
594: state -> lf_current = 0;
595: GC_REGISTER_FINALIZER(state, CORD_lf_close_proc, 0, 0, 0);
596: return(CORD_from_fn(CORD_lf_func, state, len));
597: }
598:
599: CORD CORD_from_file_lazy(FILE * f)
600: {
601: register long len;
602:
603: if (fseek(f, 0l, SEEK_END) != 0) {
604: ABORT("Bad fd argument - fseek failed");
605: }
606: if ((len = ftell(f)) < 0) {
607: ABORT("Bad fd argument - ftell failed");
608: }
609: rewind(f);
610: return(CORD_from_file_lazy_inner(f, (size_t)len));
611: }
612:
613: # define LAZY_THRESHOLD (128*1024 + 1)
614:
615: CORD CORD_from_file(FILE * f)
616: {
617: register long len;
618:
619: if (fseek(f, 0l, SEEK_END) != 0) {
620: ABORT("Bad fd argument - fseek failed");
621: }
622: if ((len = ftell(f)) < 0) {
623: ABORT("Bad fd argument - ftell failed");
624: }
625: rewind(f);
626: if (len < LAZY_THRESHOLD) {
627: return(CORD_from_file_eager(f));
628: } else {
629: return(CORD_from_file_lazy_inner(f, (size_t)len));
630: }
631: }
632: #endif
E-mail: