Annotation of parser3/src/lib/cord/cordxtra.c, revision 1.9
1.2 paf 1: /*
2: * Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved.
3: *
4: * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
5: * OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
6: *
7: * Permission is hereby granted to use or copy this program
8: * for any purpose, provided the above notices are retained on all copies.
9: * Permission to modify the code and to distribute modified code is granted,
10: * provided the above notices are retained, and a notice that the code was
11: * modified is included with the above copyright notice.
12: *
13: * Author: Hans-J. Boehm (boehm@parc.xerox.com)
14: */
15: /*
16: * These are functions on cords that do not need to understand their
17: * implementation. They serve also serve as example client code for
18: * cord_basics.
19: */
20: /* Boehm, December 8, 1995 1:53 pm PST */
1.7 paf 21: #include "pa_config_includes.h"
1.2 paf 22: # include <stdio.h>
23: # include <string.h>
24: # include <stdlib.h>
25: # include <stdarg.h>
26: # include "cord.h"
27: # include "ec.h"
1.9 ! misha 28:
! 29: #define CORD_CHARS_CACHE
! 30:
1.2 paf 31: # define I_HIDE_POINTERS /* So we get access to allocation lock. */
32: /* We use this for lazy file reading, */
33: /* so that we remain independent */
34: /* of the threads primitives. */
35: # include "gc.h"
36:
37: /* For now we assume that pointer reads and writes are atomic, */
38: /* i.e. another thread always sees the state before or after */
39: /* a write. This might be false on a Motorola M68K with */
40: /* pointers that are not 32-bit aligned. But there probably */
41: /* aren't too many threads packages running on those. */
42: # define ATOMIC_WRITE(x,y) (x) = (y)
43: # define ATOMIC_READ(x) (*(x))
44:
45: /* The standard says these are in stdio.h, but they aren't always: */
46: # ifndef SEEK_SET
47: # define SEEK_SET 0
48: # endif
49: # ifndef SEEK_END
50: # define SEEK_END 2
51: # endif
52:
53: # define BUFSZ 2048 /* Size of stack allocated buffers when */
54: /* we want large buffers. */
55:
56: typedef void (* oom_fn)(void);
57:
58: # define OUT_OF_MEMORY { if (CORD_oom_fn != (oom_fn) 0) (*CORD_oom_fn)(); \
59: ABORT("Out of memory\n"); }
60: # define ABORT(msg) { fprintf(stderr, "%s\n", msg); abort(); }
61:
62: CORD CORD_cat_char(CORD x, char c)
63: {
64: register char * string;
65:
66: if (c == '\0') return(CORD_cat(x, CORD_nul(1)));
67: string = GC_MALLOC_ATOMIC(2);
68: if (string == 0) OUT_OF_MEMORY;
69: string[0] = c;
70: string[1] = '\0';
71: return(CORD_cat_char_star(x, string, 1));
72: }
73:
74: CORD CORD_catn(int nargs, ...)
75: {
76: register CORD result = CORD_EMPTY;
77: va_list args;
78: register int i;
79:
80: va_start(args, nargs);
81: for (i = 0; i < nargs; i++) {
82: register CORD next = va_arg(args, CORD);
83: result = CORD_cat(result, next);
84: }
85: va_end(args);
86: return(result);
87: }
88:
89: typedef struct {
90: size_t len;
91: size_t count;
92: char * buf;
93: } CORD_fill_data;
94:
95: int CORD_fill_proc(char c, void * client_data)
96: {
97: register CORD_fill_data * d = (CORD_fill_data *)client_data;
98: register size_t count = d -> count;
99:
100: (d -> buf)[count] = c;
101: d -> count = ++count;
102: if (count >= d -> len) {
103: return(1);
104: } else {
105: return(0);
106: }
107: }
108:
109: int CORD_batched_fill_proc(const char* s, void * client_data)
110: {
111: register CORD_fill_data * d = (CORD_fill_data *)client_data;
112: register size_t count = d -> count;
113: register size_t max = d -> len;
114: register char * buf = d -> buf;
115: register const char* t = s;
116:
117: while((buf[count] = *t++) != '\0') {
118: count++;
119: if (count >= max) {
120: d -> count = count;
121: return(1);
122: }
123: }
124: d -> count = count;
125: return(0);
126: }
127:
128: /* Fill buf with len characters starting at i. */
129: /* Assumes len characters are available. */
130: void CORD_fill_buf(CORD x, size_t i, size_t len, char * buf)
131: {
132: CORD_fill_data fd;
133:
134: fd.len = len;
135: fd.buf = buf;
136: fd.count = 0;
137: (void)CORD_iter5(x, i, CORD_fill_proc, CORD_batched_fill_proc, &fd);
138: }
139:
140: int CORD_cmp(CORD x, CORD y)
141: {
142: CORD_pos xpos;
143: CORD_pos ypos;
144: register size_t avail, yavail;
145:
146: if (y == CORD_EMPTY) return(x != CORD_EMPTY);
147: if (x == CORD_EMPTY) return(-1);
148: if (CORD_IS_STRING(y) && CORD_IS_STRING(x)) return(strcmp(x,y));
149: CORD_set_pos(xpos, x, 0);
150: CORD_set_pos(ypos, y, 0);
151: for(;;) {
152: if (!CORD_pos_valid(xpos)) {
153: if (CORD_pos_valid(ypos)) {
154: return(-1);
155: } else {
156: return(0);
157: }
158: }
159: if (!CORD_pos_valid(ypos)) {
160: return(1);
161: }
162: if ((avail = CORD_pos_chars_left(xpos)) <= 0
163: || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
164: register char xcurrent = CORD_pos_fetch(xpos);
165: register char ycurrent = CORD_pos_fetch(ypos);
166: if (xcurrent != ycurrent) return(xcurrent - ycurrent);
167: CORD_next(xpos);
168: CORD_next(ypos);
169: } else {
170: /* process as many characters as we can */
171: register int result;
172:
173: if (avail > yavail) avail = yavail;
174: result = strncmp(CORD_pos_cur_char_addr(xpos),
175: CORD_pos_cur_char_addr(ypos), avail);
176: if (result != 0) return(result);
177: CORD_pos_advance(xpos, avail);
178: CORD_pos_advance(ypos, avail);
179: }
180: }
181: }
182:
183: int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len)
184: {
185: CORD_pos xpos;
186: CORD_pos ypos;
187: register size_t count;
188: register long avail, yavail;
189:
190: CORD_set_pos(xpos, x, x_start);
191: CORD_set_pos(ypos, y, y_start);
192: for(count = 0; count < len;) {
193: if (!CORD_pos_valid(xpos)) {
194: if (CORD_pos_valid(ypos)) {
195: return(-1);
196: } else {
197: return(0);
198: }
199: }
200: if (!CORD_pos_valid(ypos)) {
201: return(1);
202: }
203: if ((avail = CORD_pos_chars_left(xpos)) <= 0
204: || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
205: register char xcurrent = CORD_pos_fetch(xpos);
206: register char ycurrent = CORD_pos_fetch(ypos);
207: if (xcurrent != ycurrent) return(xcurrent - ycurrent);
208: CORD_next(xpos);
209: CORD_next(ypos);
210: count++;
211: } else {
212: /* process as many characters as we can */
213: register int result;
214:
215: if (avail > yavail) avail = yavail;
216: count += avail;
217: if (count > len) avail -= (count - len);
218: result = strncmp(CORD_pos_cur_char_addr(xpos),
219: CORD_pos_cur_char_addr(ypos), (size_t)avail);
220: if (result != 0) return(result);
221: CORD_pos_advance(xpos, (size_t)avail);
222: CORD_pos_advance(ypos, (size_t)avail);
223: }
224: }
225: return(0);
226: }
227:
228: char * CORD_to_char_star(CORD x)
229: {
230: register size_t len = CORD_len(x);
231: char * result = GC_MALLOC_ATOMIC(len + 1);
232:
233: if (result == 0) OUT_OF_MEMORY;
234: CORD_fill_buf(x, 0, len, result);
235: result[len] = '\0';
236: return(result);
237: }
238:
239: CORD CORD_from_char_star(const char* s)
240: {
241: char * result;
242: size_t len = strlen(s);
243:
244: if (0 == len) return(CORD_EMPTY);
245: result = GC_MALLOC_ATOMIC(len + 1);
246: if (result == 0) OUT_OF_MEMORY;
247: memcpy(result, s, len+1);
248: return(result);
249: }
250:
251: const char* CORD_to_const_char_star(CORD x)
252: {
253: if (x == 0) return("");
254: if (CORD_IS_STRING(x)) return((const char* )x);
255: return(CORD_to_char_star(x));
256: }
257:
258: char CORD_fetch(CORD x, size_t i)
259: {
260: CORD_pos xpos;
261:
262: CORD_set_pos(xpos, x, i);
263: if (!CORD_pos_valid(xpos)) ABORT("bad index?");
264: return(CORD_pos_fetch(xpos));
265: }
266:
267:
268: int CORD_put_proc(char c, void * client_data)
269: {
270: register FILE * f = (FILE *)client_data;
271:
272: return(putc(c, f) == EOF);
273: }
274:
275: int CORD_batched_put_proc(const char* s, void * client_data)
276: {
277: register FILE * f = (FILE *)client_data;
278:
279: return(fputs(s, f) == EOF);
280: }
281:
282:
283: int CORD_put(CORD x, FILE * f)
284: {
285: if (CORD_iter5(x, 0, CORD_put_proc, CORD_batched_put_proc, f)) {
286: return(EOF);
287: } else {
288: return(1);
289: }
290: }
291:
292: typedef struct {
293: size_t pos; /* Current position in the cord */
294: char target; /* Character we're looking for */
295: } chr_data;
296:
297: int CORD_chr_proc(char c, void * client_data)
298: {
299: register chr_data * d = (chr_data *)client_data;
300:
301: if (c == d -> target) return(1);
302: (d -> pos) ++;
303: return(0);
304: }
1.4 paf 305:
1.2 paf 306: int CORD_rchr_proc(char c, void * client_data)
307: {
308: register chr_data * d = (chr_data *)client_data;
309:
310: if (c == d -> target) return(1);
311: (d -> pos) --;
312: return(0);
313: }
314:
315: int CORD_batched_chr_proc(const char* s, void * client_data)
316: {
317: register chr_data * d = (chr_data *)client_data;
318: register char * occ = strchr(s, d -> target);
319:
320: if (occ == 0) {
321: d -> pos += strlen(s);
322: return(0);
323: } else {
324: d -> pos += occ - s;
325: return(1);
326: }
327: }
328:
329: size_t CORD_chr(CORD x, size_t i, int c)
330: {
331: chr_data d;
332:
333: d.pos = i;
334: d.target = c;
335: if (CORD_iter5(x, i, CORD_chr_proc, CORD_batched_chr_proc, &d)) {
336: return(d.pos);
337: } else {
338: return(CORD_NOT_FOUND);
339: }
340: }
341:
342: size_t CORD_rchr(CORD x, size_t i, int c)
343: {
344: chr_data d;
345:
346: d.pos = i;
347: d.target = c;
348: if (CORD_riter4(x, i, CORD_rchr_proc, &d)) {
349: return(d.pos);
350: } else {
351: return(CORD_NOT_FOUND);
352: }
353: }
354:
355: /* Find the first occurrence of s in x at position start or later. */
356: /* This uses an asymptotically poor algorithm, which should typically */
357: /* perform acceptably. We compare the first few characters directly, */
358: /* and call CORD_ncmp whenever there is a partial match. */
359: /* This has the advantage that we allocate very little, or not at all. */
360: /* It's very fast if there are few close misses. */
361: size_t CORD_str(CORD x, size_t start, CORD s)
362: {
363: CORD_pos xpos;
364: size_t xlen = CORD_len(x);
365: size_t slen;
366: register size_t start_len;
367: const char* s_start;
368: unsigned long s_buf = 0; /* The first few characters of s */
369: unsigned long x_buf = 0; /* Start of candidate substring. */
370: /* Initialized only to make compilers */
371: /* happy. */
372: unsigned long mask = 0;
373: register size_t i;
374: register size_t match_pos;
375:
376: if (s == CORD_EMPTY) return(start);
377: if (CORD_IS_STRING(s)) {
378: s_start = s;
379: slen = strlen(s);
380: } else {
381: s_start = CORD_to_char_star(CORD_substr(s, 0, sizeof(unsigned long)));
382: slen = CORD_len(s);
383: }
384: if (xlen < start || xlen - start < slen) return(CORD_NOT_FOUND);
385: start_len = slen;
386: if (start_len > sizeof(unsigned long)) start_len = sizeof(unsigned long);
387: CORD_set_pos(xpos, x, start);
388: for (i = 0; i < start_len; i++) {
389: mask <<= 8;
390: mask |= 0xff;
391: s_buf <<= 8;
1.3 paf 392: s_buf |= (unsigned char)s_start[i];
1.2 paf 393: x_buf <<= 8;
1.5 paf 394: x_buf |= (unsigned char)CORD_pos_fetch(xpos);
1.2 paf 395: CORD_next(xpos);
396: }
397: for (match_pos = start; ; match_pos++) {
398: if ((x_buf & mask) == s_buf) {
399: if (slen == start_len ||
400: CORD_ncmp(x, match_pos + start_len,
401: s, start_len, slen - start_len) == 0) {
402: return(match_pos);
403: }
404: }
405: if ( match_pos == xlen - slen ) {
406: return(CORD_NOT_FOUND);
407: }
408: x_buf <<= 8;
1.5 paf 409: x_buf |= (unsigned char)CORD_pos_fetch(xpos);
1.2 paf 410: CORD_next(xpos);
411: }
412: }
413:
414: void CORD_ec_flush_buf(CORD_ec x)
415: {
416: register size_t len = x[0].ec_bufptr - x[0].ec_buf;
417: char * s;
418:
419: if (len == 0) return;
420: s = GC_MALLOC_ATOMIC(len+1);
421: memcpy(s, x[0].ec_buf, len);
422: s[len] = '\0';
423: x[0].ec_cord = CORD_cat_char_star(x[0].ec_cord, s, len);
424: x[0].ec_bufptr = x[0].ec_buf;
425: }
426:
427: void CORD_ec_append_cord(CORD_ec x, CORD s)
428: {
429: CORD_ec_flush_buf(x);
430: x[0].ec_cord = CORD_cat(x[0].ec_cord, s);
431: }
432:
433: /*ARGSUSED*/
434: char CORD_nul_func(size_t i, void * client_data)
435: {
436: return((char)(unsigned long)client_data);
1.4 paf 437: }
438:
1.9 ! misha 439: #ifdef CORD_CHARS_CACHE
! 440: static char *cord_chars_cache[256][16]={0};
! 441: #endif
! 442:
1.2 paf 443: CORD CORD_chars(char c, size_t i)
444: {
1.8 misha 445: if (i>0 && i<16 /* SHORT_LIMIT */) {
1.9 ! misha 446: #ifdef CORD_CHARS_CACHE
! 447: if (cord_chars_cache[c][i]) return cord_chars_cache[c][i];
! 448: #endif
1.8 misha 449: register char* result;
450: result=GC_MALLOC_ATOMIC(i+1);
451: if(result==0) OUT_OF_MEMORY;
452: memset(result, c, i);
453: result[i] = '\0';
1.9 ! misha 454: #ifdef CORD_CHARS_CACHE
! 455: cord_chars_cache[c][i]=result;
! 456: #endif
1.8 misha 457: return((CORD) result);
458: } else {
459: return(CORD_from_fn(CORD_nul_func, (void *)(unsigned long)c, i));
460: }
1.2 paf 461: }
462:
463: CORD CORD_from_file_eager(FILE * f)
464: {
465: register int c;
466: CORD_ec ecord;
467:
468: CORD_ec_init(ecord);
469: for(;;) {
470: c = getc(f);
471: if (c == 0) {
472: /* Append the right number of NULs */
473: /* Note that any string of NULs is rpresented in 4 words, */
474: /* independent of its length. */
475: register size_t count = 1;
476:
477: CORD_ec_flush_buf(ecord);
478: while ((c = getc(f)) == 0) count++;
479: ecord[0].ec_cord = CORD_cat(ecord[0].ec_cord, CORD_nul(count));
480: }
481: if (c == EOF) break;
482: CORD_ec_append(ecord, c);
483: }
484: (void) fclose(f);
485: return(CORD_balance(CORD_ec_to_cord(ecord)));
486: }
487:
488: /* The state maintained for a lazily read file consists primarily */
489: /* of a large direct-mapped cache of previously read values. */
490: /* We could rely more on stdio buffering. That would have 2 */
491: /* disadvantages: */
492: /* 1) Empirically, not all fseek implementations preserve the */
493: /* buffer whenever they could. */
494: /* 2) It would fail if 2 different sections of a long cord */
495: /* were being read alternately. */
496: /* We do use the stdio buffer for read ahead. */
497: /* To guarantee thread safety in the presence of atomic pointer */
498: /* writes, cache lines are always replaced, and never modified in */
499: /* place. */
500:
501: # define LOG_CACHE_SZ 14
502: # define CACHE_SZ (1 << LOG_CACHE_SZ)
503: # define LOG_LINE_SZ 9
504: # define LINE_SZ (1 << LOG_LINE_SZ)
505:
506: typedef struct {
507: size_t tag;
508: char data[LINE_SZ];
509: /* data[i%LINE_SZ] = ith char in file if tag = i/LINE_SZ */
510: } cache_line;
511:
512: typedef struct {
513: FILE * lf_file;
514: size_t lf_current; /* Current file pointer value */
515: cache_line * volatile lf_cache[CACHE_SZ/LINE_SZ];
516: } lf_state;
517:
518: # define MOD_CACHE_SZ(n) ((n) & (CACHE_SZ - 1))
519: # define DIV_CACHE_SZ(n) ((n) >> LOG_CACHE_SZ)
520: # define MOD_LINE_SZ(n) ((n) & (LINE_SZ - 1))
521: # define DIV_LINE_SZ(n) ((n) >> LOG_LINE_SZ)
522: # define LINE_START(n) ((n) & ~(LINE_SZ - 1))
523:
524: typedef struct {
525: lf_state * state;
526: size_t file_pos; /* Position of needed character. */
527: cache_line * new_cache;
528: } refill_data;
529:
530: /* Executed with allocation lock. */
531: static char refill_cache(client_data)
532: refill_data * client_data;
533: {
534: register lf_state * state = client_data -> state;
535: register size_t file_pos = client_data -> file_pos;
536: FILE *f = state -> lf_file;
537: size_t line_start = LINE_START(file_pos);
538: size_t line_no = DIV_LINE_SZ(MOD_CACHE_SZ(file_pos));
539: cache_line * new_cache = client_data -> new_cache;
540:
541: if (line_start != state -> lf_current
542: && fseek(f, line_start, SEEK_SET) != 0) {
543: ABORT("fseek failed");
544: }
545: if (fread(new_cache -> data, sizeof(char), LINE_SZ, f)
546: <= file_pos - line_start) {
547: ABORT("fread failed");
548: }
549: new_cache -> tag = DIV_LINE_SZ(file_pos);
550: /* Store barrier goes here. */
551: ATOMIC_WRITE(state -> lf_cache[line_no], new_cache);
552: state -> lf_current = line_start + LINE_SZ;
553: return(new_cache->data[MOD_LINE_SZ(file_pos)]);
554: }
555:
556: char CORD_lf_func(size_t i, void * client_data)
557: {
558: register lf_state * state = (lf_state *)client_data;
559: register cache_line * volatile * cl_addr =
560: &(state -> lf_cache[DIV_LINE_SZ(MOD_CACHE_SZ(i))]);
561: register cache_line * cl = (cache_line *)ATOMIC_READ(cl_addr);
562:
563: if (cl == 0 || cl -> tag != DIV_LINE_SZ(i)) {
564: /* Cache miss */
565: refill_data rd;
566:
567: rd.state = state;
568: rd.file_pos = i;
569: rd.new_cache = GC_NEW_ATOMIC(cache_line);
570: if (rd.new_cache == 0) OUT_OF_MEMORY;
571: return((char)(GC_word)
572: GC_call_with_alloc_lock((GC_fn_type) refill_cache, &rd));
573: }
574: return(cl -> data[MOD_LINE_SZ(i)]);
575: }
576:
577: /*ARGSUSED*/
578: void CORD_lf_close_proc(void * obj, void * client_data)
579: {
580: if (fclose(((lf_state *)obj) -> lf_file) != 0) {
581: ABORT("CORD_lf_close_proc: fclose failed");
582: }
583: }
584:
585: #ifndef PA_DEBUG_DISABLE_GC
586: CORD CORD_from_file_lazy_inner(FILE * f, size_t len)
587: {
588: register lf_state * state = GC_NEW(lf_state);
589: register int i;
590:
591: if (state == 0) OUT_OF_MEMORY;
592: if (len != 0) {
593: /* Dummy read to force buffer allocation. */
594: /* This greatly increases the probability */
595: /* of avoiding deadlock if buffer allocation */
596: /* is redirected to GC_malloc and the */
597: /* world is multithreaded. */
598: char buf[1];
599:
600: (void) fread(buf, 1, 1, f);
601: rewind(f);
602: }
603: state -> lf_file = f;
604: for (i = 0; i < CACHE_SZ/LINE_SZ; i++) {
605: state -> lf_cache[i] = 0;
606: }
607: state -> lf_current = 0;
608: GC_REGISTER_FINALIZER(state, CORD_lf_close_proc, 0, 0, 0);
609: return(CORD_from_fn(CORD_lf_func, state, len));
610: }
611:
612: CORD CORD_from_file_lazy(FILE * f)
613: {
614: register long len;
615:
616: if (fseek(f, 0l, SEEK_END) != 0) {
617: ABORT("Bad fd argument - fseek failed");
618: }
619: if ((len = ftell(f)) < 0) {
620: ABORT("Bad fd argument - ftell failed");
621: }
622: rewind(f);
623: return(CORD_from_file_lazy_inner(f, (size_t)len));
624: }
625:
626: # define LAZY_THRESHOLD (128*1024 + 1)
627:
628: CORD CORD_from_file(FILE * f)
629: {
630: register long len;
631:
632: if (fseek(f, 0l, SEEK_END) != 0) {
633: ABORT("Bad fd argument - fseek failed");
634: }
635: if ((len = ftell(f)) < 0) {
636: ABORT("Bad fd argument - ftell failed");
637: }
638: rewind(f);
639: if (len < LAZY_THRESHOLD) {
640: return(CORD_from_file_eager(f));
641: } else {
642: return(CORD_from_file_lazy_inner(f, (size_t)len));
643: }
644: }
645: #endif
E-mail: