Skip to content
Snippets Groups Projects
scan_ante.h 16.1 KiB
Newer Older
/*
 * Copyright (c) 2021-2023 Symas Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following disclaimer
 *   in the documentation and/or other materials provided with the
 *   distribution.
 * * Neither the name of the Symas Corporation nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <stdint.h>
#include <string.h>
#include <syslog.h>

#include <sys/types.h>
#include <sys/wait.h>

#include "parse.h"
#include "cdf.h"
#include "symbols.h"
#include "copybook.h"

/*
 * Flex override
 */
static void /* yynoreturn */ yy_fatal_error ( const char* msg  );

static void inline
die_fatal_error( const char msg[] ) {
  yyerrorv("scan.o: %s",  msg); 
  yy_fatal_error(msg);
}

#define YY_FATAL_ERROR(msg) die_fatal_error((msg))

/*
 * External functions
 */

void yyerror( char const *s, int error_level = LOG_ERR );
void yyerrorv( const char fmt[], ... );

void parser_enter_file(const char *filename);
void parser_leave_file();

bool is_fixed_format();
bool include_debug();
int lexer_input( char buf[], int max_size, FILE *input );

const char * keyword_str( int token );
const char * cdf_token_str( int token );

int intrinsic_repository_tok( const char name[] );

void cobol_set_indicator_column( int column );

void next_sentence_label(cbl_label_t*);

int repeat_count( const char picture[] );

size_t program_level();

int ydfparse(void);

FILE * copy_mode_start();

/*
 * Public functions and data
 */

cbl_label_t *next_sentence;

static bool echo_on = false;

void
lexer_echo( bool tf ) {
  echo_on = tf;
}

bool
lexer_echo() {
  return echo_on;
}

// IBM says a picture can be up to 50 bytes, not 1000 words.
// ISO says a picture can be up to 63 bytes.  We allow for a NUL terminator.
static char orig_picture[PICTURE_MAX];
static char orig_number[80];

const char *
original_picture() {
  const char *out = strdup(orig_picture);
  assert(orig_picture[0] != '\0');
  return out;
}

char *
original_number( char input[] = NULL ) {
  if( input ) {
    if(sizeof(orig_number) < strlen(input) ) return NULL;
    strcpy(orig_number, input);
    return input;
  }
  char *out = strdup(orig_number);
  assert(orig_number[0] != '\0');
  return out;
}

static bool need_level = true;

void field_done() { orig_picture[0] = '\0'; need_level = true; }

/*
 * Local functions
 */

static inline int numstr_of( const char string[], radix_t radix = decimal_e ) {
  yylval.numstr.radix = radix;
  ydflval.string = yylval.numstr.string = strdup(string);
  char *comma = strchr(yylval.numstr.string, ',');
  if( comma && comma[1] == '\0' ) *comma = '\0';
  if( ! original_number(yylval.numstr.string) ) {
    yyerrorv("error: input inconceivably long");
    return NO_CONDITION;
  }

  const char *input = yylval.numstr.string;
  auto eoinput = input + strlen(input);
  auto p = std::find_if( input, eoinput,
			 []( char ch ) { return ch == 'e' || ch == 'E';} );

  if( p < eoinput ) {
    if( eoinput == std::find(input, eoinput, symbol_decimal_point()) ) {
      // no decimal point: 1E0 is a valid user-defined name
      ydflval.string = yylval.string = yylval.numstr.string;
      return NAME;
    }
    assert(input < p);
    // "The literal to the left of the 'E' represents the significand. It may
    //  be signed and shall include a decimal point. The significand shall be
    //  from 1 to 36 digits in length."
    if( p == std::find(input, p, symbol_decimal_point()) ) {
      return NO_CONDITION;
    }
    auto nx = std::count_if(input, p, isdigit);
    if( 36 < nx ) {
      yyerrorv("error: significand of %s has more than 36 digits (%zu)", input, nx);
      return NO_CONDITION;
    }

    // "The literal to the right of the 'E' represents the exponent. It may be
    //  signed and shall have a maximum of four digits and no decimal point. "
    // "The maximum permitted value and minimum permitted value of the
    //  exponent is implementor-defined." (We allow 9999.)
    nx = std::count_if(p, eoinput, isdigit);
    if( 4 < nx ) {
      yyerrorv("error: exponent %s more than 4 digits", ++p);
      return NO_CONDITION;
    }
    if( eoinput != std::find(p, eoinput, symbol_decimal_point()) ) {
      yyerrorv("error: exponent includes decimal point", ++p);
      return NO_CONDITION;
    }

    // "If all the digits in the significand are zero, then all the digits of
    //  the exponent shall also be zero and neither significand nor exponent
    //  shall have a negative sign."
    bool zero_signficand = std::all_of( input, p,
					[]( char ch ) {
					  return !isdigit(ch) || ch == '0'; } );
    if( zero_signficand ) {
      if( p != std::find(input, p, '-') ) {
	yyerrorv("error: zero significand of %s "
		 "cannot be negative", input);
	return NO_CONDITION;
      }
      if( eoinput != std::find(p, eoinput, '-') ) {
	yyerrorv("error: exponent of zero significand of %s "
		 "cannot be negative", input);
	return NO_CONDITION;
      }
    }
  }
  if( 1 < std::count(input, eoinput, symbol_decimal_point()) ) {
    yyerrorv("error: invalid numeric literal", ++p);
    return NO_CONDITION;
  }

  return NUMSTR;
}

static char *
null_trim( char name[] ) {
  auto p = std::find_if( name, name + strlen(name), isspace );
  if( p < name + strlen(name) ) *p = '\0';
  return name;
}

/*
 * CDF management
 */
static int final_token, penultimate_token;
static int separator = '\0';

struct cdf_status_t {
  int  token;
  bool lexing;
  cdf_status_t( int token = 0, bool lexing = true )
    : token(token), lexing(lexing)
  {}
  bool toggle() { return lexing = ! lexing; }
};

static inline const char *
boolalpha( bool tf ) { return tf? "True" : "False"; }

/*
 * Scanning status is true if tokens are being parsed and false if not (because
 * CDF is skipping some code).  Because CDF status is nested, status is true
 * only if the whole stack is true. That is, if B is stacked on A, and A is
 * false, then all of B is skipped, regardless of >>IF and >>ELSE for B.
*/

static class lexing_status_t : public std::stack<cdf_status_t> {
 public:
  int (*parser)(void) = yyparse;

  bool on() const { // true only if all true
    bool lexing = std::all_of( c.begin(), c.end(),
			       []( const auto& status ) { return status.lexing; } );
    return lexing;
  }

  bool feed_the_parser() const {
    return on() || parser == ydfparse;
  }

} lexing;

static int scanner_token() {
  if( lexing.empty() ) {
    yyerror("error: >>ELSE or >>END-IF without >>IF");
    return NO_CONDITION;
  }
  return lexing.top().token;
}

bool scanner_lexing() { return lexing.on(); }
void scanner_lexing( int token, bool tf ) {
  lexing.push( cdf_status_t(token, tf) );
  if( yydebug )
    warnx("%s @ %d: %s: scanning now %s, depth %zu", __func__, yylineno,
	  keyword_str(token), boolalpha(lexing.on()), lexing.size());
}
void scanner_lexing_toggle() {
  if( lexing.empty() ) {
    yyerror("error: >>ELSE without >>IF");
    return;
  }
  lexing.top().toggle();
  if( yydebug ) warnx("%s @ %d: scanning now %s", __func__, yylineno,
		      boolalpha(lexing.on()));
}
void scanner_lexing_pop() {
  if( lexing.empty() ) {
    yyerror("error: >>END-IF without >>IF");
    return;
  }
  lexing.pop();
  if( yydebug ) warnx("%s @ %d: scanning now %s, depth %zu", __func__, yylineno,
		      boolalpha(lexing.on()), lexing.size());
}


/*
 * The penultimate token is used to identify paragraph names in the presence of
 * directives, which are logical whitespace.  It is the next-to-last token
 * returned by the lexer. If the current token is NAME and is followed by a
 * dot, and the prior token was a dot, then the apparent NAME is really a
 * PARAGRAPH name.
 *
 * CDF directives (and #FILE push/pop) set and clear the penultimate token
 * because, as they lex, they change the most recently seen tokens.  When the
 * CDF parser finishes, it clears the penultimate token.  For #FILE, there's no
 * lexing, so the penultimate token is set only once, in case it's a dot and
 * the very next token is a NAME.
 */
static bool penultimate_clear;

static char
penultimate_set( int line, char token, bool once ) {
  penultimate_clear = once;

  if( penultimate_token == token ) {
    return token;
  }
  char output = penultimate_token;
  penultimate_token = token;
  if( penultimate_clear && yy_flex_debug ) {
    warnx( "scan.l:%d: changed penultimate_token from '%c' to '%c'",
	   line, output, token );
  }
  return output;
}
#define penultimate(T) penultimate_set(__LINE__, (T), false)
#define penultimate_once(T) penultimate_set(__LINE__, (T), true)

static void inject_token( int token ) { separator = token; }

#define YY_DECL int lexer(void)

#define YY_USER_ACTION					\
  yylloc.first_line = yylloc.last_line = yylineno;	\
  if( separator != '\0') {				\
    int sep = separator;				\
    separator = '\0';					\
    return sep;						\
  }

# define YY_INPUT(buf, result, max_size)			\
{								\
  if( 0 == (result = lexer_input(buf, max_size, yyin)) )	\
    result = YY_NULL;						\
}

#define scomputable(T, C)				\
    yylval.computational.type=T,			\
    yylval.computational.capacity=C,			\
    yylval.computational.signable=true, COMPUTATIONAL
#define ucomputable(T, C)				\
    yylval.computational.type=T,			\
    yylval.computational.capacity=C,			\
    yylval.computational.signable=false, COMPUTATIONAL

static char *tmpstring = NULL;

#define PROGRAM current_program_index()

static uint32_t
level_of( const char input[] ) {
  unsigned int output = 0;

  if( input[0] == '0' ) input++;

  if( 1 != sscanf(input, "%u", &output) ) {
    warnx( "%s:%d: invalid level '%s'", __func__, __LINE__, input );
  }

  return output;
}

static inline int
ndigit(int len) {
  char *input = toupper(yytext[0]) == 'V'? yytext + 1 : yytext;
  int n = repeat_count(input);
  return n == -1? len : n;
picset( int token ) {
  static const char * const eop = orig_picture + sizeof(orig_picture);
  char *p = orig_picture + strlen(orig_picture);

  if( eop < p + yyleng ) {
    yyerrorv("PICTURE exceeds maximum size of %zu bytes", sizeof(orig_picture) - 1);
  }
  snprintf( p, eop - p, "%s", yytext );
  return token;
}

static inline bool
is_integer_token(void) {
  int v, n = 0;
  return 1 == sscanf(yytext, "%d%n", &v, &n) && n == yyleng;
}

static bool need_nume = false;
bool need_nume_set( bool tf ) {
  if( yydebug ) warnx( "need_nume now %s", tf? "true" : "false" );
  return need_nume = tf;
}

static int datetime_format_of( const char input[] );

static int symbol_function_token( const char name[] ) {
  auto e = symbol_function( 0, name );
  return e ? symbol_index(e) : 0;
}

static symbol_elem_t *
symbol_exists( const char name[] ) {
  typedef std::map <std::string, size_t> name_cache_t;
  static std::map <size_t, name_cache_t> cachemap;

  cbl_name_t lname;
  std::transform( name, name + strlen(name) + 1, lname, tolower );
  auto& cache = cachemap[PROGRAM];
  auto p = cache.find(lname);
  
  if( p == cache.end() ) {
    symbol_elem_t * e = symbol_field( PROGRAM, 0, name );
    if( !e ) return NULL;
    cache[lname] = symbol_index(e);
    return e;
  }

  size_t isym = p->second;
  return symbol_at(cache[lname] = isym);
}

static int
typed_name( const char name[] ) {
  if( 0 == PROGRAM ) return NAME;
  if( need_nume ) { need_nume_set(false); return NUME; }

  int token = intrinsic_repository_tok(name);
  if( token ) { return token; }

  struct symbol_elem_t *e = symbol_special( PROGRAM, name );
  if( e ) return  cbl_special_name_of(e)->token;

  e = symbol_exists( name );

  auto type = e && e->type == SymField? cbl_field_of(e)->type : FldInvalid;

  switch(type) {
  case FldLiteralA:
    {
      auto f = cbl_field_of(e);
      if( is_constant(f) ) {
	int token = datetime_format_of(f->data.initial);
	if( token ) {
	  yylval.string = strdup(f->data.initial);
	  return token;
	}
      }
    }
    __attribute__((fallthrough));
  case FldLiteralN:
    {
      auto f = cbl_field_of(e);
      if( false && 0 == (f->attr & constant_e) ) {
	yyerrorv("%s: logic error: %s is not constant", __func__, name);
      }
      if( type == FldLiteralN ) {
	yylval.numstr.radix =
	  f->has_attr(hex_encoded_e)? hexadecimal_e : decimal_e;
	yylval.numstr.string = strdup(f->data.initial);
	return NUMSTR;
      }
      if( !f->has_attr(record_key_e) ) { // not a key-name literal
	yylval.literal.set(f);
	ydflval.string = yylval.literal.data;
	return LITERAL;
      }
    }
    __attribute__((fallthrough));
  case FldInvalid:
  case FldGroup:
  case FldForward:
  case FldIndex:
  case FldAlphanumeric:
  case FldPacked:
  case FldNumericDisplay:
  case FldNumericEdited:
  case FldAlphaEdited:
  case FldNumericBinary:
  case FldFloat:
  case FldNumericBin5:
  case FldPointer:
    return NAME;
  case FldSwitch:
    return SWITCH;
  case FldClass:
    return cbl_field_of(e)->level == 88? NAME88 : CLASS_NAME;
    break;
  default:
    warnx("%s:%d: invalid symbol type %s for symbol \"%s\"",
	  __func__, __LINE__, cbl_field_type_str(type), name);
    return NAME;
  }
  return cbl_field_of(e)->level == 88? NAME88 : NAME;
}

static char *
tmpstring_append( int len ) {
  char *s;
  const char *extant = tmpstring == NULL ? "" : tmpstring;

  if( -1 == asprintf(&s, "%s%.*s", extant, len, yytext) ) {
    err(EXIT_FAILURE, "unable to tokenize '%s'", yytext);
  }
  free(tmpstring);
  if( getenv(__func__) ) {
    warnx("%s: value is now '%s'", __func__, s);
  }
  return tmpstring = s;
}

#define pop_return yy_pop_state(); return

static bool
wait_for_the_child(void) {
  pid_t pid;
  int status;

  if( (pid = wait(&status)) == -1 ) {
    warn("internal error: no pending child CDF parser process");
    return false;
  }

  if( WIFSIGNALED(status) ) {
    warnx( "process %d terminated by %s", pid, strsignal(WTERMSIG(status)) );
    return false;
  }
  if( WIFEXITED(status) ) {
    if( WEXITSTATUS(status) != 0 ) {
      warnx("process %d exited with status %d", pid, status);
      return false;
    }
  }
  if( yy_flex_debug ) {
    warnx("process %d exited with status %d", pid, status);
  }
  return true;
}

static bool is_not = false;

static uint64_t
integer_of( const char input[], bool is_hex = false) {
  uint64_t output = 0;
  const char *fmt = is_hex? "%ul" : "%hl";

  if( input[0] == '0' ) input++;

  if( 1 != sscanf(input, fmt, &output) ) {
    warnx( "%s:%d: invalid integer '%s'", __func__, __LINE__, input );
  }

  return output;
}

#if 0
static char *
radix_10( const char input[] ) {
  assert(toupper(input[0]) == 'X');
  char *input2 = strdup(input + 2);
  std::replace(input2, input2 + strlen(input2), '\"', '\'');
  char *p = strchr(input2, '\'');
  assert(p);
  *p = '\0';

  uint64_t value = integer_of(input2);

  if( -1 == asprintf( &p, "%ld", value) ) {
    warnx("%s:%d: failed to make a string of %ld", __func__, __LINE__, value);
  }
  free(input2);
  return p;
}

static bool
might_be(void) {
  assert(yyleng > 0);
  for( char *name = yytext + yyleng  - 1; name > yytext; name-- ) {
    if( isspace(*name) ) {
	symbol_elem_t *e =  symbol_exists(++name);
      // TRUE if the input names a field that isn't a level 88 or switch
      return e && e->type == SymField &&
	cbl_field_of(e)->type != FldSwitch && cbl_field_of(e)->level != 88;
    }
  }
  return false;
}

#endif