cobol: Improve binary-to-string conversion.

COBOL often requires the conversion of binary integers to string of characters.
These changes replace a naive routine that peels decimal digits from a binary
value one digit at a time, with a divide-and-conquer algorithm that is twice as
fast even for a couple of digits, and is about eight times faster past ten
digits.

Included here are some minor fixes to the lexer and parser.

gcc/cobol/ChangeLog:

	* cbldiag.h (location_dump): Source code formatting.
	* parse.y: error_msg formatting.
	* scan.l: Remove UTF-8 character from regex pattern.
	* scan_ante.h (numstr_of): error_msg formatting.
	* show_parse.h (class ANALYZE): Suppress cppcheck error.
	* util.cc (cbl_field_t::report_invalid_initial_value):
	error_msg formatting.

libgcobol/ChangeLog:

	* Makefile.am: Include new stringbin.cc file.
	* Makefile.in: Regenerated.
	* libgcobol.cc (__gg__power_of_ten): Improve error message.
	(__gg__binary_to_string): Deleted.
	(__gg__binary_to_string_internal): Deleted.
	(int128_to_field): Use new conversion routine.
	(__gg__move): Use new conversion routine.
	* stringbin.cc: New file. Implements new conversion routine.
	* stringbin.h: New file. Likewise.
This commit is contained in:
Robert Dubner
2025-08-07 15:52:02 -04:00
parent 932b764be4
commit c684053fc0
11 changed files with 397 additions and 66 deletions

View File

@@ -122,8 +122,8 @@ static void
location_dump( const char func[], int line, const char tag[], const LOC& loc) {
extern int yy_flex_debug; // cppcheck-suppress shadowVariable
if( yy_flex_debug ) {
const char *detail = gcobol_getenv("update_location"); // cppcheck-suppress knownConditionTrueFalse
if( detail ) {
const char *detail = gcobol_getenv("update_location");
if( detail ) { // cppcheck-suppress knownConditionTrueFalse
fprintf(stderr, "%s:%d: %s location (%d,%d) to (%d,%d)\n",
func, line, tag,
loc.first_line, loc.first_column, loc.last_line, loc.last_column);

View File

@@ -10336,8 +10336,8 @@ intrinsic: function_udf
if( p != NULL ) {
auto loc = symbol_field_location(field_index(p->field));
error_msg(loc, "FUNCTION %qs has "
"inconsistent parameter type %td (%qs)",
keyword_str($1), p - args.data(), name_of(p->field) );
"inconsistent parameter type %ld (%qs)",
keyword_str($1), (long)(p - args.data()), name_of(p->field) );
YYERROR;
}
$$ = is_numeric(args[0].field)?

View File

@@ -123,7 +123,7 @@ NUMEDCHAR [BPVZ90/,]+{COUNT}?
NUMEDCHARS {NUMEDCHAR}([.]?{NUMEDCHAR})*
NUMED ([+-]{NUMEDCHARS}+)|({NUMEDCHARS}+[+-])
CURRENCY [A-Zfhijklmoqtuwy\x80-\xFF]{-}[ABCDEGNPRSVXZ]
NUMEDCUR (([.]?[-$0B/Z*+,P9()V+]|{CURRENCY}+|{COUNT})+([.][$0B/Z*+P9()V+\])*)+
NUMEDCUR (([.]?[$0B/Z*+,P9()V+-]|{CURRENCY}+|{COUNT})+([.][$0B/Z*+P9()V+-])*)+
NUMEDITED {NUMED}|{NUMEDCUR}
EDITED {ALPHED}|{NUMED}|{NUMEDCUR}

View File

@@ -149,7 +149,7 @@ numstr_of( const char string[], radix_t radix = decimal_e ) {
}
auto nx = std::count_if(input, p, fisdigit);
if( 36 < nx ) {
error_msg(yylloc, "significand of %s has more than 36 digits (%td)", input, nx);
error_msg(yylloc, "significand of %s has more than 36 digits (%ld)", input, (long)nx);
return NO_CONDITION;
}

View File

@@ -500,7 +500,7 @@ class ANALYZE
int level;
inline static int analyze_level=1;
public:
ANALYZE(const char *func_) : func(func_)
ANALYZE(const char *func_) : func(func_) // cppcheck-suppress noExplicitConstructor
{
level = 0;
if( getenv("Analyze") )

View File

@@ -1049,8 +1049,8 @@ cbl_field_t::report_invalid_initial_value(const YYLTYPE& loc) const {
return TOUPPER(ch) == 'E';
} );
if( !has_exponent && data.precision() < pend - p ) {
error_msg(loc, "%s cannot represent VALUE %qs exactly (max %c%td)",
name, data.initial, '.', pend - p);
error_msg(loc, "%s cannot represent VALUE %qs exactly (max %c%ld)",
name, data.initial, '.', (long)(pend - p));
}
}
}

View File

@@ -42,6 +42,7 @@ libgcobol_la_SOURCES = \
intrinsic.cc \
io.cc \
libgcobol.cc \
stringbin.cc \
valconv.cc
WARN_CFLAGS = -W -Wall -Wwrite-strings

View File

@@ -178,7 +178,7 @@ libgcobol_la_LIBADD =
@BUILD_LIBGCOBOL_TRUE@am_libgcobol_la_OBJECTS = charmaps.lo \
@BUILD_LIBGCOBOL_TRUE@ constants.lo gfileio.lo gmath.lo \
@BUILD_LIBGCOBOL_TRUE@ intrinsic.lo io.lo libgcobol.lo \
@BUILD_LIBGCOBOL_TRUE@ valconv.lo
@BUILD_LIBGCOBOL_TRUE@ stringbin.lo valconv.lo
libgcobol_la_OBJECTS = $(am_libgcobol_la_OBJECTS)
@BUILD_LIBGCOBOL_TRUE@am_libgcobol_la_rpath = -rpath $(toolexeclibdir)
AM_V_P = $(am__v_P_@AM_V@)
@@ -404,6 +404,7 @@ gcc_version := $(shell @get_gcc_base_ver@ $(top_srcdir)/../gcc/BASE-VER)
@BUILD_LIBGCOBOL_TRUE@ intrinsic.cc \
@BUILD_LIBGCOBOL_TRUE@ io.cc \
@BUILD_LIBGCOBOL_TRUE@ libgcobol.cc \
@BUILD_LIBGCOBOL_TRUE@ stringbin.cc \
@BUILD_LIBGCOBOL_TRUE@ valconv.cc
@BUILD_LIBGCOBOL_TRUE@WARN_CFLAGS = -W -Wall -Wwrite-strings
@@ -526,6 +527,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/intrinsic.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/io.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgcobol.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stringbin.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/valconv.Plo@am__quote@
.cc.o:

View File

@@ -72,6 +72,8 @@
#include <sys/time.h>
#include <execinfo.h>
#include "exceptl.h"
#include "stringbin.h"
/* BSD extension. */
#if !defined(LOG_PERROR)
@@ -798,7 +800,7 @@ __gg__power_of_ten(int n)
fprintf(stderr,
"Trying to raise 10 to %d as an int128, which we can't do.\n",
n);
fprintf(stderr, "The problem is in %s.\n", __func__);
fprintf(stderr, "The problem is in %s %s:%d.\n", __func__, __FILE__, __LINE__);
abort();
}
if( n <= MAX_POWER )
@@ -875,56 +877,6 @@ __gg__scale_by_power_of_ten_2(__int128 value, int N)
return value;
}
extern "C"
bool
__gg__binary_to_string(char *result, int digits, __int128 value)
{
// The result is not terminated, because this routine is used
// to put information directly into cblc_field_t::data
// Our caller has to keep track of whether value was negative.
// Note that this routine operates in the source code-set space; that is
// the result comes back with zero as an ASCII 0x30, not an EBCDIC 0xF0
if( value < 0 )
{
value = -value;
}
result += digits-1 ;
while( digits-- )
{
*result-- = value%10 + ascii_zero;
value /= 10;
}
// Should value be non-zero, it means we potentially have a size error
return value != 0;
}
extern "C"
bool
__gg__binary_to_string_internal(char *result, int digits, __int128 value)
{
// The result is not terminated, because this routine is used
// to put information directly into cblc_field_t::data
// Our caller has to keep track of whether value was negative.
// Note that this routine operates in the source code-set space; that is
// the result comes back with zero as an ASCII 0x30, not an EBCDIC 0xF0
if( value < 0 )
{
value = -value;
}
result += digits-1 ;
while( digits-- )
{
*result-- = (value%10) + internal_zero;
value /= 10;
}
// Should value be non-zero, it means we potentially have a size error
return value != 0;
}
static bool
value_is_too_big(const cblc_field_t *var,
__int128 value,
@@ -1617,9 +1569,13 @@ int128_to_field(cblc_field_t *var,
// Note that sending a signed value to an alphanumeric strips off
// any plus or minus signs.
memset(location, 0, length);
size_error = __gg__binary_to_string_internal(
PTRCAST(char, location),
length, value);
PTRCAST(char, location),
length > MAX_FIXED_POINT_DIGITS
? MAX_FIXED_POINT_DIGITS
: length,
value);
break;
case FldNumericDisplay:
@@ -1708,7 +1664,7 @@ int128_to_field(cblc_field_t *var,
// At this point, value is scaled to the target's rdigits
size_error = __gg__binary_to_string(ach, var->digits, value);
size_error = __gg__binary_to_string_ascii(ach, var->digits, value);
ach[var->digits] = NULLCH;
// Convert that string according to the PICTURE clause
@@ -1749,7 +1705,7 @@ int128_to_field(cblc_field_t *var,
case FldAlphaEdited:
{
char ach[128];
size_error = __gg__binary_to_string(ach, length, value);
size_error = __gg__binary_to_string_ascii(ach, length, value);
ach[length] = NULLCH;
// Convert that string according to the PICTURE clause
@@ -6126,7 +6082,7 @@ __gg__move( cblc_field_t *fdest,
// Convert it to the full complement of digits available
// from the source...but no more
__gg__binary_to_string(ach, source_digits, value);
__gg__binary_to_string_ascii(ach, source_digits, value);
// Binary to string returns ASCII characters:
for(int i=0; i<source_digits; i++)

330
libgcobol/stringbin.cc Normal file
View File

@@ -0,0 +1,330 @@
/*
* Copyright (c) 2021-2025 Symas Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of the Symas Corporation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <set>
#include <stack>
#include <string>
#include <unordered_map>
#include <vector>
#include <dirent.h>
#include <dlfcn.h>
#include <err.h>
#include <fcntl.h>
#include <fenv.h>
#include <math.h> // required for fpclassify(3), not in cmath
#include <setjmp.h>
#include <signal.h>
#include <syslog.h>
#include <unistd.h>
#include <stdarg.h>
#if __has_include(<errno.h>)
# include <errno.h> // for program_invocation_short_name
#endif
#include "config.h"
#include "libgcobol-fp.h"
#include "ec.h"
#include "common-defs.h"
#include "io.h"
#include "gcobolio.h"
#include "libgcobol.h"
#include "gfileio.h"
#include "charmaps.h"
#include "valconv.h"
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <execinfo.h>
#include "exceptl.h"
#include "stringbin.h"
/* This routine evolved from a primitive binary-to-string routine that simply
peeled digits off the bottom of an __int128 by using
value % 10 + '0';
value /= 10;
That turns out to be unnecessarily slow.
The routine implemented here uses a divide-and-conquer approach to
minimimizing the number of operations, and when you get down to two
digits it does a divide-by-100 and uses the remainder in a table lookup
to get the digits. */
/* These static tables are born of a pathologic desire to avoid calculations.
Whether that paranoia is justified (perhaps "digit%10 + '0';" ) would
actually be faster) is currently untested. But I figured this would be
pretty darn fast.
Use them when you know the index is between zero and one hundred. */
static const char digit_low[100] =
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
};
static const char digit_high[100] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
};
static char combined_string[128];
static char zero_char;
typedef struct
{
int start;
int run;
union
{
unsigned __int128 val128;
uint64_t val64;
uint32_t val32;
uint16_t val16;
uint8_t val8;
};
} COMBINED;
static
void
string_from_combined(const COMBINED &combined)
{
COMBINED left;
COMBINED right;
uint16_t v16;
switch(combined.run)
{
case 1:
// We know that val8 is a single digit
combined_string[combined.start] = combined.val8 + zero_char;;
break;
case 2:
// We know that val8 has two digits
combined_string[combined.start] = digit_high[combined.val8] + zero_char;
combined_string[combined.start+1] = digit_low [combined.val8] + zero_char;
break;
case 3:
// We know that val16 has three digits.
v16 = combined.val16;
combined_string[combined.start] = v16 / 100 + zero_char;
v16 %= 100;
combined_string[combined.start+1] = v16 / 10 + zero_char;
combined_string[combined.start+2] = v16 % 10 + zero_char;
break;
case 4:
// We know that val16 has four digits:
v16 = combined.val16;
combined_string[combined.start] = v16 / 1000 + zero_char;
v16 %= 1000;
combined_string[combined.start+1] = v16 / 100 + zero_char;
v16 %= 100;
combined_string[combined.start+2] = v16 / 10 + zero_char;
combined_string[combined.start+3] = v16 % 10 + zero_char;
break;
case 5:
case 6:
case 7:
case 8:
// We know that val32 can be treated as two 4-digit pieces
left.start = combined.start;
left.run = combined.run - 4;
left.val16 = combined.val32 / 10000;
right.start = combined.start+left.run;
right.run = 4;
right.val16 = combined.val32 % 10000;
string_from_combined(left);
string_from_combined(right);
break;
case 9:
// We break val32 into a 1-digit piece, and an 8-digit piece:
left.start = combined.start;
left.run = combined.run - 8;
left.val32 = combined.val32 / 100000000;
right.start = combined.start+left.run;
right.run = 8;
right.val32 = combined.val32 % 100000000;
string_from_combined(left);
string_from_combined(right);
break;
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
case 16:
case 17:
case 18:
// We know we can treat val64 as two 9-digit pieces:
left.start = combined.start;
left.run = combined.run - 9;
left.val32 = combined.val64 / 1000000000;
right.start = combined.start+left.run;
right.run = 9;
right.val32 = combined.val64 % 1000000000;
string_from_combined(left);
string_from_combined(right);
break;
case 19:
// We split off the bottom nine digits
left.start = combined.start;
left.run = combined.run - 9;
left.val64 = combined.val64 / 1000000000;
right.start = combined.start+left.run;
right.run = 9;
right.val32 = combined.val64 % 1000000000;
string_from_combined(left);
string_from_combined(right);
break;
default:
// For twenty or more digits we peel eighteen digits at a time off the
// right side:
left.start = combined.start;
left.run = combined.run - 18;
left.val128 = combined.val128 / 1000000000000000000ULL;
right.start = combined.start+left.run;
right.run = 18;
right.val64 = combined.val128 % 1000000000000000000ULL;
string_from_combined(left);
string_from_combined(right);
break;
}
}
bool
__gg__binary_to_string_ascii(char *result, int digits, __int128 value)
{
zero_char = ascii_zero;
// Note that this routine does not terminate the generated string with a
// NUL. This routine is sometimes used to generate a NumericDisplay string
// of digits in place, with no terminator.
__int128 mask = __gg__power_of_ten(digits);
COMBINED combined;
if( value < 0 )
{
value = -value;
}
// A non-zero retval means the number was too big to fit into the desired
// number of digits:
bool retval = !!(value / mask);
// mask off the bottom digits to avoid garbage when value is too large
value %= mask;
combined.start = 0;
combined.run = digits;
combined.val128 = value;
string_from_combined(combined);
memcpy(result, combined_string, digits);
return retval;
}
bool
__gg__binary_to_string_internal(char *result, int digits, __int128 value)
{
zero_char = internal_zero;
// Note that this routine does not terminate the generated string with a
// NUL. This routine is sometimes used to generate a NumericDisplay string
// of digits in place, with no terminator.
__int128 mask = __gg__power_of_ten(digits);
COMBINED combined;
if( value < 0 )
{
value = -value;
}
// A non-zero retval means the number was too big to fit into the desired
// number of digits:
bool retval = !!(value / mask);
// mask off the bottom digits to avoid garbage when value is too large
value %= mask;
combined.start = 0;
combined.run = digits;
combined.val128 = value;
string_from_combined(combined);
memcpy(result, combined_string, digits);
return retval;
}

42
libgcobol/stringbin.h Normal file
View File

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2021-2025 Symas Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of the Symas Corporation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef STRINGBIN_H_
#define STRINGBIN_H_
extern "C"
bool __gg__binary_to_string_ascii(char *result,
int digits,
__int128 value);
extern "C"
bool __gg__binary_to_string_internal( char *result,
int digits,
__int128 value);
#endif