mercury/library/io.text_read.m

%---------------------------------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%---------------------------------------------------------------------------%
% Copyright (C) 1993-2012 The University of Melbourne.
% Copyright (C) 2013-2022 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%---------------------------------------------------------------------------%
%
% File: io.text_read.m.
%
% This module implements the predicates in io.m that read
% words, lines and files.
%
%---------------------------------------------------------------------------%
%---------------------------------------------------------------------------%

:- module io.text_read.
:- interface.

:- import_module bitmap.
:- import_module list.

%---------------------------------------------------------------------------%

:- pred read_word_2(io.text_input_stream::in, io.result(list(char))::out,
    io::di, io::uo) is det.

%---------------------%

:- pred read_line_2(io.text_input_stream::in, result_code::out,
    system_error::out, list(char)::out, io::di, io::uo) is det.

%---------------------%

:- type read_line_as_string_result
    --->    rlas_ok
    ;       rlas_eof
    ;       rlas_null_char
    ;       rlas_error.

:- pred read_line_as_string_2(io.stream::in, bool::in,
    read_line_as_string_result::out, system_error::out, string::out,
    io::di, io::uo) is det.

%---------------------%

:- pred read_file_as_string_2(stream::in, string::out, int::out,
    system_error::out, bool::out, io::di, io::uo) is det.

%---------------------%

:- pred read_binary_file_as_bitmap_2(io.binary_input_stream::in,
    io.res(bitmap)::out, io::di, io::uo) is det.

%---------------------------------------------------------------------------%

:- implementation.

:- import_module array.
:- import_module char.
:- import_module int.
:- import_module int64.
:- import_module io.primitives_read.

%---------------------------------------------------------------------------%

read_word_2(Stream, Result, !IO) :-
    read_char(Stream, CharResult, !IO),
    (
        CharResult = error(Error),
        Result = error(Error)
    ;
        CharResult = eof,
        Result = eof
    ;
        CharResult = ok(Char),
        ( if char.is_whitespace(Char) then
            putback_char(Stream, Char, !IO),
            Result = ok([])
        else
            read_word_2(Stream, Result0, !IO),
            (
                Result0 = ok(Chars),
                Result = ok([Char | Chars])
            ;
                Result0 = error(_),
                Result = Result0
            ;
                Result0 = eof,
                Result = ok([Char])
            )
        )
    ).

%---------------------------------------------------------------------------%

read_line_2(Stream, Result, Error, Chars, !IO) :-
    read_char_code(Stream, Result0, Error0, Char, !IO),
    (
        Result0 = result_code_ok,
        ( if Char = '\n' then
            Result = result_code_ok,
            Chars = [Char],
            Error = Error0
        else
            read_line_2(Stream, Result, Error, CharsTail, !IO),
            Chars = [Char | CharsTail] % lcmc
        )
    ;
        ( Result0 = result_code_eof
        ; Result0 = result_code_error
        ),
        Result = Result0,
        Chars = [],
        Error = Error0
    ).

%---------------------------------------------------------------------------%

:- pragma foreign_export_enum("C", read_line_as_string_result/0,
    [prefix("ML_"), uppercase]).
:- pragma foreign_export_enum("Java", read_line_as_string_result/0,
    [prefix("ML_"), uppercase]).

:- pragma foreign_proc("C",
    read_line_as_string_2(Stream::in, _FirstCall::in, Res::out, Error::out,
        RetString::out, _IO0::di, _IO::uo),
    [will_not_call_mercury, promise_pure, tabled_for_io, thread_safe,
        does_not_affect_liveness, no_sharing],
"
#define ML_IO_READ_LINE_GROW(n) ((n) * 3 / 2)
#define ML_IO_BYTES_TO_WORDS(n) (((n) + sizeof(MR_Word) - 1) / sizeof(MR_Word))
#define ML_IO_READ_LINE_START   1024

    char initial_read_buffer[ML_IO_READ_LINE_START];
    char *read_buffer = initial_read_buffer;
    size_t read_buf_size = ML_IO_READ_LINE_START;
    size_t i;
    int char_code = '\\0';

    Res = ML_RLAS_OK;
    Error = 0;
    for (i = 0; char_code != '\\n'; ) {
        char_code = mercury_get_byte(Stream);
        if (char_code == EOF) {
            if (i == 0) {
                if (MR_FERROR(*Stream)) {
                    Res = ML_RLAS_ERROR;
                    Error = errno;
                } else {
                    Res = ML_RLAS_EOF;
                }
            }
            break;
        }
        if (char_code == 0) {
            Res = ML_RLAS_NULL_CHAR;
            break;
        }
        read_buffer[i++] = (char) char_code;
        MR_assert(i <= read_buf_size);
        if (i == read_buf_size) {
            // Grow the read buffer.
            read_buf_size = ML_IO_READ_LINE_GROW(read_buf_size);
            if (read_buffer == initial_read_buffer) {
                read_buffer = MR_NEW_ARRAY(char, read_buf_size);
                MR_memcpy(read_buffer, initial_read_buffer,
                    ML_IO_READ_LINE_START);
            } else {
                read_buffer = MR_RESIZE_ARRAY(read_buffer, char,
                    read_buf_size);
            }
        }
    }
    if (Res == ML_RLAS_OK) {
        MR_Word ret_string_word;
        MR_offset_incr_hp_atomic_msg(ret_string_word,
            0, ML_IO_BYTES_TO_WORDS((i + 1) * sizeof(char)),
            MR_ALLOC_ID, ""string.string/0"");
        RetString = (MR_String) ret_string_word;
        MR_memcpy(RetString, read_buffer, i * sizeof(char));
        RetString[i] = '\\0';
    } else {
        RetString = MR_make_string_const("""");
    }
    if (read_buffer != initial_read_buffer) {
        MR_free(read_buffer);
    }
").

:- pragma foreign_proc("Java",
    read_line_as_string_2(Stream::in, _FirstCall::in, Res::out, Error::out,
        RetString::out, _IO0::di, _IO::uo),
    [will_not_call_mercury, promise_pure, tabled_for_io, thread_safe,
        does_not_affect_liveness, may_not_duplicate],
"
    try {
        RetString =
            ((jmercury.io__stream_ops.MR_TextInputFile) Stream).read_line();
        if (RetString != null) {
            Res = ML_RLAS_OK;
        } else {
            Res = ML_RLAS_EOF;
        }
        Error = null;
    } catch (java.io.IOException e) {
        Res = ML_RLAS_ERROR;
        RetString = """";
        Error = e;
    }
").

read_line_as_string_2(Stream, FirstCall, Res, Error, String, !IO) :-
    % XXX This is terribly inefficient, a better approach would be
    % to use a buffer like what is done for io.read_file_as_string.
    read_char_code(text_input_stream(Stream), ResultCode, Error0, Char, !IO),
    (
        ResultCode = result_code_ok,
        ( if Char = '\n' then
            Res = rlas_ok,
            String = "\n",
            Error = Error0
        else if char.to_int(Char, 0) then
            Res = rlas_null_char,
            String = "",
            Error = Error0
        else
            read_line_as_string_2(Stream, no, Res, Error, String0, !IO),
            string.first_char(String, Char, String0)
        )
    ;
        ResultCode = result_code_eof,
        (
            FirstCall = yes,
            Res = rlas_eof
        ;
            FirstCall = no,
            Res = rlas_ok
        ),
        String = "",
        Error = Error0
    ;
        ResultCode = result_code_error,
        Res = rlas_error,
        String = "",
        Error = Error0
    ).

%---------------------------------------------------------------------------%

:- pragma foreign_proc("Java",
    read_file_as_string_2(Stream::in, String::out, NumCUs::out,
        Error::out, NullCharError::out, _IO0::di, _IO::uo),
    [will_not_call_mercury, promise_pure, thread_safe, tabled_for_io],
"
    StringBuilder sb = new StringBuilder();
    try {
        ((jmercury.io__stream_ops.MR_TextInputFile) Stream).read_file(sb);
        Error = null;
    } catch (java.io.IOException e) {
        Error = e;
    }
    String = sb.toString();
    NumCUs = String.length();
    NullCharError = bool.NO;
").

read_file_as_string_2(Stream, Str, NumCUs, Error, NullCharError, !IO) :-
    % Check if the stream is a regular file; if so, allocate a buffer
    % according to the size of the file. Otherwise, just use a default buffer
    % size of 4k minus a bit (to give malloc some room).
    input_stream_file_size(text_input_stream(Stream), FileSize, !IO),
    ( if FileSize >= 0 then
        % When targeting C, this reserves just enough space for all the bytes
        % in the file, plus the final NUL character.
        %
        % When targeting C#, this reserves one slot in an array of code points
        % for each byte in the file, plus the NUL. This means that the buffer
        % we reserve may be bigger than needed. How much bigger depends on
        % the number of code points in the file that take more than one
        % UTF-16 code units.
        BufferSize0 = FileSize + 1
    else
        BufferSize0 = 4000
    ),
    alloc_buffer(BufferSize0, Buffer0),
    % Read the file into the buffer (resizing it as we go if necessary),
    % convert the buffer into a string, and see if anything went wrong.
    %
    % When targeting C, Pos counts UTF-8 code *units* (in the usual case
    % where the input is valid UTF-8; otherwise, it counts bytes).
    % When targeting C#, Pos counts code *points*.
    % When targeting Java, the foreign_proc above replaces this clause.
    Pos0 = 0,
    read_file_as_string_loop(text_input_stream(Stream), Buffer0, BufferSize0,
        Pos0, Str, NumCUs, Error, NullCharError, !IO).

:- pred read_file_as_string_loop(text_input_stream::in, buffer::buffer_di,
    int::in, int::in, string::out, int::out, system_error::out, bool::out,
    io::di, io::uo) is det.
% This predicate is not used when compiling to Java; this pragma avoids
% a warning even in that case.
:- pragma consider_used(pred(read_file_as_string_loop/10)).

read_file_as_string_loop(Stream, !.Buffer, BufferSize0, !.Pos,
        Str, NumCUs, Error, NullCharError, !IO) :-
    Stream = text_input_stream(RealStream),
    read_into_buffer(RealStream, !Buffer, BufferSize0, !Pos, Error0, !IO),
    ( if !.Pos < BufferSize0 then
        % Buffer is not full: end-of-file or error.
        ( if
            buffer_and_pos_to_string_and_length(!.Buffer, !.Pos,
                StrPrime, NumCUsPrime)
        then
            Str = StrPrime,
            NumCUs = NumCUsPrime,
            NullCharError = no
        else
            Str = "",
            NumCUs = 0,
            NullCharError = yes
        ),
        Error = Error0
    else if !.Pos = BufferSize0 then
        % Buffer is full; make room for more of the file.
        % Doubling its size should catch up to its actual size quickly.
        BufferSize1 = BufferSize0 * 2,
        resize_buffer(BufferSize0, BufferSize1, !Buffer),
        read_file_as_string_loop(Stream, !.Buffer, BufferSize1, !.Pos,
            Str, NumCUs, Error, NullCharError, !IO)
    else
        error("io.read_file_as_string: buffer overflow")
    ).

%---------------------%

read_binary_file_as_bitmap_2(Stream, Result, !IO) :-
    % Check if the stream is a regular file; if so, allocate a buffer
    % according to the size of the file. Otherwise, just use a default buffer
    % size of 4k minus a bit (to give malloc some room).
    binary_input_stream_file_size(Stream, FileSize, !IO),
    ( if FileSize >= 0i64 then
        binary_input_stream_offset64(Stream, CurrentOffset, !IO),
        RemainingSizeInt64 = FileSize - CurrentOffset,
        ( if
            int.bits_per_int = 32,
            RemainingSizeInt64 > int64.from_int(int.max_int)
        then
            Result = error(io_error_string("io.read_binary_file_as_bitmap: " ++
                "file size exceeds maximum buffer size"))
        else
            RemainingSize = int64.cast_to_int(RemainingSizeInt64),
            some [!BM] (
                !:BM = bitmap.init(RemainingSize * bits_per_byte),
                ( if RemainingSize = 0 then
                    Result = ok(!.BM)
                else
                    bitmap.read_bitmap_range(Stream, 0, RemainingSize, !BM,
                        BytesRead, ReadResult, !IO),
                    (
                        ReadResult = ok,
                        ( if BytesRead = RemainingSize then
                            Result = ok(!.BM)
                        else
                            Result = error(io_error_string(
                                "io.read_binary_file_as_bitmap: " ++
                                "incorrect file size"))
                        )
                    ;
                        ReadResult = error(Msg),
                        Result = error(Msg)
                    )
                )
            )
        )
    else
        BufferSize = 4000,
        read_binary_file_as_bitmap_from_stream(Stream, BufferSize,
            Res, [], RevBitmaps, !IO),
        (
            Res = ok,
            Result = ok(bitmap.append_list(reverse(RevBitmaps)))
        ;
            Res = error(Msg),
            Result = error(Msg)
        )
    ).

:- pred read_binary_file_as_bitmap_from_stream(io.binary_input_stream::in,
    num_bytes::in, io.res::out, list(bitmap)::in, list(bitmap)::out,
    io::di, io::uo) is det.

read_binary_file_as_bitmap_from_stream(Stream, BufferSize, Res, !BMs, !IO) :-
    some [!BM] (
        !:BM = bitmap.init(BufferSize * bits_per_byte),
        bitmap.read_bitmap_range(Stream, 0, BufferSize, !BM, NumBytesRead,
            ReadRes, !IO),
        (
            ReadRes = ok,
            ( if NumBytesRead < BufferSize then
                !:BM = bitmap.shrink_without_copying(!.BM,
                    NumBytesRead * bits_per_byte),
                !:BMs = [!.BM | !.BMs],
                Res = ok
            else
                !:BMs = [!.BM | !.BMs],

                % Double the buffer size each time.
                read_binary_file_as_bitmap_from_stream(Stream, BufferSize * 2,
                    Res, !BMs, !IO)
            )
        ;
            ReadRes = error(Err),
            Res = error(Err)
        )
    ).

%---------------------%

    % XXX FIXME this should return an int64.
:- pred input_stream_file_size(io.text_input_stream::in, int::out,
    io::di, io::uo) is det.
:- pragma consider_used(pred(input_stream_file_size/4)).

input_stream_file_size(text_input_stream(Stream), Size, !IO) :-
    stream_file_size(Stream, Size64, !IO),
    Size = int64.cast_to_int(Size64).

:- pred binary_input_stream_file_size(io.binary_input_stream::in, int64::out,
    io::di, io::uo) is det.

binary_input_stream_file_size(binary_input_stream(Stream), Size, !IO) :-
    stream_file_size(Stream, Size, !IO).

    % stream_file_size(Stream, Size):
    %
    % If Stream is a regular file, then set Size to its size (in bytes),
    % otherwise set Size to -1.
    %
:- pred stream_file_size(stream::in, int64::out, io::di, io::uo) is det.

:- pragma foreign_proc("C",
    stream_file_size(Stream::in, Size::out, _IO0::di, _IO::uo),
    [will_not_call_mercury, promise_pure, tabled_for_io, thread_safe,
        does_not_affect_liveness, no_sharing],
"
#if defined(MR_HAVE_FSTAT) && \
        (defined(MR_HAVE_FILENO) || defined(fileno)) && defined(S_ISREG)
    struct stat s;
    if (MR_IS_FILE_STREAM(*Stream)) {
        if (fstat(fileno(MR_file(*Stream)), &s) == 0 && S_ISREG(s.st_mode)) {
            Size = s.st_size;
        } else {
            Size = -1;
        }
    } else {
        Size = -1;
    }
#else
    Size = -1;
#endif
").

:- pragma foreign_proc("C#",
    stream_file_size(Stream::in, Size::out, _IO0::di, _IO::uo),
    [will_not_call_mercury, promise_pure, thread_safe],
"{
    if (Stream.stream.CanSeek) {
        Size = Stream.stream.Length;
    } else {
        Size = -1;
    }
}").

:- pragma foreign_proc("Java",
    stream_file_size(Stream::in, Size::out, _IO0::di, _IO::uo),
    [will_not_call_mercury, promise_pure, thread_safe, tabled_for_io],
"
    try {
        Size = ((jmercury.io__stream_ops.MR_BinaryFile) Stream).size();
    } catch (java.io.IOException e) {
        Size = -1;
    }
").

%---------------------%

% A buffer is an array of chars.
% For C backends, it is a C array of C chars.
% For other backends, it is a Mercury array of Mercury chars.

    % XXX It would be better to use a char_array type rather than array(char).
    % This is because on the Java (and maybe the C#) backend, indexing into
    % an array whose element type is known statically requires less overhead.
    %
    % It may be possible to merge with string.string_buffer.
    %
:- type buffer
    --->    buffer(array(char)).

:- pragma foreign_type(c, buffer, "char *", [can_pass_as_mercury_type]).

    % XXX Extend the workaround for no `ui' modes in array.m.
:- inst uniq_buffer for buffer/0
    --->    buffer(uniq_array).

:- mode buffer_di == di(uniq_buffer).
:- mode buffer_uo == out(uniq_buffer).

:- pred alloc_buffer(int::in, buffer::buffer_uo) is det.
:- pragma consider_used(pred(alloc_buffer/2)).

:- pragma foreign_proc("C",
    alloc_buffer(Size::in, Buffer::buffer_uo),
    [will_not_call_mercury, promise_pure, tabled_for_io, thread_safe,
        does_not_affect_liveness, no_sharing],
"{
    MR_Word buf;
    MR_offset_incr_hp_atomic_msg(buf, 0,
        (Size * sizeof(char) + sizeof(MR_Word) - 1) / sizeof(MR_Word),
        MR_ALLOC_ID, ""io.buffer/0"");
    Buffer = (char *) buf;
}").

alloc_buffer(Size, buffer(Array)) :-
    char.det_from_int(0, NullChar),
    array.init(Size, NullChar, Array).

:- pred resize_buffer(int::in, int::in,
    buffer::buffer_di, buffer::buffer_uo) is det.

:- pragma foreign_proc("C",
    resize_buffer(OldSize::in, NewSize::in,
        Buffer0::buffer_di, Buffer::buffer_uo),
    [will_not_call_mercury, promise_pure, tabled_for_io, thread_safe,
        does_not_affect_liveness],
"{
    MR_CHECK_EXPR_TYPE(Buffer0, char *);
    MR_CHECK_EXPR_TYPE(Buffer, char *);

#ifdef MR_CONSERVATIVE_GC
    Buffer = MR_GC_realloc(Buffer0, NewSize * sizeof(char));
#else
    if (Buffer0 + OldSize == (char *) MR_hp) {
        MR_Word next;
        MR_offset_incr_hp_atomic_msg(next, 0,
            (NewSize * sizeof(char) + sizeof(MR_Word) - 1)
                / sizeof(MR_Word),
            MR_ALLOC_ID, ""io.buffer/0"");
        assert(Buffer0 + OldSize == (char *) next);
        Buffer = Buffer0;
    } else {
        // Just have to alloc and copy.
        MR_Word buf;
        MR_offset_incr_hp_atomic_msg(buf, 0,
            (NewSize * sizeof(char) + sizeof(MR_Word) - 1)
                / sizeof(MR_Word),
            MR_ALLOC_ID, ""io.buffer/0"");
        Buffer = (char *) buf;
        if (OldSize > NewSize) {
            MR_memcpy(Buffer, Buffer0, NewSize);
        } else {
            MR_memcpy(Buffer, Buffer0, OldSize);
        }
    }
#endif
}").

resize_buffer(_OldSize, NewSize, buffer(Array0), buffer(Array)) :-
    char.det_from_int(0, Char),
    array.resize(NewSize, Char, Array0, Array).

:- pred buffer_and_pos_to_string_and_length(buffer::buffer_di, int::in,
    string::out, int::out) is semidet.
% This predicate is used when compiling to C and C#; this pragma avoids
% a warning when compiling to Java.
:- pragma consider_used(pred(buffer_and_pos_to_string_and_length/4)).

:- pragma foreign_proc("C",
    buffer_and_pos_to_string_and_length(Buffer::buffer_di, Pos::in,
        Str::out, NumCUs::out),
    [will_not_call_mercury, promise_pure, tabled_for_io, thread_safe,
        does_not_affect_liveness],
"{
    Str = Buffer;
    Str[Pos] = '\\0';

    // Check that the string does not contain null characters.
    if (strlen(Str) != Pos) {
        SUCCESS_INDICATOR = MR_FALSE;
    } else {
        SUCCESS_INDICATOR = MR_TRUE;
    }

    // In C, Pos counts bytes, which are the same size as UTF-8 code units.
    // NumCUs is expected to be in the code units native to the target
    // language, and this is UTF-8, so no conversion needs to be done.
    // (Compare to the C# case below.)
    NumCUs = Pos;
}").

buffer_and_pos_to_string_and_length(buffer(Array), Pos, Str, NumCUs) :-
    % This predicate is used only when compiling to C and C#, and when
    % targeting C, we use the foreign_proc above, so this clause is used
    % only when targeting C#.
    %
    % In C#, Pos counts chars, i.e. code points. Most code points occupy
    % just one UTF-16 code unit, but some occupy two. The call below to
    % semidet_from_char_list will do this expansion as necessary.
    % We can't know how many code units the final string contains
    % until we count them. (Compare to the C case above.)
    %
    % XXX The current implementation of read_file_as_string_2
    % reads in code units one by one, converts them to code points
    % to store them in array slots, then converts the array to a string,
    % which converts each code point back into one or two UTF-16 code units.
    % A fully C#-specific implementation of read_file_as_string_2,
    % one not shared with C, should be able to dispense with all the
    % redundant conversions.
    array.fetch_items(Array, min(Array), min(Array) + Pos - 1, List),
    string.semidet_from_char_list(List, Str),
    string.length(Str, NumCUs).

:- pred read_into_buffer(stream::in, buffer::buffer_di, buffer::buffer_uo,
    int::in, int::in, int::out, system_error::out, io::di, io::uo) is det.

:- pragma foreign_proc("C",
    read_into_buffer(Stream::in, Buffer0::buffer_di, Buffer::buffer_uo,
        BufferSize::in, Pos0::in, Pos::out, Error::out, _IO0::di, _IO::uo),
    [will_not_call_mercury, promise_pure, tabled_for_io, thread_safe,
        does_not_affect_liveness],
"
    size_t bytes_to_read;
    size_t bytes_read;

    MR_CHECK_EXPR_TYPE(Buffer0, char *);
    MR_CHECK_EXPR_TYPE(Buffer, char *);

    bytes_to_read = BufferSize - Pos0;
    bytes_read = MR_READ(*Stream, Buffer0 + Pos0, bytes_to_read);

    Buffer = Buffer0;
    Pos = Pos0 + bytes_read;

    if (bytes_read < bytes_to_read && MR_FERROR(*Stream)) {
        Error = errno;
    } else {
        Error = 0;
    }
").

read_into_buffer(Stream, buffer(Array0), buffer(Array), BufferSize,
        !Pos, Error, !IO) :-
    % This predicate is used only when compiling to C and C#, and when
    % targeting C, we use the foreign_proc above, so this clause is used
    % only when targeting C#.
    read_into_array(text_input_stream(Stream), Array0, Array, BufferSize,
        !Pos, Error, !IO).

:- pred read_into_array(io.text_input_stream::in,
    array(char)::array_di, array(char)::array_uo, int::in, int::in, int::out,
    system_error::out, io::di, io::uo) is det.
% This predicate is not used when compiling to C or Java; this pragma avoids
% a warning even in those cases.
:- pragma consider_used(pred(read_into_array/9)).

read_into_array(Stream, !Array, ArraySize, !Pos, Error, !IO) :-
    ( if !.Pos >= ArraySize then
        Error = no_error
    else
        read_char_code(Stream, ResultCode, Error0, Char, !IO),
        (
            ResultCode = result_code_ok,
            array.set(!.Pos, Char, !Array),
            !:Pos = !.Pos + 1,
            read_into_array(Stream, !Array, ArraySize, !Pos, Error, !IO)
        ;
            ( ResultCode = result_code_eof
            ; ResultCode = result_code_error
            ),
            Error = Error0
        )
    ).

%---------------------------------------------------------------------------%
:- end_module io.text_read.
%---------------------------------------------------------------------------%