# ------------------------------------------------------------------------------
# (C) Crown copyright Met Office. All rights reserved.
# For further details please refer to the file COPYRIGHT.txt
# which you should have received as part of this distribution.
# ------------------------------------------------------------------------------
use strict;
use warnings;

# ------------------------------------------------------------------------------
package Fcm::Build::Fortran;

use Text::Balanced qw{extract_bracketed extract_delimited};

# Actions of this class
my %ACTION_OF = (extract_interface => \&_extract_interface);

# Regular expressions
# Matches a variable attribute
my $RE_ATTR = qr{
    allocatable|dimension|external|intent|optional|parameter|pointer|save|target
}imsx;
# Matches a name
my $RE_NAME = qr{[A-Za-z]\w*}imsx;
# Matches a specification type
my $RE_SPEC = qr{
    character|complex|double\s*precision|integer|logical|real|type
}imsx;
# Matches the identifier of a program unit that does not have arguments
my $RE_UNIT_BASE = qr{block\s*data|module|program}imsx;
# Matches the identifier of a program unit that has arguments
my $RE_UNIT_CALL = qr{function|subroutine}imsx;
# Matches the identifier of any program unit
my $RE_UNIT      = qr{$RE_UNIT_BASE|$RE_UNIT_CALL}msx;
my %RE = (
    # A comment line
    COMMENT     => qr{\A\s*(?:!|\z)}msx,
    # A trailing comment, capture the expression before the comment
    COMMENT_END => qr{\A([^'"]*?)\s*!.*\z}msx,
    # A contination marker, capture the expression before the marker
    CONT        => qr{\A(.*)&\s*\z}msx,
    # A contination marker at the beginning of a line, capture the marker and
    # the expression after the marker
    CONT_LEAD   => qr{\A(\s*&)(.*)\z}msx,
    # Capture a variable identifier, removing any type component expression
    NAME_COMP   => qr{\b($RE_NAME)(?:\s*\%\s*$RE_NAME)*\b}msx,
    # Matches the first identifier in a line
    NAME_LEAD   => qr{\A\s*$RE_NAME\s*}msx,
    # Captures a name identifier after a comma, and the expression after
    NAME_LIST   => qr{\A(?:.*?)\s*,\s*($RE_NAME)\b(.*)\z}msx,
    # Captures the next quote character
    QUOTE       => qr{\A[^'"]*(['"])}msx,
    # Matches an attribute declaration
    TYPE_ATTR   => qr{\A\s*($RE_ATTR)\b}msx,
    # Matches a type declaration
    TYPE_SPEC   => qr{\A\s*($RE_SPEC)\b}msx,
    # Captures the expression after one or more program unit attributes
    UNIT_ATTR   => qr{\A\s*(?:(?:elemental|recursive|pure)\s+)+(.*)\z}imsx,
    # Captures the identifier and the symbol of a program unit with no arguments
    UNIT_BASE   => qr{\A\s*($RE_UNIT_BASE)\s+($RE_NAME)\s*\z}imsx,
    # Captures the identifier and the symbol of a program unit with arguments
    UNIT_CALL   => qr{\A\s*($RE_UNIT_CALL)\s+($RE_NAME)\b}imsx,
    # Captures the end of a program unit, its identifier and its symbol
    UNIT_END    => qr{\A\s*(end)(?:\s+($RE_NAME)(?:\s+($RE_NAME))?)?\s*\z}imsx,
    # Captures the expression after a program unit type specification
    UNIT_SPEC   => qr{\A\s*$RE_SPEC\b(.*)\z}imsx,
);

# Keywords in type declaration statements
my %TYPE_DECL_KEYWORD_SET = map { ($_, 1) } qw{
    allocatable
    dimension
    in
    inout
    intent
    kind
    len
    optional
    out
    parameter
    pointer
    save
    target
};

# Creates and returns an instance of this class.
sub new {
    my ($class) = @_;
    bless(
        sub {
            my $key = shift();
            if (!exists($ACTION_OF{$key})) {
                return;
            }
            $ACTION_OF{$key}->(@_);
        },
        $class,
    );
}

# Methods.
for my $key (keys(%ACTION_OF)) {
    no strict qw{refs};
    *{$key} = sub { my $self = shift(); $self->($key, @_) };
}

# Extracts the calling interfaces of top level subroutines and functions from
# the $handle for reading Fortran sources.
sub _extract_interface {
    my ($handle) = @_;
    map { _present_line($_) } @{_reduce_to_interface(_load($handle))};
}

# Reads $handle for the next Fortran statement, handling continuations.
sub _load {
    my ($handle) = @_;
    my $ctx = {signature_token_set_of => {}, statements => []};
    my $state = {
        in_contains  => undef, # in a "contains" section of a program unit
        in_interface => undef, # in an "interface" block
        in_quote     => undef, # in a multi-line quote
        stack        => [],    # program unit stack
    };
    my $NEW_STATEMENT = sub {
        {   name        => q{}, # statement name, e.g. function, integer, ...
            lines       => [],  # original lines in the statement
            line_number => 0,   # line number (start) in the original source
            symbol      => q{}, # name of a program unit (signature, end)
            type        => q{}, # e.g. signature, use, type, attr, end
            value       => q{}, # the actual value of the statement
        };
    };
    my $statement;
LINE:
    while (my $line = readline($handle)) {
        if (!defined($statement)) {
            $statement = $NEW_STATEMENT->();
        }
        my $value = $line;
        chomp($value);
        # Pre-processor directives and continuation
        if (!$statement->{line_number} && index($value, '#') == 0) {
            $statement->{line_number} = $.;
            $statement->{name}        = 'cpp';
        }
        if ($statement->{name} eq 'cpp') {
            push(@{$statement->{lines}}, $line);
            $statement->{value} .= $value;
            if (rindex($value, '\\') != length($value) - 1) {
                $statement = undef;
            }
            next LINE;
        }
        # Normal Fortran
        if ($value =~ $RE{COMMENT}) {
            next LINE;
        }
        if (!$statement->{line_number}) {
            $statement->{line_number} = $.;
        }
        my ($cont_head, $cont_tail);
        if ($statement->{line_number} != $.) { # is a continuation
            ($cont_head, $cont_tail) = $value =~ $RE{CONT_LEAD};
            if ($cont_head) {
                $value = $cont_tail;
            }
        }
        # Correctly handle ! and & in quotes
        my ($head, $tail) = (q{}, $value);
        if ($state->{in_quote} && index($value, $state->{in_quote}) >= 0) {
            my $index = index($value, $state->{in_quote});
            $head = substr($value, 0, $index + 1);
            $tail
                = length($value) > $index + 1
                ? substr($value, $index + 2)
                : q{};
            $state->{in_quote} = undef;
        }
        if (!$state->{in_quote}) {
            while ($tail) {
                if (index($tail, q{!}) >= 0) {
                    if (!($tail =~ s/$RE{COMMENT_END}/$1/)) {
                        ($head, $tail, $state->{in_quote})
                            = _load_extract_quote($head, $tail);
                    }
                }
                else {
                    while (index($tail, q{'}) > 0
                        || index($tail, q{"}) > 0)
                    {
                        ($head, $tail, $state->{in_quote})
                            = _load_extract_quote($head, $tail);
                    }
                    $head .= $tail;
                    $tail = q{};
                }
            }
        }
        $cont_head ||= q{};
        push(@{$statement->{lines}}, $cont_head . $head . $tail . "\n");
        $statement->{value} .= $head . $tail;
        # Process a statement only if it is marked with a continuation
        if (!($statement->{value} =~ s/$RE{CONT}/$1/)) {
            $statement->{value} =~ s{\s+\z}{}msx;
            if (_process($statement, $ctx, $state)) {
                push(@{$ctx->{statements}}, $statement);
            }
            $statement = undef;
        }
    }
    return $ctx;
}

# Helper, removes a quoted string from $tail.
sub _load_extract_quote {
    my ($head, $tail) = @_;
    my ($extracted, $remainder, $prefix)
        = extract_delimited($tail, q{'"}, qr{[^'"]*}msx, q{});
    if ($extracted) {
        return ($head . $prefix . $extracted, $remainder);
    }
    else {
        my ($quote) = $tail =~ $RE{QUOTE};
        return ($head . $tail, q{}, $quote);
    }
}

# Study statements and put attributes into array $statements
sub _process {
    my ($statement, $ctx, $state) = @_;
    my $name;

    # End Interface
    if ($state->{in_interface}) {
        if ($statement->{value} =~ qr{\A\s*end\s*interface\b}imsx) {
            $state->{in_interface} = 0;
        }
        return;
    }

    # End Program Unit
    if (@{$state->{stack}} && $statement->{value} =~ qr{\A\s*end\b}imsx) {
        my ($end, $type, $symbol) = lc($statement->{value}) =~ $RE{UNIT_END};
        if (!$end) {
            return;
        }
        my ($top_type, $top_symbol) = @{$state->{stack}->[-1]};
        if (!$type
            || $top_type eq $type && (!$symbol || $top_symbol eq $symbol))
        {
            pop(@{$state->{stack}});
            if ($state->{in_contains} && !@{$state->{stack}}) {
                $state->{in_contains} = 0;
            }
            if (!$state->{in_contains}) {
                $statement->{name}   = $top_type;
                $statement->{symbol} = $top_symbol;
                $statement->{type}   = 'end';
                return $statement;
            }
        }
        return;
    }

    # Interface/Contains
    ($name) = $statement->{value} =~ qr{\A\s*(contains|interface)\b}imsx;
    if ($name) {
        $state->{'in_' . lc($name)} = 1;
        return;
    }

    # Program Unit
    my ($type, $symbol, @tokens) = _process_prog_unit($statement->{value});
    if ($type) {
        push(@{$state->{stack}}, [$type, $symbol]);
        if ($state->{in_contains}) {
            return;
        }
        $statement->{name}   = lc($type);
        $statement->{type}   = 'signature';
        $statement->{symbol} = lc($symbol);
        $ctx->{signature_token_set_of}{$symbol}
            = {map { (lc($_) => 1) } @tokens};
        return $statement;
    }
    if ($state->{in_contains}) {
        return;
    }

    # Use
    if ($statement->{value} =~ qr{\A\s*(use)\b}imsx) {
        $statement->{name} = 'use';
        $statement->{type} = 'use';
        return $statement;
    }

    # Type Declarations
    ($name) = $statement->{value} =~ $RE{TYPE_SPEC};
    if ($name) {
        $name =~ s{\s}{}gmsx;
        $statement->{name} = lc($name);
        $statement->{type} = 'type';
        return $statement;
    }

    # Attribute Statements
    ($name) = $statement->{value} =~ $RE{TYPE_ATTR};
    if ($name) {
        $statement->{name} = $name;
        $statement->{type} = 'attr';
        return $statement;
    }
}

# Parse a statement for program unit header. Returns a list containing the type,
# the symbol and the signature tokens of the program unit.
sub _process_prog_unit {
    my ($string) = @_;
    my ($type, $symbol, @args) = (q{}, q{});
    # Is it a blockdata, module or program?
    ($type, $symbol) = $string =~ $RE{UNIT_BASE};
    if ($type) {
        $type = lc($type);
        $type =~ s{\s*}{}gmsx;
        return ($type, $symbol);
    }
    # Remove the attribute and type declaration of a procedure
    $string =~ s/$RE{UNIT_ATTR}/$1/;
    my ($match) = $string =~ $RE{UNIT_SPEC};
    if ($match) {
        $string = $match;
        extract_bracketed($string);
    }
    # Is it a function or subroutine?
    ($type, $symbol) = lc($string) =~ $RE{UNIT_CALL};
    if (!$type) {
        return;
    }
    my $extracted = extract_bracketed($string, q{()}, qr{[^(]*}msx);

    # Get signature tokens from SUBROUTINE/FUNCTION
    if ($extracted) {
        $extracted =~ s{\s}{}gmsx;
        @args = split(q{,}, substr($extracted, 1, length($extracted) - 2));
        if ($type eq 'function') {
            my $result = extract_bracketed($string, q{()}, qr{[^(]*}msx);
            if ($result) {
                $result =~ s{\A\(\s*(.*?)\s*\)\z}{$1}msx; # remove braces
                push(@args, $result);
            }
            else {
                push(@args, $symbol);
            }
        }
    }
    return (lc($type), lc($symbol), map { lc($_) } @args);
}

# Reduces the list of statements to contain only the interface block.
sub _reduce_to_interface {
    my ($ctx) = @_;
    my (%token_set, @interface_statements);
STATEMENT:
    for my $statement (reverse(@{$ctx->{statements}})) {
        if ($statement->{type} eq 'end'
            && grep { $_ eq $statement->{name} } qw{subroutine function})
        {
            push(@interface_statements, $statement);
            %token_set
                = %{$ctx->{signature_token_set_of}{$statement->{symbol}}};
            next STATEMENT;
        }
        if ($statement->{type} eq 'signature'
            && grep { $_ eq $statement->{name} } qw{subroutine function})
        {
            push(@interface_statements, $statement);
            %token_set = ();
            next STATEMENT;
        }
        if ($statement->{type} eq 'use') {
            my ($head, $tail)
                = split(qr{\s*:\s*}msx, lc($statement->{value}), 2);
            if ($tail) {
                my @imports = map { [split(qr{\s*=>\s*}msx, $_, 2)] }
                    split(qr{\s*,\s*}msx, $tail);
                my @useful_imports
                    = grep { exists($token_set{$_->[0]}) } @imports;
                if (!@useful_imports) {
                    next STATEMENT;
                }
                if (@imports != @useful_imports) {
                    my @token_strings
                        = map { $_->[0] . ($_->[1] ? ' => ' . $_->[1] : q{}) }
                        @useful_imports;
                    my ($last, @rest) = reverse(@token_strings);
                    my @token_lines
                        = (reverse(map { $_ . q{,&} } @rest), $last);
                    push(
                        @interface_statements,
                        {   lines => [
                                sprintf("%s:&\n", $head),
                                (map { sprintf(" & %s\n", $_) } @token_lines),
                            ]
                        },
                    );
                    next STATEMENT;
                }
            }
            push(@interface_statements, $statement);
            next STATEMENT;
        }
        if ($statement->{type} eq 'attr') {
            my ($spec, @tokens) = ($statement->{value} =~ /$RE{NAME_COMP}/g);
            if (grep { exists($token_set{$_}) } @tokens) {
                for my $token (@tokens) {
                    $token_set{$token} = 1;
                }
                push(@interface_statements, $statement);
                next STATEMENT;
            }
        }
        if ($statement->{type} eq 'type') {
            my ($variable_string, $spec_string)
                = reverse(split('::', lc($statement->{value}), 2));
            if ($spec_string) {
                $spec_string =~ s{$RE{NAME_LEAD}}{}msx;
            }
            else {
                # The first expression in the statement is the type + attrib
                $variable_string =~ s{$RE{NAME_LEAD}}{}msx;
                $spec_string = extract_bracketed($variable_string, '()',
                    qr{[\s\*]*}msx);
            }
            # Useful tokens are those that comes after a comma
            my $tail = q{,} . lc($variable_string);
            my @tokens;
            while ($tail) {
                if ($tail =~ qr{\A\s*['"]}msx) {
                    extract_delimited($tail, q{'"}, qr{\A[^'"]*}msx, q{});
                }
                elsif ($tail =~ qr{\A\s*\(}msx) {
                    extract_bracketed($tail, '()', qr{\A[^(]*}msx);
                }
                else {
                    my $token;
                    ($token, $tail) = $tail =~ $RE{NAME_LIST};
                    if ($token && $token_set{$token}) {
                        @tokens = ($variable_string =~ /$RE{NAME_COMP}/g);
                        $tail = q{};
                    }
                }
            }
            if (@tokens && $spec_string) {
                my @spec_tokens = (lc($spec_string) =~ /$RE{NAME_COMP}/g);
                push(
                    @tokens,
                    (   grep { !exists($TYPE_DECL_KEYWORD_SET{$_}) }
                            @spec_tokens
                    ),
                );
            }
            if (grep { exists($token_set{$_}) } @tokens) {
                for my $token (@tokens) {
                    $token_set{$token} = 1;
                }
                push(@interface_statements, $statement);
                next STATEMENT;
            }
        }
    }
    if (!@interface_statements) {
        return [];
    }
    [   {lines => ["interface\n"]},
        reverse(@interface_statements),
        {lines => ["end interface\n"]},
    ];
}

# Processes and returns the line of the statement.
sub _present_line {
    my ($statement) = @_;
    map {
        s{\s+}{ }gmsx;      # collapse multiple spaces
        s{\s+\z}{\n}msx;    # remove trailing spaces
        $_;
    } @{$statement->{lines}};
}

# ------------------------------------------------------------------------------
1;
__END__

=head1 NAME

Fcm::Build::Fortran

=head1 SYNOPSIS

    use Fcm::Build::Fortran;
    my $fortran_util = Fcm::Build::Fortran->new();
    open(my($handle), '<', $path_to_a_fortran_source_file);
    print($fortran_util->extract_interface($handle)); # prints interface
    close($handle);

=head1 DESCRIPTION

A class to analyse Fortran source. Currently, it has a single method to extract
the calling interfaces of top level subroutines and functions in a Fortran
source.

=head1 METHODS

=over 4

=item $class->new()

Creates and returns an instance of this class.

=item $instance->extract_interface($handle)

Extracts the calling interfaces of top level subroutines and functions in a
Fortran source that can be read from $handle. Returns an interface block as a
list of lines.

=back

=head1 ACKNOWLEDGEMENT

This module is inspired by the logic developed by the European Centre
for Medium-Range Weather Forecasts (ECMWF).

=head1 COPYRIGHT

(C) Crown copyright Met Office. All rights reserved.

=cut
