mercury/mdbcomp/feedback.automatic_parallelism.m

%---------------------------------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%---------------------------------------------------------------------------%
% Copyright (C) 2010-2011 The University of Melbourne.
% Copyright (C) 2014-2015, 2017-2018, 2022-2023, 2025 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%---------------------------------------------------------------------------%
%
% File: feedback.automatic_parallelism.m.
% Main author: pbone.
%
% This module defines data structures for representing automatic parallelism
% feedback information and some procedures for working with these structures.
%
% NOTE: After modifying any of these structures please increment the
% feedback_version in feedback.m
%
%---------------------------------------------------------------------------%
%---------------------------------------------------------------------------%

:- module mdbcomp.feedback.automatic_parallelism.

:- interface.

:- import_module mdbcomp.goal_path.
:- import_module mdbcomp.program_representation.

:- import_module assoc_list.
:- import_module bool.
:- import_module list.
:- import_module maybe.
:- import_module set.

%---------------------------------------------------------------------------%

:- type candidate_par_conjunctions_params
    --->    candidate_par_conjunctions_params(
                % The number of desired busy sparks.
                cpcp_desired_parallelism    :: float,

                % Should we follow variable use across module boundaries?
                cpcp_intermodule_var_use    :: bool,

                % The cost of creating a spark and adding it to the local
                % work queue, measured in call sequence counts.
                cpcp_sparking_cost          :: int,

                % The time taken between the creation of the spark and when
                % it starts being executed, measured in call sequence counts.
                cpcp_sparking_delay         :: int,

                % The cost of barrier synchronisation for each conjunct at the
                % end of the parallel conjunction.
                cpcp_barrier_cost           :: int,

                % The costs of maintaining a lock on a single dependent
                % variable, measured in call sequence counts.
                % cpcp_future_signal_cost gives the cost of the call to signal,
                % and cpcp_future_wait_cost gives the cost of the call to wait,
                % assuming that the value is already available.
                cpcp_future_signal_cost     :: int,
                cpcp_future_wait_cost       :: int,

                % The time it takes for a context to resume execution once
                % it has been put on the runnable queue, assuming that an
                % engine is available to pick it up. Measured in call sequence
                % counts.
                %
                % We use this to calculate how soon a context can recover
                % after being blocked by a future. It is also used to determine
                % how quickly the context executing MR_join_and_continue after
                % completing the leftmost conjunct of a parallel conjunction
                % can recover after being blocked on the completion of
                % one of the other conjuncts.
                cpcp_context_wakeup_delay   :: int,

                % The cost threshold in call sequence counts of a clique
                % before we consider it for parallel execution.
                cpcp_clique_threshold       :: int,

                % The cost threshold in call sequence counts of a call site
                % before we consider it for parallel execution.
                cpcp_call_site_threshold    :: int,

                % The speedup we require before we allow a conjunction to be
                % automatically parallelised. Should be either exactly 1.0
                % or just above 1.0.
                cpcp_speedup_threshold      :: float,

                % Whether we will allow parallelisation to result in
                % dependent parallel conjunctions, and if so, how we estimate
                % the speedup we get for them.
                cpcp_parallelise_dep_conjs  :: parallelise_dep_conjs,

                % The algorithm used to find the most profitable
                % parallelisation of a particular conjunction.
                cpcp_alg_for_best_par       :: alg_for_finding_best_par
            ).

:- type parallelise_dep_conjs
    --->    do_not_parallelise_dep_conjs
            % Do not allow parallelisation to result in dependent parallel
            % conjunctions.

    ;       parallelise_dep_conjs(speedup_estimate_alg).
            % Allow dependent parallel conjunctions; the argument selects
            % how the speedup we get for them is estimated.

:- type speedup_estimate_alg
    --->    estimate_speedup_naively
            % Be naive about dependent parallelism: pretend that it is
            % independent.

    ;       estimate_speedup_by_overlap.
            % Use the overlap calculation for dependent parallelism.

    % This type is used to select which algorithm is used to find the most
    % profitable parallelisation of a particular conjunction.
    %
    % TODO: The type name could be improved to make it distinct from the
    % algorithm used to search through the clique graph.
    %
:- type alg_for_finding_best_par
    --->    affbp_complete_branches(
                % Use the complete algorithm until this many branches have been
                % created during the search, and then fall back to the greedy
                % algorithm. After such a fallback, all existing alternatives
                % will be explored, but no new ones will be generated.
                int
            )
    ;       affbp_complete_size(
                % Use the complete algorithm for conjunctions with fewer than
                % this many conjuncts, and a greedy algorithm otherwise.
                % The recommended value is 50.
                % XXX I (zs) think that 50 seems way too high.
                int
            )
    ;       affbp_complete
            % Use the complete branch-and-bound algorithm with no fallback.
    ;       affbp_greedy.
            % Use the linear greedy algorithm.

%---------------------------------------------------------------------------%

    % Represents the metrics of a parallel execution.
    %
:- type parallel_exec_metrics
    --->    parallel_exec_metrics(
                % The number of calls into this parallelisation.
                pem_num_calls               :: int,

                % The elapsed time of the original sequential execution.
                pem_seq_time                :: float,

                % The estimated elapsed time of the parallel execution.
                pem_par_time                :: float,

                % The overheads of parallel execution. These are already
                % included in pem_par_time. Overheads are separated
                % by cause: spark creation, barrier synchronisation,
                % signalling futures and waiting on futures.
                pem_par_overhead_spark_cost :: float,
                pem_par_overhead_barrier    :: float,
                pem_par_overhead_signals    :: float,
                pem_par_overhead_waits      :: float,

                % The amount of time the initial (leftmost) conjunct spends
                % waiting for the other conjuncts. During this time,
                % the context used by this conjunct must be kept alive
                % because it will resume executing sequential code after
                % the conjunct. However, we know that it cannot be resumed
                % before its children have completed.
                pem_first_conj_dead_time    :: float,

                % The amount of time all conjuncts spend blocked on the
                % production of futures.
                pem_future_dead_time        :: float
            ).

    % The speedup per call: SeqTime / ParTime, i.e. pem_seq_time divided by
    % pem_par_time. For example, a value of 2.0 means that the goal is
    % twice as fast when parallelised.
    %
    % NOTE(review): the division is not guarded against pem_par_time being
    % zero; confirm that callers never construct such metrics.
    %
:- func parallel_exec_metrics_get_speedup(parallel_exec_metrics) = float.

    % The amount of time saved per call: SeqTime - ParTime, i.e. pem_seq_time
    % minus pem_par_time. The result is negative if the parallel execution
    % is estimated to be slower than the sequential one.
    %
:- func parallel_exec_metrics_get_time_saving(parallel_exec_metrics) = float.

    % The amount of time spent 'on cpu': pem_seq_time plus the non-dead
    % overheads returned by parallel_exec_metrics_get_overheads.
    %
:- func parallel_exec_metrics_get_cpu_time(parallel_exec_metrics) = float.

    % The overheads of parallel execution: the sum of the spark cost,
    % barrier, signal and wait overheads (the pem_par_overhead_* fields).
    % The dead times are not included.
    %
    % Add these to pem_seq_time to get the 'time on cpu' of this execution.
    %
:- func parallel_exec_metrics_get_overheads(parallel_exec_metrics) = float.

%---------------------------------------------------------------------------%

    % The set of candidate parallel conjunctions within a procedure.
    %
    % The unparameterised name is shorthand for the instance whose goals
    % are pard_goals; the parameterised type allows other goal types.
    %
:- type candidate_par_conjunctions_proc ==
    candidate_par_conjunctions_proc(pard_goal).
:- type candidate_par_conjunctions_proc(GoalType)
    --->    candidate_par_conjunctions_proc(
                % A variable name table for the variables that have
                % sensible names.
                cpcp_var_table  :: var_name_table,

                % Each push represents a program transformation.
                % Most of the time, we expect the list to be empty,
                % but if it isn't, then the list of candidate conjunctions
                % is valid only AFTER the transformations described
                % by this list have been applied. (The transformations
                % should be independent of one another, so it should be
                % OK to apply them in any order.)
                cpcp_push_goals :: list(push_goal),

                % The candidate parallel conjunctions themselves.
                cpcp_par_conjs  :: list(candidate_par_conjunction(GoalType))
            ).

    % This type describes 'push goal' transformations.
    %
    % A push goal transformation pushes a goal into the arms of a branching
    % goal that occurs before it in the same conjunction. This can allow
    % the pushed goal to be parallelised against goals in one or more branches
    % without parallelising the whole branch goal (whose per-call cost may be
    % too small).
    %
:- type push_goal
    --->    push_goal(
                % The goal path of the conjunction in which the push is done.
                pg_goal_path    :: goal_path_string,

                % The range of conjuncts to push (inclusive).
                pg_pushee_lo    :: int,
                pg_pushee_hi    :: int,

                % The set of expensive goals inside earlier conjuncts in that
                % conjunction "next" to which the pushee goals should be
                % pushed. By "next", we mean that the pushee goals should be
                % added to the end of whatever conjunction contains the
                % expensive goal, creating a containing conjunction if
                % there wasn't one there before.
                %
                % Each of these expensive goals should be on a different
                % execution path.
                %
                % This list should not be empty.
                pg_pushed_into  :: list(goal_path_string)
            ).

    % A conjunction that is a candidate for parallelisation. It is identified
    % by a procedure label, the goal path to the conjunction, and the call
    % sites within the conjunction that are to be parallelised.
    %
    % TODO: In the future support more expressive candidate parallel
    % conjunctions, so that more opportunities for parallelism can be found.
    % Although it's probably not a good idea to parallelise three conjuncts or
    % more against one another without first having a good system for reaching
    % and maintaining the target amount of parallelism, this may involve
    % distance granularity.
    %
:- type candidate_par_conjunction(GoalType)
    --->    candidate_par_conjunction(
                % The path within the procedure to this conjunction.
                cpc_goal_path           :: goal_path_string,

                % If the candidate is dependent on a push being performed,
                % what is that push? Note that any push that specifies the same
                % goals being pushed and the same OR GREATER set of goals next
                % to which to push them is acceptable: if such a push is
                % performed, then this candidate is viable.
                cpc_maybe_push_goal     :: maybe(push_goal),

                % The position within the original conjunction at which this
                % parallelisation starts.
                cpc_first_conj_num      :: int,

                % Whether the parallel conjuncts are dependent on
                % one another.
                cpc_is_dependent        :: conjuncts_are_dependent,

                % The goals to be scheduled before the parallel conjuncts,
                % and a cost figure for them.
                % NOTE(review): the unit of the cost is not stated here;
                % presumably call sequence counts -- confirm.
                cpc_goals_before        :: list(GoalType),
                cpc_goals_before_cost   :: float,

                % A list of parallel conjuncts, each of which is a sequential
                % conjunction of inner goals. All inner goals that are
                % seen in the program representation must be stored here
                % unless they are to be scheduled before or after the
                % sequential conjunction. If these conjuncts are flattened,
                % the inner goals will appear in the same order as in the
                % program representation. By maintaining these two rules,
                % the compiler and analysis tools can use similar
                % algorithms to construct the same parallel conjunction
                % from the same program representation/HLDS structure.

                cpc_conjs               :: list(seq_conj(GoalType)),

                % The goals to be scheduled after the parallel conjuncts,
                % and a cost figure for them (see the note on
                % cpc_goals_before_cost).
                cpc_goals_after         :: list(GoalType),
                cpc_goals_after_cost    :: float,

                % The metrics of the parallel execution of this candidate.
                cpc_par_exec_metrics    :: parallel_exec_metrics
            ).

:- type conjuncts_are_dependent
    --->    conjuncts_are_dependent(set(var_rep))
            % The conjuncts are dependent.
            % NOTE(review): the set is presumably the variables shared
            % between the conjuncts that cause the dependency -- confirm.

    ;       conjuncts_are_independent.
            % The conjuncts are independent.

:- type seq_conj(GoalType)
    --->    seq_conj(
                % The inner goals of this sequential conjunction.
                sc_conjs            :: list(GoalType)
            ).

:- type callee_rep
    --->    unknown_callee
            % An unknown callee, such as a higher order or method call.

    ;       named_callee(
                % A known callee, identified by module name and procedure
                % name. Note that arity and mode are not stored at all.
                % XXX why?

                nc_module_name  :: string,
                nc_proc_name    :: string
            ).

%---------------------------------------------------------------------------%

    % A parallelised goal (pard_goal): a goal within a parallel conjunction.
    % It is a goal_rep whose annotations are pard_goal_annotations.
    % We don't yet have to represent many types of goals or details about them.
    %
:- type pard_goal == goal_rep(pard_goal_annotation).

:- type pard_goal_annotation
    --->    pard_goal_annotation(
                % The per-call cost of this call in call sequence counts.
                pga_cost_percall            :: float,

                % Whether the cost of this goal is above the threshold for
                % being considered for parallelisation.
                % NOTE(review): "coat" in the field name looks like a typo
                % for "cost". Renaming it would change the interface, so it
                % has to be coordinated with all users of this field.
                pga_coat_above_threshold    :: cost_above_par_threshold,

                % Variable use information.
                % NOTE(review): the floats are presumably the times at which
                % each variable is produced or consumed -- confirm.
                pga_var_productions         :: assoc_list(var_rep, float),
                pga_var_consumptions        :: assoc_list(var_rep, float)
            ).

:- type cost_above_par_threshold
    --->    cost_above_par_threshold
            % The goal has a significant enough cost to be considered for
            % parallelisation.

    ;       cost_not_above_par_threshold.
            % The goal is too cheap to be considered for parallelisation.
            % We still track it in the feedback information to help inform
            % the compiler about _how_ to parallelise calls around it.

%---------------------------------------------------------------------------%

% convert_candidate_par_conjunctions_proc(Conv, ProcA, ProcB):
%
% Convert the goal type used by every candidate parallel conjunction of
% a procedure by applying convert_candidate_par_conjunction to each candidate.
% The variable name table and the list of push goals are copied unchanged.
% Conv is given the candidate that contains the goal being converted.
%
:- pred convert_candidate_par_conjunctions_proc(
    pred(candidate_par_conjunction(A), A, B)::in(pred(in, in, out) is det),
    candidate_par_conjunctions_proc(A)::in,
    candidate_par_conjunctions_proc(B)::out) is det.

% convert_candidate_par_conjunction(Conv, CPCA, CPCB):
%
% Convert the goal type of a single candidate parallel conjunction.
% Conv is applied, together with the original candidate CPCA, to every goal
% in the goals before, in the parallel conjuncts, and in the goals after.
% All the other fields are copied unchanged.
%
:- pred convert_candidate_par_conjunction(
    pred(candidate_par_conjunction(A), A, B)::in(pred(in, in, out) is det),
    candidate_par_conjunction(A)::in, candidate_par_conjunction(B)::out)
    is det.

% convert_seq_conj(Conv, SeqConjA, SeqConjB):
%
% Convert each goal in a sequential conjunction using Conv, preserving
% the order of the goals.
%
:- pred convert_seq_conj(
    pred(A, B)::in(pred(in, out) is det),
    seq_conj(A)::in, seq_conj(B)::out) is det.

%---------------------------------------------------------------------------%
%---------------------------------------------------------------------------%

:- implementation.

:- import_module float.

%---------------------------------------------------------------------------%

% The speedup is the sequential time divided by the parallel time.
% NOTE(review): there is no guard against a zero parallel time here;
% confirm that callers never supply one.
parallel_exec_metrics_get_speedup(Metrics) = Speedup :-
    Metrics = parallel_exec_metrics(_, SeqTime, ParTime, _, _, _, _, _, _),
    Speedup = SeqTime / ParTime.

% The saving is the sequential time minus the parallel time.
parallel_exec_metrics_get_time_saving(Metrics) = Saving :-
    Metrics = parallel_exec_metrics(_, SeqTime, ParTime, _, _, _, _, _, _),
    Saving = SeqTime - ParTime.

% The time on cpu is the sequential time plus the (non-dead) overheads
% of parallel execution.
parallel_exec_metrics_get_cpu_time(Metrics) = CpuTime :-
    Metrics = parallel_exec_metrics(_, SeqTime, _, _, _, _, _, _, _),
    Overheads = parallel_exec_metrics_get_overheads(Metrics),
    CpuTime = SeqTime + Overheads.

% Sum the four overhead components: sparks, barrier, signals and waits.
% The dead time fields are deliberately not part of the sum.
parallel_exec_metrics_get_overheads(Metrics) = Overheads :-
    Spark = Metrics ^ pem_par_overhead_spark_cost,
    Barrier = Metrics ^ pem_par_overhead_barrier,
    Signals = Metrics ^ pem_par_overhead_signals,
    Waits = Metrics ^ pem_par_overhead_waits,
    Overheads = Spark + Barrier + Signals + Waits.

%---------------------------------------------------------------------------%
%
% Helper predicates for the candidate parallel conjunctions type.
%

% Convert every candidate of the procedure; the variable name table and
% the push goals do not mention the goal type, so they are reused as they are.
convert_candidate_par_conjunctions_proc(ConvertGoal, ProcIn, ProcOut) :-
    ProcIn = candidate_par_conjunctions_proc(VarNames, Pushes, CandidatesIn),
    ConvertCandidate =
        ( pred(CandIn::in, CandOut::out) is det :-
            convert_candidate_par_conjunction(ConvertGoal, CandIn, CandOut)
        ),
    list.map(ConvertCandidate, CandidatesIn, CandidatesOut),
    ProcOut = candidate_par_conjunctions_proc(VarNames, Pushes, CandidatesOut).

% Convert all the goals stored in one candidate. The caller's converter
% takes the original candidate as its first argument, so we close over
% that candidate to get a plain goal-to-goal converter.
convert_candidate_par_conjunction(ConvertWithCand, CandIn, CandOut) :-
    ConvertGoal =
        ( pred(GoalIn::in, GoalOut::out) is det :-
            ConvertWithCand(CandIn, GoalIn, GoalOut)
        ),
    CandIn = candidate_par_conjunction(Path, MaybePush, FirstConjNum,
        Dependence, BeforeIn, BeforeCost, ParConjsIn,
        AfterIn, AfterCost, ExecMetrics),
    list.map(convert_seq_conj(ConvertGoal), ParConjsIn, ParConjsOut),
    list.map(ConvertGoal, BeforeIn, BeforeOut),
    list.map(ConvertGoal, AfterIn, AfterOut),
    % Everything except the three lists of goals is copied unchanged.
    CandOut = candidate_par_conjunction(Path, MaybePush, FirstConjNum,
        Dependence, BeforeOut, BeforeCost, ParConjsOut,
        AfterOut, AfterCost, ExecMetrics).

% Unwrap the sequential conjunction, convert its goals in order, and
% wrap the results up again.
convert_seq_conj(ConvertGoal, SeqConjIn, SeqConjOut) :-
    SeqConjIn = seq_conj(GoalsIn),
    list.map(ConvertGoal, GoalsIn, GoalsOut),
    SeqConjOut = seq_conj(GoalsOut).

%---------------------------------------------------------------------------%
:- end_module mdbcomp.feedback.automatic_parallelism.
%---------------------------------------------------------------------------%