mirror of
https://github.com/Mercury-Language/mercury.git
synced 2025-12-06 07:49:02 +00:00
mdbcomp/feedback.feedback_info.m:
mdbcomp/feedback.m:
Move all the contents of feedback.m other than include_module items
into the new module feedback_info.m, leaving feedback.m a package.
mdbcomp/feedback.automatic_parallelism.m:
Group the types exported by this module into groups of related types.
Delete a totally unused type.
deep_profiler/Mmakefile:
slice/Mmakefile:
Copy the new module.
compiler/check_options.m:
compiler/globals.m:
compiler/handle_options.m:
compiler/introduce_parallelism.m:
deep_profiler/autopar_reports.m:
deep_profiler/autopar_search_callgraph.m:
deep_profiler/mdprof_create_feedback.m:
deep_profiler/mdprof_report_feedback.m:
Import the new module.
418 lines
18 KiB
Mathematica
418 lines
18 KiB
Mathematica
%---------------------------------------------------------------------------%
|
|
% vim: ft=mercury ts=4 sw=4 et
|
|
%---------------------------------------------------------------------------%
|
|
% Copyright (C) 2010-2011 The University of Melbourne.
|
|
% Copyright (C) 2014-2015, 2017-2018, 2022-2023, 2025 The Mercury team.
|
|
% This file is distributed under the terms specified in COPYING.LIB.
|
|
%---------------------------------------------------------------------------%
|
|
%
|
|
% File: feedback.automatic_parallelism.m.
|
|
% Main author: pbone.
|
|
%
|
|
% This module defines data structures for representing automatic parallelism
|
|
% feedback information and some procedures for working with these structures.
|
|
%
|
|
% NOTE: After modifying any of these structures please increment the
|
|
% feedback_version in feedback.m
|
|
%
|
|
%---------------------------------------------------------------------------%
|
|
%---------------------------------------------------------------------------%
|
|
|
|
:- module mdbcomp.feedback.automatic_parallelism.
|
|
|
|
:- interface.
|
|
|
|
:- import_module mdbcomp.goal_path.
|
|
:- import_module mdbcomp.program_representation.
|
|
|
|
:- import_module assoc_list.
|
|
:- import_module bool.
|
|
:- import_module list.
|
|
:- import_module maybe.
|
|
:- import_module set.
|
|
|
|
%---------------------------------------------------------------------------%
|
|
|
|
:- type candidate_par_conjunctions_params
|
|
---> candidate_par_conjunctions_params(
|
|
% The number of desired busy sparks.
|
|
cpcp_desired_parallelism :: float,
|
|
|
|
% Should we follow variable use across module boundaries?
|
|
cpcp_intermodule_var_use :: bool,
|
|
|
|
% The cost of creating a spark and adding it to the local
|
|
% work queue, measured in call sequence counts.
|
|
cpcp_sparking_cost :: int,
|
|
|
|
% The time taken between the creation of the spark and when
|
|
% it starts being executed, measured in call sequence counts.
|
|
cpcp_sparking_delay :: int,
|
|
|
|
% The cost of barrier synchronisation for each conjunct at the
|
|
% end of the parallel conjunction.
|
|
cpcp_barrier_cost :: int,
|
|
|
|
% The costs of maintaining a lock on a single dependent
|
|
% variable, measured in call sequence counts. The first number
|
|
% gives the cost of the call to signal, and the second gives
|
|
% the cost of the call to wait assuming that the value is
|
|
% already available.
|
|
cpcp_future_signal_cost :: int,
|
|
cpcp_future_wait_cost :: int,
|
|
|
|
% The time it takes for a context to resume execution once
|
|
% it has been put on the runnable queue, assuming that an
|
|
% engine is available to pick it up. Measured in call sequence
|
|
% counts.
|
|
%
|
|
% We use this to calculate how soon a context can recover
|
|
% after being blocked by a future. It is also used to determine
|
|
% how quickly the context executing MR_join_and_continue after
|
|
% completing the leftmost conjunct of a parallel conjunction
|
|
% can recover after being blocked on the completion of
|
|
% one of the other conjuncts.
|
|
cpcp_context_wakeup_delay :: int,
|
|
|
|
% The cost threshold in call sequence counts of a clique
|
|
% before we consider it for parallel execution.
|
|
cpcp_clique_threshold :: int,
|
|
|
|
% The cost threshold in call sequence counts of a call site
|
|
% before we consider it for parallel execution.
|
|
cpcp_call_site_threshold :: int,
|
|
|
|
% The speedup we require before we allow a conjunction to be
|
|
% automatically parallelised. Should be either exactly 1.0
|
|
% or just above 1.0.
|
|
cpcp_speedup_threshold :: float,
|
|
|
|
% Whether we will allow parallelisation to result in
|
|
% dependent parallel conjunctions, and if so, how we estimate
|
|
% the speedup we get for them.
|
|
cpcp_parallelise_dep_conjs :: parallelise_dep_conjs,
|
|
|
|
cpcp_alg_for_best_par :: alg_for_finding_best_par
|
|
).
|
|
|
|
:- type parallelise_dep_conjs
|
|
---> do_not_parallelise_dep_conjs
|
|
; parallelise_dep_conjs(speedup_estimate_alg).
|
|
|
|
:- type speedup_estimate_alg
|
|
---> estimate_speedup_naively
|
|
% Be naive to dependent parallelism, pretend it is independent.
|
|
|
|
; estimate_speedup_by_overlap.
|
|
% Use the overlap calculation for dependent parallelism.
|
|
|
|
% This type is used to select which algorithm is used to find the most
|
|
% profitable parallelisation of a particular conjunction.
|
|
%
|
|
% TODO: The type name could be improved to make it distinct from the
|
|
% algorithm used to search through the clique graph.
|
|
%
|
|
:- type alg_for_finding_best_par
|
|
---> affbp_complete_branches(
|
|
% Use the complete algorithm until this many branches have been
|
|
% created during the search, and then fall back to the greedy
|
|
% algorithm. After such a fall back, all existing alternatives
|
|
% will be explored, but no new ones will be generated.
|
|
int
|
|
)
|
|
; affbp_complete_size(
|
|
% Use the complete algorithm for conjunctions with fewer than
|
|
% this many conjuncts, or a greedy algorithm. The recommended
|
|
% value is 50.
|
|
% XXX I (zs) think that 50 seems way too high.
|
|
int
|
|
)
|
|
; affbp_complete
|
|
% Use the complete brand-and-bound algorithm with no fallback.
|
|
; affbp_greedy.
|
|
% Use the linear greedy algorithm.
|
|
|
|
%---------------------------------------------------------------------------%
|
|
|
|
% Represent the metrics of a parallel execution.
|
|
%
|
|
:- type parallel_exec_metrics
|
|
---> parallel_exec_metrics(
|
|
% The number of calls into this parallelisation.
|
|
pem_num_calls :: int,
|
|
|
|
% The elapsed time of the original sequential execution.
|
|
pem_seq_time :: float,
|
|
|
|
% The estimated elapsed time of the parallel execution.
|
|
pem_par_time :: float,
|
|
|
|
% The overheads of parallel execution. These are already
|
|
% included in pem_par_time. Overheads are separated into
|
|
% different causes.
|
|
pem_par_overhead_spark_cost :: float,
|
|
pem_par_overhead_barrier :: float,
|
|
pem_par_overhead_signals :: float,
|
|
pem_par_overhead_waits :: float,
|
|
|
|
% The amount of time the initial (left most) conjunct spends
|
|
% waiting for the other conjuncts. During this time,
|
|
% the context used by this conjunct must be kept alive
|
|
% because it will resume executing sequential code after
|
|
% the conjunct, however we know that it cannot be resumed
|
|
% before its children have completed.
|
|
pem_first_conj_dead_time :: float,
|
|
|
|
% The amount of time all conjuncts spend blocked on the
|
|
% production of futures.
|
|
pem_future_dead_time :: float
|
|
).
|
|
|
|
% The speedup per call: SeqTime / ParTime. For example, a value of 2.0
|
|
% means that the goal is twice as fast when parallelised.
|
|
%
|
|
:- func parallel_exec_metrics_get_speedup(parallel_exec_metrics) = float.
|
|
|
|
% The amount of time saved per-call: SeqTime - ParTime.
|
|
%
|
|
:- func parallel_exec_metrics_get_time_saving(parallel_exec_metrics) = float.
|
|
|
|
% The amount of time spent 'on cpu', (seq time plus non-dead overheads).
|
|
%
|
|
:- func parallel_exec_metrics_get_cpu_time(parallel_exec_metrics) = float.
|
|
|
|
% The overheads of parallel execution.
|
|
%
|
|
% Add these to pem_seq_time to get the 'time on cpu' of this execution.
|
|
%
|
|
:- func parallel_exec_metrics_get_overheads(parallel_exec_metrics) = float.
|
|
|
|
%---------------------------------------------------------------------------%
|
|
|
|
% The set of candidate parallel conjunctions within a procedure.
|
|
%
|
|
:- type candidate_par_conjunctions_proc ==
|
|
candidate_par_conjunctions_proc(pard_goal).
|
|
:- type candidate_par_conjunctions_proc(GoalType)
|
|
---> candidate_par_conjunctions_proc(
|
|
% A variable name table for the variables that have
|
|
% sensible names.
|
|
cpcp_var_table :: var_name_table,
|
|
|
|
% Each push represents a program transformation.
|
|
% Most of the time, we expect the list to be empty,
|
|
% but if it isn't, then the list of candidate conjunctions
|
|
% is valid only AFTER the transformations described
|
|
% by this list have been applied. (The transformations
|
|
% should be independent of one another, so it should be
|
|
% OK to apply them in any order.)
|
|
cpcp_push_goals :: list(push_goal),
|
|
|
|
cpcp_par_conjs :: list(candidate_par_conjunction(GoalType))
|
|
).
|
|
|
|
% This goal describes 'push goal' transformations.
|
|
%
|
|
% This is where a goal may be pushed into the arms of a branching goal that
|
|
% occurs before it in the same conjunction. It can allow the pushed goal
|
|
% to be parallelised against goals in one or more branches without
|
|
% parallelising the whole branch goal (whose per-call cost may be
|
|
% too small).
|
|
%
|
|
:- type push_goal
|
|
---> push_goal(
|
|
% The goal path of the conjunction in which the push is done.
|
|
pg_goal_path :: goal_path_string,
|
|
|
|
% The range of conjuncts to push (inclusive).
|
|
pg_pushee_lo :: int,
|
|
pg_pushee_hi :: int,
|
|
|
|
% The set of expensive goals inside earlier conjuncts in that
|
|
% conjunction "next" to which the pushee goals should be
|
|
% pushed. By "next", we mean that the pushee goals should be
|
|
% added to the end of whatever conjunction contains the
|
|
% expensive goal, creating a containing conjunction if
|
|
% there wasn't one there before.
|
|
%
|
|
% Each of these expensive goals should be on a different
|
|
% execution path.
|
|
%
|
|
% This list should not be empty.
|
|
pg_pushed_into :: list(goal_path_string)
|
|
).
|
|
|
|
% A conjunction that is a candidate for parallelisation, it is identified
|
|
% by a procedure label, goal path to the conjunction and the call sites
|
|
% within the conjunction that are to be parallelised.
|
|
%
|
|
% TODO: In the future support more expressive candidate parallel
|
|
% conjunctions, so that more opportunities for parallelism can be found.
|
|
% Although it's probably not a good idea to parallelise three conjuncts or
|
|
% more against one another without first having a good system for reaching
|
|
% and maintaining the target amount of parallelism, this may involve
|
|
% distance granularity.
|
|
%
|
|
:- type candidate_par_conjunction(GoalType)
|
|
---> candidate_par_conjunction(
|
|
% The path within the procedure to this conjunction.
|
|
cpc_goal_path :: goal_path_string,
|
|
|
|
% If the candidate is dependent on a push being performed,
|
|
% what is that push? Note that any push that specifies the same
|
|
% goals being pushed and the same OR GREATER set of goals next
|
|
% to which to push them is acceptable: if such a push is
|
|
% performed, then this candidate is viable.
|
|
cpc_maybe_push_goal :: maybe(push_goal),
|
|
|
|
% The position within the original conjunction that this
|
|
% parallelisation starts.
|
|
cpc_first_conj_num :: int,
|
|
|
|
cpc_is_dependent :: conjuncts_are_dependent,
|
|
|
|
cpc_goals_before :: list(GoalType),
|
|
cpc_goals_before_cost :: float,
|
|
|
|
% A list of parallel conjuncts, each is a sequential
|
|
% conjunction of inner goals. All inner goals that are
|
|
% seen in the program presentation must be stored here
|
|
% unless they are to be scheduled before or after the
|
|
% sequential conjunction. If these conjuncts are flattened,
|
|
% the inner goals will appear in the same order as the
|
|
% program representation. By maintaining these two rules
|
|
% the compiler and analysis tools can use similar
|
|
% algorithms to construct the same parallel conjunction
|
|
% from the same program representation/HLDS structure.
|
|
|
|
cpc_conjs :: list(seq_conj(GoalType)),
|
|
|
|
cpc_goals_after :: list(GoalType),
|
|
cpc_goals_after_cost :: float,
|
|
|
|
cpc_par_exec_metrics :: parallel_exec_metrics
|
|
).
|
|
|
|
:- type conjuncts_are_dependent
|
|
---> conjuncts_are_dependent(set(var_rep))
|
|
; conjuncts_are_independent.
|
|
|
|
:- type seq_conj(GoalType)
|
|
---> seq_conj(
|
|
sc_conjs :: list(GoalType)
|
|
).
|
|
|
|
:- type callee_rep
|
|
---> unknown_callee
|
|
% An unknown callee such as a higher order or method call.
|
|
|
|
; named_callee(
|
|
% A known callee. Note that arity and mode are not stored at
|
|
% all. XXX why?
|
|
|
|
nc_module_name :: string,
|
|
nc_proc_name :: string
|
|
).
|
|
|
|
%---------------------------------------------------------------------------%
|
|
|
|
% A parallelised goal (pard_goal), a goal within a parallel conjunction.
|
|
% We don't yet have to represent many types of goals or details about them.
|
|
%
|
|
:- type pard_goal == goal_rep(pard_goal_annotation).
|
|
|
|
:- type pard_goal_annotation
|
|
---> pard_goal_annotation(
|
|
% The per-call cost of this call in call sequence counts.
|
|
pga_cost_percall :: float,
|
|
|
|
pga_coat_above_threshold :: cost_above_par_threshold,
|
|
|
|
% Variable use information.
|
|
pga_var_productions :: assoc_list(var_rep, float),
|
|
pga_var_consumptions :: assoc_list(var_rep, float)
|
|
).
|
|
|
|
:- type cost_above_par_threshold
|
|
---> cost_above_par_threshold
|
|
% The goal has a significant enough cost to be considered for
|
|
% parallelisation.
|
|
|
|
; cost_not_above_par_threshold.
|
|
% The goal is too cheap to be considered for parallelisation.
|
|
% We track it in the feedback information to help inform the
|
|
% compiler about _how_ to parallelise calls around it.
|
|
|
|
%---------------------------------------------------------------------------%
|
|
|
|
:- pred convert_candidate_par_conjunctions_proc(
|
|
pred(candidate_par_conjunction(A), A, B)::in(pred(in, in, out) is det),
|
|
candidate_par_conjunctions_proc(A)::in,
|
|
candidate_par_conjunctions_proc(B)::out) is det.
|
|
|
|
:- pred convert_candidate_par_conjunction(
|
|
pred(candidate_par_conjunction(A), A, B)::in(pred(in, in, out) is det),
|
|
candidate_par_conjunction(A)::in, candidate_par_conjunction(B)::out)
|
|
is det.
|
|
|
|
:- pred convert_seq_conj(
|
|
pred(A, B)::in(pred(in, out) is det),
|
|
seq_conj(A)::in, seq_conj(B)::out) is det.
|
|
|
|
%---------------------------------------------------------------------------%
|
|
%---------------------------------------------------------------------------%
|
|
|
|
:- implementation.
|
|
|
|
:- import_module float.
|
|
|
|
%---------------------------------------------------------------------------%
|
|
|
|
parallel_exec_metrics_get_speedup(PEM) = SeqTime / ParTime :-
|
|
SeqTime = PEM ^ pem_seq_time,
|
|
ParTime = PEM ^ pem_par_time.
|
|
|
|
parallel_exec_metrics_get_time_saving(PEM) = SeqTime - ParTime :-
|
|
SeqTime = PEM ^ pem_seq_time,
|
|
ParTime = PEM ^ pem_par_time.
|
|
|
|
parallel_exec_metrics_get_cpu_time(PEM) = SeqTime + Overheads :-
|
|
SeqTime = PEM ^ pem_seq_time,
|
|
Overheads = parallel_exec_metrics_get_overheads(PEM).
|
|
|
|
parallel_exec_metrics_get_overheads(PEM) =
|
|
SparkCosts + BarrierCosts + SignalCosts + WaitCosts :-
|
|
PEM = parallel_exec_metrics(_, _, _, SparkCosts, BarrierCosts,
|
|
SignalCosts, WaitCosts, _, _).
|
|
|
|
%---------------------------------------------------------------------------%
|
|
%
|
|
% Helper predicates for the candidate parallel conjunctions type.
|
|
%
|
|
|
|
convert_candidate_par_conjunctions_proc(Conv, CPCProcA, CPCProcB) :-
|
|
CPCProcA = candidate_par_conjunctions_proc(VarTable, PushGoals, CPCA),
|
|
list.map(convert_candidate_par_conjunction(Conv), CPCA, CPCB),
|
|
CPCProcB = candidate_par_conjunctions_proc(VarTable, PushGoals, CPCB).
|
|
|
|
convert_candidate_par_conjunction(Conv0, CPC0, CPC) :-
|
|
CPC0 = candidate_par_conjunction(GoalPath, MaybePushGoal, FirstGoalNum,
|
|
IsDependent, GoalsBefore0, GoalsBeforeCost, Conjs0,
|
|
GoalsAfter0, GoalsAfterCost, Metrics),
|
|
Conv = (pred(A::in, B::out) is det :-
|
|
Conv0(CPC0, A, B)
|
|
),
|
|
list.map(convert_seq_conj(Conv), Conjs0, Conjs),
|
|
list.map(Conv, GoalsBefore0, GoalsBefore),
|
|
list.map(Conv, GoalsAfter0, GoalsAfter),
|
|
CPC = candidate_par_conjunction(GoalPath, MaybePushGoal, FirstGoalNum,
|
|
IsDependent, GoalsBefore, GoalsBeforeCost, Conjs,
|
|
GoalsAfter, GoalsAfterCost, Metrics).
|
|
|
|
convert_seq_conj(Conv, seq_conj(Conjs0), seq_conj(Conjs)) :-
|
|
list.map(Conv, Conjs0, Conjs).
|
|
|
|
%---------------------------------------------------------------------------%
|
|
:- end_module mdbcomp.feedback.automatic_parallelism.
|
|
%---------------------------------------------------------------------------%
|