# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import math
from bisect import bisect_right
from typing import List
import torch
from fvcore.common.param_scheduler import (
CompositeParamScheduler,
ConstantParamScheduler,
LinearParamScheduler,
ParamScheduler,
)
logger = logging.getLogger(__name__)
class WarmupParamScheduler(CompositeParamScheduler):
    """
    Add an initial warmup stage to another scheduler.

    The result is a composite of two stages: a warmup scheduler covering the first
    ``warmup_length`` fraction of training, followed by the wrapped ``scheduler``
    for the remaining ``1 - warmup_length``.
    """

    def __init__(
        self,
        scheduler: ParamScheduler,
        warmup_factor: float,
        warmup_length: float,
        warmup_method: str = "linear",
        rescale_interval: bool = False,
    ):
        """
        Args:
            scheduler: warmup will be added at the beginning of this scheduler
            warmup_factor: the factor w.r.t the initial value of ``scheduler``, e.g. 0.001
            warmup_length: the relative length (in [0, 1]) of warmup steps w.r.t the entire
                training, e.g. 0.01
            warmup_method: one of "linear" or "constant"
            rescale_interval: whether to rescale the interval of ``scheduler`` so that
                it spans only the part of training that follows warmup. The default
                (False) keeps the original behavior: ``scheduler`` is evaluated at the
                global training progress ("fixed" interval scaling).

        Raises:
            ValueError: if ``warmup_method`` is neither "linear" nor "constant".
        """
        # The value to reach when warmup ends. A rescaled scheduler restarts from its
        # own beginning right after warmup, so the ramp must end at scheduler(0.0);
        # otherwise it must meet the scheduler at the global position warmup_length.
        end_value = scheduler(0.0) if rescale_interval else scheduler(warmup_length)
        start_value = warmup_factor * scheduler(0.0)
        if warmup_method == "constant":
            warmup = ConstantParamScheduler(start_value)
        elif warmup_method == "linear":
            warmup = LinearParamScheduler(start_value, end_value)
        else:
            raise ValueError("Unknown warmup method: {}".format(warmup_method))
        super().__init__(
            [warmup, scheduler],
            # The warmup stage always runs over its own [0, 1] range.
            interval_scaling=["rescaled", "rescaled" if rescale_interval else "fixed"],
            lengths=[warmup_length, 1 - warmup_length],
        )
class LRMultiplier(torch.optim.lr_scheduler._LRScheduler):
    """
    A LRScheduler which uses fvcore :class:`ParamScheduler` to multiply the
    learning rate of each param in the optimizer.
    Every step, the learning rate of each parameter becomes its initial value
    multiplied by the output of the given :class:`ParamScheduler`.

    The absolute learning rate value of each parameter can be different.
    This scheduler can be used as long as the relative scale among them does
    not change during training.

    Examples:
    ::
        LRMultiplier(
            opt,
            WarmupParamScheduler(
                MultiStepParamScheduler(
                    [1, 0.1, 0.01],
                    milestones=[60000, 80000],
                    num_updates=90000,
                ), 0.001, 100 / 90000
            ),
            max_iter=90000
        )
    """

    # NOTES: in the most general case, every LR can use its own scheduler.
    # Supporting this requires interaction with the optimizer when its parameter
    # group is initialized. For example, classyvision implements its own optimizer
    # that allows different schedulers for every parameter group.
    # To avoid this complexity, we use this class to support the most common cases
    # where the relative scale among all LRs stay unchanged during training. In this
    # case we only need a total of one scheduler that defines the relative LR multiplier.

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        multiplier: ParamScheduler,
        max_iter: int,
        last_iter: int = -1,
    ):
        """
        Args:
            optimizer, last_iter: See ``torch.optim.lr_scheduler._LRScheduler``.
                ``last_iter`` is the same as ``last_epoch``.
            multiplier: a fvcore ParamScheduler that defines the multiplier on
                every LR of the optimizer
            max_iter: the total number of training iterations; must be positive

        Raises:
            ValueError: if ``multiplier`` is not a ParamScheduler, or if
                ``max_iter`` is not positive.
        """
        if not isinstance(multiplier, ParamScheduler):
            raise ValueError(
                "LRMultiplier(multiplier=) must be an instance of fvcore "
                f"ParamScheduler. Got {multiplier} instead."
            )
        # get_lr() divides by max_iter, and the base class calls it while it is being
        # constructed, so reject a non-positive value here with a clear message.
        if max_iter <= 0:
            raise ValueError(
                f"LRMultiplier(max_iter=) must be a positive number. Got {max_iter} instead."
            )
        # Both attributes must be set before super().__init__(), which already
        # needs them to compute the first learning rates.
        self._multiplier = multiplier
        self._max_iter = max_iter
        super().__init__(optimizer, last_epoch=last_iter)

    def state_dict(self):
        """
        Returns:
            dict: only ``base_lrs`` and ``last_epoch``; the multiplier is not saved.
        """
        # fvcore schedulers are stateless. Only keep pytorch scheduler states
        return {"base_lrs": self.base_lrs, "last_epoch": self.last_epoch}

    def get_lr(self) -> List[float]:
        """
        Returns:
            list[float]: every base LR multiplied by the multiplier evaluated at the
            relative training progress ``last_epoch / max_iter``.
        """
        multiplier = self._multiplier(self.last_epoch / self._max_iter)
        return [base_lr * multiplier for base_lr in self.base_lrs]
"""
Content below is no longer needed!
"""
# NOTE: PyTorch's LR scheduler interface uses names that assume the LR changes
# only on epoch boundaries. We typically use iteration based schedules instead.
# As a result, "epoch" (e.g., as in self.last_epoch) should be understood to mean
# "iteration" instead.
# FIXME: ideally this would be achieved with a CombinedLRScheduler that keeps
# MultiStepLR and WarmupLR separate, but the current LRScheduler design doesn't allow it.
class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
    """
    Deprecated: multi-step learning rate decay combined with an initial warmup.

    At every step each base LR is multiplied by the warmup factor of the current
    iteration and by ``gamma`` raised to the number of milestones that are less
    than or equal to the current iteration.
    """

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        milestones: List[int],
        gamma: float = 0.1,
        warmup_factor: float = 0.001,
        warmup_iters: int = 1000,
        warmup_method: str = "linear",
        last_epoch: int = -1,
    ):
        """
        Args:
            optimizer, last_epoch: See ``torch.optim.lr_scheduler._LRScheduler``.
            milestones: iterations at which the LR is multiplied by ``gamma``;
                must be sorted in increasing order
            gamma: multiplicative decay applied at every milestone
            warmup_factor, warmup_iters, warmup_method: warmup settings, forwarded
                to ``_get_warmup_factor_at_iter``

        Raises:
            ValueError: if ``milestones`` is not sorted.
        """
        logger.warning(
            "WarmupMultiStepLR is deprecated! Use LRMultiplier with fvcore ParamScheduler instead!"
        )
        if not list(milestones) == sorted(milestones):
            raise ValueError(
                "Milestones should be a list of increasing integers. Got {}".format(milestones)
            )
        self.milestones = milestones
        self.gamma = gamma
        self.warmup_factor = warmup_factor
        self.warmup_iters = warmup_iters
        self.warmup_method = warmup_method
        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> List[float]:
        """
        Returns:
            list[float]: one learning rate per entry of ``base_lrs``.
        """
        warmup_factor = _get_warmup_factor_at_iter(
            self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
        )
        # bisect_right counts the milestones already reached (<= last_epoch); the
        # resulting decay is identical for every parameter group.
        decay = self.gamma ** bisect_right(self.milestones, self.last_epoch)
        return [base_lr * warmup_factor * decay for base_lr in self.base_lrs]

    def _compute_values(self) -> List[float]:
        # The new interface
        return self.get_lr()
class WarmupCosineLR(torch.optim.lr_scheduler._LRScheduler):
    """
    Deprecated: half-cosine learning rate decay combined with an initial warmup.

    At every step each base LR is multiplied by the warmup factor of the current
    iteration and by ``0.5 * (1 + cos(pi * last_epoch / max_iters))``.
    """

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        max_iters: int,
        warmup_factor: float = 0.001,
        warmup_iters: int = 1000,
        warmup_method: str = "linear",
        last_epoch: int = -1,
    ):
        """
        Args:
            optimizer, last_epoch: See ``torch.optim.lr_scheduler._LRScheduler``.
            max_iters: the iteration count over which the half-cosine is spread
            warmup_factor, warmup_iters, warmup_method: warmup settings, forwarded
                to ``_get_warmup_factor_at_iter``
        """
        logger.warning(
            "WarmupCosineLR is deprecated! Use LRMultiplier with fvcore ParamScheduler instead!"
        )
        self.max_iters = max_iters
        self.warmup_factor = warmup_factor
        self.warmup_iters = warmup_iters
        self.warmup_method = warmup_method
        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> List[float]:
        """
        Returns:
            list[float]: one learning rate per entry of ``base_lrs``.
        """
        warmup_factor = _get_warmup_factor_at_iter(
            self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
        )
        # Different definitions of half-cosine with warmup are possible. For
        # simplicity we multiply the standard half-cosine schedule by the warmup
        # factor. An alternative is to start the period of the cosine at warmup_iters
        # instead of at 0. In the case that warmup_iters << max_iters the two are
        # very close to each other.
        cosine_term = 1.0 + math.cos(math.pi * self.last_epoch / self.max_iters)
        return [base_lr * warmup_factor * 0.5 * cosine_term for base_lr in self.base_lrs]

    def _compute_values(self) -> List[float]:
        # The new interface
        return self.get_lr()
def _get_warmup_factor_at_iter(
method: str, iter: int, warmup_iters: int, warmup_factor: float
) -> float:
"""
Return the learning rate warmup factor at a specific iteration.
See :paper:`ImageNet in 1h` for more details.
Args:
method (str): warmup method; either "constant" or "linear".
iter (int): iteration at which to calculate the warmup factor.
warmup_iters (int): the number of warmup iterations.
warmup_factor (float): the base warmup factor (the meaning changes according
to the method used).
Returns:
float: the effective warmup factor at the given iteration.
"""
if iter >= warmup_iters:
return 1.0
if method == "constant":
return warmup_factor
elif method == "linear":
alpha = iter / warmup_iters
return warmup_factor * (1 - alpha) + alpha
else:
raise ValueError("Unknown warmup method: {}".format(method))