PyTorch is a Python-based scientific computing package targeted at two sets of audiences: a replacement for NumPy that can use the power of GPUs, and a deep learning research platform that provides flexibility and speed.
import torch
print(torch.__version__)
List of functions in torch: https://pytorch.org/docs/stable/torch.html
Tensors are similar to NumPy’s ndarrays, and can also be used on a GPU to accelerate computing.
Note: an uninitialized matrix contains whatever values happened to be in the allocated memory at the time.
x = torch.empty(5, 3)
print(x)
# randomly initialized matrix
x = torch.rand(5, 3)
print(x)
# matrix filled zeros and of dtype long
x = torch.zeros(5, 3, dtype=torch.long)
print(x)
# Construct a tensor directly from data
x = torch.tensor([5.5, 3])
print(x)
# Create a tensor based on an existing tensor.
# Reuse or change properties of the input tensor, e.g. dtype
x = x.new_ones(5, 3, dtype=torch.double) # new_* methods take in sizes
print(x)
x = torch.randn_like(x, dtype=torch.float) # override dtype!
print(x) # result has the same size
print(x.size()) # returns a torch.Size, which behaves like a tuple
print(x[:, 1])
print(x[1,:])
print(x[1,[True, False, True]])
x = torch.randn(4, 4)
y = x.view(16)
print(y)
print(x.size(), y.size())
z = x.view(-1, 8) # the size -1 is inferred from other dimensions
print(x.size(), z.size())
x[0,0]          # a one-element tensor
x[0,0].item()   # extract it as a plain Python number
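A small aside (added, not in the original notes): .item() only works on one-element tensors; .tolist() converts larger tensors to nested Python lists.
t = torch.tensor([[1.0, 2.0]])
print(t[0, 0].item())   # 1.0, a plain Python float
print(t.tolist())       # [[1.0, 2.0]], nested Python lists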
Note that the NumPy array returned by .numpy() shares the underlying memory with the tensor; it is a view, not a copy.
y = x.numpy()
print(y)
x.add_(10) # changing the torch tensor
print(x)
print(y) # y is changed also
import numpy as np
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a, 1, out=a)
print(a)
print(b)
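Since both bridges share memory, here is a minimal sketch (an addition, not from the original notes) of getting an independent copy instead: torch.tensor() always copies its input data.
a = np.ones(5)
b = torch.tensor(a)      # torch.tensor() copies, so b does not track a
np.add(a, 1, out=a)
print(a)   # [2. 2. 2. 2. 2.]
print(b)   # tensor([1., 1., 1., 1., 1.], dtype=torch.float64) -- unchanged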
Tensors can be moved onto any device using the .to method.
# let us run this cell only if CUDA is available
# We will use ``torch.device`` objects to move tensors in and out of GPU
if torch.cuda.is_available():
    device = torch.device("cuda")          # a CUDA device object
    y = torch.ones_like(x, device=device)  # directly create a tensor on GPU
    x = x.to(device)                       # or just use strings ``.to("cuda")``
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))       # ``.to`` can also change dtype at the same time
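A common device-agnostic pattern (a sketch added here, not from the original notes): pick the device once and create tensors directly on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = torch.ones(2, 2, device=device)   # created directly on the chosen device
print(x.device)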
There are multiple syntaxes for operations.
Note: a trailing underscore denotes an in-place operation on a tensor, e.g. x.copy_(y), x.t_().
x = torch.rand(3, 2)
y = torch.rand(3, 2)
print(x + y)
print(torch.add(x, y))
result = torch.empty(3, 2) # must match the shape of x and y
torch.add(x, y, out=result)
print(result)
y.add_(x)
print(y)
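A short sketch (added) of how the in-place variants mentioned above (copy_, t_) mutate their target:
x = torch.rand(2, 3)
y = torch.zeros(2, 3)
y.copy_(x)                 # copy x's values into y, in place
print(torch.equal(x, y))   # True
z = torch.rand(2, 2)
z.t_()                     # transpose z in place (t_ needs a 2-D tensor)
print(z.size())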
x = torch.ones(2, 2, requires_grad=True)
print(x)
y = x + 2
print(y)
print(y.grad_fn)
z = y * y * 3
out = z.mean()
print(z, out)
$X = \begin{pmatrix}1&1\\1&1\end{pmatrix}$ ...requires_grad=True means keep track of $X$ in subsequent operations
$Y = X + 2 = \begin{pmatrix}3&3\\3&3\end{pmatrix}$
$Z = Y \odot Y \times 3 = \begin{pmatrix}27&27\\27&27\end{pmatrix}$ ...notice elementwise multiplication, not matmul
out = mean$(Z) = 27$ ...the mean is over all elements, unlike MATLAB's column-wise mean
# .requires_grad_( ... ) changes an existing Tensor’s requires_grad flag in-place.
# The input flag defaults to False if not given.
a = torch.randn(2, 2)
a = ((a * 3) / (a - 1))
print(a.requires_grad) # without the trailing underscore it is the attribute, not the in-place method
a.requires_grad_(True)
print(a.requires_grad)
b = (a * a).sum()
print(b.grad_fn)
# Perform backprop now, get gradients d(out)/dx
# Because out contains a single scalar, out.backward() is equivalent to
# out.backward(torch.tensor(1.)).
out.backward()
print(x.grad)
$\frac{\partial out}{\partial x_i} = \frac{\partial out}{\partial z_i}\frac{\partial z_i}{\partial x_i}$
$\frac{\partial out}{\partial x_i} = \frac{\partial out}{\partial z_i}\frac{\partial z_i}{\partial y_i}\frac{\partial y_i}{\partial x_i} = \frac{1}{4} \times 6 y_i \times 1 = \frac{6}{4}(3) = \frac{18}{4} = 4.5$
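A quick numerical cross-check of the 4.5 result (a sketch, recomputing out from scratch with fresh names):
xc = torch.ones(2, 2, requires_grad=True)
outc = (3 * (xc + 2) ** 2).mean()
outc.backward()
print(xc.grad)   # tensor([[4.5000, 4.5000], [4.5000, 4.5000]])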
Matrix derivative for Hadamard products: $ d (A \odot B) = dA \odot B + A \odot dB$, so $d (Y \odot Y) = (\mathbf 1 \mathbf 1^T) \odot Y + Y \odot (\mathbf 1 \mathbf 1^T) = 2 Y$
Matrix derivative for quadratic form $\frac{\partial}{\partial A} (\mathbf x^T A \mathbf y) = \mathbf x \mathbf y^T$ so $\frac{\partial}{\partial Z} (\mathbf 1^T Z \mathbf 1) = \mathbf 1 \mathbf 1^T$
$\frac{\partial out}{\partial X} = \frac{\partial out}{\partial Z}\frac{\partial Z}{\partial X}$
$\frac{\partial out}{\partial X} = \frac{\partial out}{\partial Z}\odot\frac{\partial Z}{\partial Y}\odot\frac{\partial Y}{\partial X} = (\frac{1}{4} \mathbf 1 \mathbf 1^T) \odot (6 Y) \odot (\mathbf 1 \mathbf 1^T) = \frac{6}{4}(3)\,\mathbf 1 \mathbf 1^T = 4.5\,\mathbf 1 \mathbf 1^T$ ...the "products" here are elementwise, because every operation in the chain acts elementwise
Written as true Jacobians the Hadamard derivative must differ: for vectors, $d(\mathbf a \odot \mathbf b) = d\mathbf a \odot \mathbf b + \mathbf a \odot d\mathbf b$, i.e. $\frac{\partial (\mathbf a \odot \mathbf b)}{\partial \mathbf a} = \operatorname{diag}(\mathbf b)$
$\frac{\partial \mathbf y}{\partial \mathbf y} = I$ ...the vector-Jacobian view below makes this precise
"torch.autograd is an engine for computing vector-Jacobian products"
$$ \mathbf J^T \mathbf v = \begin{pmatrix} \frac{\partial y_1}{\partial x_1} & \dots & \frac{\partial y_m}{\partial x_1} \\ \vdots & \ddots & \vdots \\ \frac{\partial y_1}{\partial x_n} & \dots & \frac{\partial y_m}{\partial x_n} \end{pmatrix} \begin{pmatrix} v_1 \\ \vdots \\ v_m \end{pmatrix} $$
Suppose there is a scalar function $g(\mathbf y)$,
with gradient $\frac{\partial g}{\partial \mathbf y} = \begin{pmatrix} \frac{\partial g}{\partial y_1} \\ \vdots \\ \frac{\partial g}{\partial y_m} \end{pmatrix} = \mathbf v$
where $\mathbf y = f(\mathbf x)$
Then by the chain rule, $\frac{\partial g}{\partial \mathbf x} = \left(\frac{\partial \mathbf y}{\partial \mathbf x}\right)^{T} \frac{\partial g}{\partial \mathbf y} = \mathbf J^T \mathbf v$ (treating both gradients as column vectors)
....work out the notation carefully...
x = torch.randn(3, requires_grad=True)
y = x * 2
while y.data.norm() < 1000:
    y = y * 2
print(y)
If we just want the vector-Jacobian product, pass the vector to backward as an argument:
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float)
y.backward(v)
print(x.grad)
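To see that backward(v) really computes $\mathbf J^T \mathbf v$, here is a sketch cross-checking against an explicit Jacobian (torch.autograd.functional.jacobian, available in PyTorch 1.5+), using a simpler fixed function so the Jacobian is easy to read:
def f(t):
    return t * 8                                  # elementwise, so J = 8 I
xj = torch.randn(3, requires_grad=True)
yj = f(xj)
vj = torch.tensor([0.1, 1.0, 0.0001])
yj.backward(vj)
J = torch.autograd.functional.jacobian(f, xj)     # explicit 3x3 Jacobian
print(torch.allclose(xj.grad, J.t() @ vj))        # True: backward(v) gave J^T v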
## Prevent tracking for efficiency
print(x.requires_grad)
print((x ** 2).requires_grad)
with torch.no_grad():
    print((x ** 2).requires_grad)
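A related way to stop tracking on a specific tensor (a short sketch) is .detach(), which returns a tensor sharing the same data but with no history:
y = x.detach()
print(y.requires_grad)   # False
print(x.eq(y).all())     # tensor(True): same values as x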
https://pytorch.org/docs/stable/autograd.html#function
dir(torch.autograd.Function)
# extend torch.autograd.Function for your own custom operation by overriding the forward and backward static methods
class Exp(torch.autograd.Function):
    @staticmethod
    def forward(ctx, i):
        result = i.exp()
        ctx.save_for_backward(result)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        result, = ctx.saved_tensors
        return grad_output * result
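A custom Function is invoked through its .apply method; a small sketch checking the gradient of Exp against the analytic derivative:
t = torch.randn(3, requires_grad=True)
out = Exp.apply(t)
out.sum().backward()
print(torch.allclose(t.grad, t.exp()))   # True, since d/dt exp(t) = exp(t)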
https://pytorch.org/docs/stable/_modules/torch/autograd/function.html#Function.forward
https://pytorch.org/docs/stable/_modules/torch/autograd/function.html#Function.backward
Defines a formula for differentiating the operation.
This function is to be overridden by all subclasses.
It must accept a context ctx as the first argument, followed by as many outputs as forward() returned,
and it should return as many tensors as there were inputs to forward(): each returned value is the gradient w.r.t. the corresponding input.
The context can be used to retrieve tensors saved during the forward pass.
The staticmethod() built-in function returns a static method for a given function.
Calling staticmethod() directly is considered an un-Pythonic way of creating a static method,
so in newer versions of Python you use the @staticmethod decorator instead.
The syntax of @staticmethod is:
@staticmethod
def func(args, ...)
Static methods, much like class methods, are bound to a class rather than to its instances.
They do not require creating an instance, so they do not depend on the state of any object.
The difference is that a class method receives the class itself (cls) as its first argument, while a static method receives neither the class nor an instance.
Both can be called by the class or by an instance (see the sketch after the call forms below):
Class.staticmethodFunc()
or even
Class().staticmethodFunc()
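A minimal sketch contrasting the two (the class and method names here are made up for illustration):
class Counter:
    count = 0

    @classmethod
    def how_many(cls):        # receives the class itself
        return cls.count

    @staticmethod
    def is_valid(n):          # receives neither the class nor an instance
        return n >= 0

print(Counter.how_many())     # 0, called on the class
print(Counter().is_valid(3))  # True, called on an instance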
Metaprogramming: one part of the program modifies another part of the program at definition ("compile") time.
A decorator is a function that takes a function as input and returns a function,
i.e. a kind of "higher-order function".
def add1(x):
    return x + 1
dir(add1)
The $\verb|__call__()|$ entry in the dir() listing tells us this object is callable (i.e. it behaves like a "function")
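A quick check of this (added):
print(callable(add1))        # True
print(add1.__call__(4))      # 5, equivalent to add1(4)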
def divide(a, b):
    return a / b
print(divide(2,5))
print(divide(2,0))
# this decorator blocks the wrapped function from receiving zero as its second input
# ...returns None if the second input is zero
# ...otherwise calls the wrapped function and returns its output
def zeroblocker(func):
    def inner(a, b):
        print("checking input", b)
        if b == 0:
            print("Whoops! b=0, doing nothing")
            return
        return func(a, b)
    return inner
# functional approach: wrap the function explicitly and call the wrapper
fixeddivide = zeroblocker(divide)
print(fixeddivide(2,0))
# "pythonic" way that implements at function definition time to simplify code
@zeroblocker
def divide(a,b):
return a/b
print(divide(2,0))
# example with nested ("chained") decorators and variable number of args
def star(func):
    def inner(*args, **kwargs):
        print("*" * 30)
        func(*args, **kwargs)
        print("*" * 30)
    return inner

def percent(func):
    def inner(*args, **kwargs):
        print("%" * 30)
        func(*args, **kwargs)
        print("%" * 30)
    return inner
@star
@percent
def printer(msg):
    print(msg)
printer("Hello")
import torch
from torch.autograd import Variable  # note: Variable has been merged into Tensor; requires_grad=True on a tensor does the same
a = Variable(torch.rand(1, 4), requires_grad=True)
b = a**2
c = b*2
d = c.mean()
e = c.sum()
a
b
c
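Even without a plotting library, the graph can be inspected by hand through grad_fn (a sketch; the exact class names can vary across PyTorch versions):
print(d.grad_fn)                  # e.g. <MeanBackward0 ...>
print(d.grad_fn.next_functions)   # links back to the node that produced c
print(e.grad_fn)                  # e.g. <SumBackward0 ...>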
%load_ext tensorboard
%tensorboard --logdir logs
make_dot(e)
import torch
from torch import nn
model = nn.Sequential()
model.add_module('W0', nn.Linear(8, 16))
model.add_module('tanh', nn.Tanh())
model.add_module('W1', nn.Linear(16, 1))
x = torch.randn(1,8)
make_dot(model(x))
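Passing the named parameters labels the blue parameter nodes in the graph (a sketch using the params argument of the make_dot pasted below):
make_dot(model(x), params=dict(model.named_parameters()))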
Just pasting the code below rather than installing and importing it as a module:
https://github.com/szagoruyko/pytorchviz/blob/master/torchviz/dot.py
Run the pasted cell before the calls to make_dot() above.
# https://github.com/szagoruyko/pytorchviz/blob/master/torchviz/dot.py
from collections import namedtuple
from distutils.version import LooseVersion
from graphviz import Digraph
import torch
from torch.autograd import Variable
Node = namedtuple('Node', ('name', 'inputs', 'attr', 'op'))
def make_dot(var, params=None):
    """ Produces Graphviz representation of PyTorch autograd graph.
    Blue nodes are the Variables that require grad, orange are Tensors
    saved for backward in torch.autograd.Function
    Args:
        var: output Variable
        params: dict of (name, Variable) to add names to node that
            require grad (TODO: make optional)
    """
    if params is not None:
        assert all(isinstance(p, Variable) for p in params.values())
        param_map = {id(v): k for k, v in params.items()}

    node_attr = dict(style='filled',
                     shape='box',
                     align='left',
                     fontsize='12',
                     ranksep='0.1',
                     height='0.2')
    dot = Digraph(node_attr=node_attr, graph_attr=dict(size="12,12"))
    seen = set()

    def size_to_str(size):
        return '(' + (', ').join(['%d' % v for v in size]) + ')'

    output_nodes = (var.grad_fn,) if not isinstance(var, tuple) else tuple(v.grad_fn for v in var)

    def add_nodes(var):
        if var not in seen:
            if torch.is_tensor(var):
                # note: this used to show .saved_tensors in pytorch0.2, but stopped
                # working as it was moved to ATen and Variable-Tensor merged
                dot.node(str(id(var)), size_to_str(var.size()), fillcolor='orange')
            elif hasattr(var, 'variable'):
                u = var.variable
                name = param_map[id(u)] if params is not None else ''
                node_name = '%s\n %s' % (name, size_to_str(u.size()))
                dot.node(str(id(var)), node_name, fillcolor='lightblue')
            elif var in output_nodes:
                dot.node(str(id(var)), str(type(var).__name__), fillcolor='darkolivegreen1')
            else:
                dot.node(str(id(var)), str(type(var).__name__))
            seen.add(var)
            if hasattr(var, 'next_functions'):
                for u in var.next_functions:
                    if u[0] is not None:
                        dot.edge(str(id(u[0])), str(id(var)))
                        add_nodes(u[0])
            if hasattr(var, 'saved_tensors'):
                for t in var.saved_tensors:
                    dot.edge(str(id(t)), str(id(var)))
                    add_nodes(t)

    # handle multiple outputs
    if isinstance(var, tuple):
        for v in var:
            add_nodes(v.grad_fn)
    else:
        add_nodes(var.grad_fn)
    resize_graph(dot)
    return dot
# For traces
def replace(name, scope):
    return '/'.join([scope[name], name])


def parse(graph):
    scope = {}
    for n in graph.nodes():
        inputs = [i.uniqueName() for i in n.inputs()]
        for i in range(1, len(inputs)):
            scope[inputs[i]] = n.scopeName()

        uname = next(n.outputs()).uniqueName()
        assert n.scopeName() != '', '{} has empty scope name'.format(n)
        scope[uname] = n.scopeName()
    scope['0'] = 'input'

    nodes = []
    for n in graph.nodes():
        attrs = {k: n[k] for k in n.attributeNames()}
        attrs = str(attrs).replace("'", ' ')
        inputs = [replace(i.uniqueName(), scope) for i in n.inputs()]
        uname = next(n.outputs()).uniqueName()
        nodes.append(Node(**{'name': replace(uname, scope),
                             'op': n.kind(),
                             'inputs': inputs,
                             'attr': attrs}))

    for n in graph.inputs():
        uname = n.uniqueName()
        if uname not in scope.keys():
            scope[uname] = 'unused'
        nodes.append(Node(**{'name': replace(uname, scope),
                             'op': 'Parameter',
                             'inputs': [],
                             'attr': str(n.type())}))

    return nodes
def make_dot_from_trace(trace):
    """ Produces graphs of torch.jit.trace outputs
    Example:
    >>> trace, = torch.jit.trace(model, args=(x,))
    >>> dot = make_dot_from_trace(trace)
    """
    # from tensorboardX
    if LooseVersion(torch.__version__) >= LooseVersion("0.4.1"):
        torch.onnx._optimize_trace(trace, torch._C._onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK)
    elif LooseVersion(torch.__version__) >= LooseVersion("0.4"):
        torch.onnx._optimize_trace(trace, False)
    else:
        torch.onnx._optimize_trace(trace)
    graph = trace.graph()
    list_of_nodes = parse(graph)

    node_attr = dict(style='filled',
                     shape='box',
                     align='left',
                     fontsize='12',
                     ranksep='0.1',
                     height='0.2')

    dot = Digraph(node_attr=node_attr, graph_attr=dict(size="12,12"))

    for node in list_of_nodes:
        dot.node(node.name, label=node.name.replace('/', '\n'))
        if node.inputs:
            for inp in node.inputs:
                dot.edge(inp, node.name)

    resize_graph(dot)

    return dot
def resize_graph(dot, size_per_element=0.15, min_size=12):
    """Resize the graph according to how much content it contains.
    Modify the graph in place.
    """
    # Get the approximate number of nodes and edges
    num_rows = len(dot.body)
    content_size = num_rows * size_per_element
    size = max(min_size, content_size)
    size_str = str(size) + "," + str(size)
    dot.graph_attr.update(size=size_str)