Wednesday, 20 October 2021

PyTorch memory leak on loss.backward on both gpu as well as cpu

I've tried everything: gc.collect, torch.cuda.empty_cache, deleting every possible tensor and variable as soon as it is used, setting the batch size to 1. Nothing seems to work.

I'm working on a text-to-code generation problem and am using the code from this repository: TranX

I've rewritten the data loader and the model training pipeline and made them as simple as I possibly can, but it always runs out of memory.

Here's my data loader:

class DSET:
    def __init__(self, load_file='', mode='train', batch_size=16):
        self.load_file = load_file
        self.data = open(self.load_file, 'r', encoding='utf8').readlines()

        class EXAMPLE(object):
            def __init__(self, examples):
                self._examples = examples
                self.generator = AST('python')

            def generate_ast(self, code_text):
                tree = self.generator.generate_ast(code_text)
                return tree

            def get_actions(self, ast_tree):
                actions = self.generator.get_actions(ast_tree)
                return actions

            def __getitem__(self, index):
                line = json.loads(self._examples[index])

                try:
                    canonical_intent, slot_map = canonicalize_intent(
                        line['nl'])

                    canonical_code = canonicalize_code(line['code'], slot_map)
                    intent_tokens = tokenize_intent(canonical_intent)

                    python_ast = ast.parse(canonical_code)

                    canonical_code_source = astor.to_source(python_ast).strip()
                    target_ast = self.generate_ast(python_ast)

                    target_actions = self.get_actions(target_ast)

                    target_action_infos = get_action_infos(
                        intent_tokens, target_actions)

                    example = Example(src_sent=intent_tokens,
                                      tgt_actions=target_action_infos,
                                      tgt_code=canonical_code_source,
                                      tgt_ast=target_ast,
                                      meta=dict(example_dict=line,
                                                slot_map=slot_map))
                except Exception as e:
                    example = Example(src_sent=[''],
                                      tgt_actions=[None],
                                      tgt_code=[''],
                                      tgt_ast=None,
                                      meta={})

                return example

            def __setitem__(self, index, value):
                pass

        self.examples = EXAMPLE(self.data)

        self.num_examples = len(self.data)
        self.example_indices = np.arange(self.num_examples)
        self.iterator = 0
        self.batch_size = batch_size
        self.num_steps = self.num_examples // self.batch_size

    def batch_iter(self):
        indices = self.example_indices[self.iterator *
                                       self.batch_size:(self.iterator + 1) *
                                       self.batch_size]

        x = []

        for idx in indices:
            sample = self.examples[idx]
            x.append(sample)

        self.iterator += 1

        # wrap around; >= avoids an off-by-one that would return an
        # empty or partial batch as the first batch of the next epoch
        if self.iterator >= len(self.example_indices) // self.batch_size:
            self.iterator = 0
            np.random.shuffle(self.example_indices)

        x.sort(key=lambda e: -len(e.src_sent))

        return x

    def __len__(self):
        return self.num_examples
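To rule out the loader itself, it can be exercised on its own while watching the process's peak RSS. A minimal diagnostic sketch, not part of the training code (the resource module is standard library; the file path is a placeholder):

import resource

dset = DSET('train.jsonl', mode='train', batch_size=16)  # placeholder path

for step in range(dset.num_steps):
    batch = dset.batch_iter()  # same call the training loop makes
    if step % 100 == 0:
        # ru_maxrss is reported in kilobytes on Linux
        peak_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        print(f'step {step}: peak RSS {peak_kb / 1024:.1f} MiB')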

Here's my training code:

def train(args):

    train_set = DSET(args.train_file, mode='train', batch_size=args.batch_size)

    validation_set = DSET(args.dev_file,
                          mode='valid',
                          batch_size=args.batch_size)

    vocab = pickle.load(open(args.vocab, 'rb'))

    grammar = ASDLGrammar.from_text(open(args.asdl_file).read())

    transition_system = Registrable.by_name(args.transition_system)(grammar)

    parser = Registrable.by_name(args.parser)

    model = parser(args, vocab, transition_system)

    model.train()

    evaluator = Registrable.by_name(args.evaluator)(transition_system,
                                                    args=args)

    if args.cuda:
        model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    nn_utils.glorot_init(model.parameters())

    print('Beginning Training ...')
    print(
        f'Training Examples : {len(train_set)}, Validation Examples : {len(validation_set)}'
    )

    print(
        f'Number Of Epochs : {args.max_epoch}, Steps Per Epoch : {train_set.num_steps}, Batch Size : {args.batch_size}'
    )

    total_steps = 0

    for e in range(args.max_epoch):
        start_time = time.time()

        for iter in range(train_set.num_steps):
            total_steps += 1
            loss_val = 0

            batch = train_set.batch_iter()

            optimizer.zero_grad()

            loss = -model.score(batch)

            loss = torch.mean(loss)

            if iter % args.log_every == 0:
                loss_val = torch.sum(loss).detach().item()
                print(f'Iteration : {iter}, Loss : {loss_val}')

            loss.backward(retain_graph=False)

            optimizer.step()

            del loss
            del loss_val
            del batch

            gc.collect()

            torch.cuda.empty_cache()

            time.sleep(2)

        print(f'Epoch {e} : Time Taken : {time.time() - start_time}')

        model_file = args.save_to + '/' + f'model.epoch_{e}.bin'

        print(f'Saved Model To : {model_file}')

        model.save(model_file)

        torch.save(optimizer.state_dict(),
                   args.save_to + '/' + f'model_{e}.optim.bin')

        # Run Validation

        print('Running Validation...')

        try:
            eval_results = evaluate(validation_set.examples,
                                    model,
                                    evaluator,
                                    args,
                                    verbose=False,
                                    eval_top_pred_only=args.eval_top_pred_only)

            validation_score = eval_results[evaluator.default_metric]

            print(f'Evaluation Score: {validation_score}')

        except Exception as e:
            print(f'Could not validate: {e}')
            pass

The model.score method is the repo author's custom code; I've added del, gc.collect and torch.cuda.empty_cache calls throughout it.
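One way to narrow down which call grows memory is to log the CUDA allocator counters around the forward, backward and optimizer steps inside the training loop. A sketch of that instrumentation (the log_cuda helper is for illustration only, not from the repo):

def log_cuda(tag):
    # torch.cuda.memory_allocated / memory_reserved report the caching
    # allocator's current usage in bytes (available in PyTorch 1.9)
    alloc = torch.cuda.memory_allocated() / 2**20
    reserved = torch.cuda.memory_reserved() / 2**20
    print(f'{tag}: allocated={alloc:.0f} MiB, reserved={reserved:.0f} MiB')

log_cuda('before score')
loss = torch.mean(-model.score(batch))
log_cuda('after score')
loss.backward()
log_cuda('after backward')
optimizer.step()
log_cuda('after step')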

I'm running PyTorch 1.9.1 with CUDA 11.1 on a 16 GB GPU instance on AWS EC2 with 32 GB of RAM and Ubuntu 18.04.

I've rewritten the code to make it more memory-efficient, since the code in the repository loaded the whole .bin file of the dataset at once.

But I can't train the model, even with a batch size of 1.

With a batch size of 8 it crashes after 46 iterations; with a batch size of 1 it gets up to about 48k iterations and then crashes.
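If some Python reference (for example a list of losses or metrics, or a logging hook) were keeping old iterations' tensors alive, the number of live tensors would keep climbing. A small diagnostic sketch to check that, called once per iteration:

import gc
import torch

def count_live_tensors():
    # walk the garbage collector's view of live objects and count tensors
    n = 0
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj):
                n += 1
        except Exception:
            pass
    return n

print(f'live tensors: {count_live_tensors()}')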

Traceback on the GPU (after 46 iterations):

Traceback (most recent call last):
  File "train.py", line 150, in <module>
    train(args)
  File "train.py", line 98, in train
    loss.backward(retain_graph=False)
  File "/usr/local/lib/python3.6/dist-packages/comet_ml/monkey_patching.py", lin                                                                                                                     e 312, in wrapper
    return_value = original(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/_tensor.py", line 255, in b                                                                                                                     ackward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=i                                                                                                                     nputs)
  File "/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py", line                                                                                                                      149, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 7.27 GiB (GPU 0; 14.76 GiB t                                                                                                                     otal capacity; 10.46 GiB already allocated; 2.98 GiB free; 10.52 GiB reserved in                                                                                                                      total by PyTorch)
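For the GPU case, the allocator summary can be dumped at the point of failure to see what is allocated versus cached; a minimal sketch around the backward call:

try:
    loss.backward()
except RuntimeError:
    # memory_summary() prints per-pool allocated/reserved statistics
    print(torch.cuda.memory_summary())
    raise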

On the CPU there is no traceback (it simply says Killed), but it gets up to iteration 672:

Iteration : 672, Loss : 702.1222534179688
Killed

Running dmesg gives the following output:

[2059991.491436] oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0,global_oom,task_memcg=/user.slice,task=python3,pid=25315,uid=0
[2059991.491542] Out of memory: Killed process 25315 (python3) total-vm:53312244kB, anon-rss:31451456kB, file-rss:74816kB, shmem-rss:12296kB, UID:0 pgtables:69068kB oom_score_adj:0
[2059992.056260] oom_reaper: reaped process 25315 (python3), now anon-rss:0kB, file-rss:74732kB, shmem-rss:12296kB

I also tried a standard PyTorch Dataset and DataLoader:

class T_DSET(Dataset):
    def __init__(self, load_file='', mode='train'):
        self.load_file = load_file
        self.data = open(self.load_file, 'r', encoding='utf-8').readlines()
        self.generator = AST('python')

    def generate_ast(self, code_text):
        tree = self.generator.generate_ast(code_text)
        return tree

    def get_actions(self, ast_tree):
        actions = self.generator.get_actions(ast_tree)
        return actions

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        line = json.loads(self.data[idx])

        try:
            canonical_intent, slot_map = canonicalize_intent(line['nl'])

            canonical_code = canonicalize_code(line['code'], slot_map)
            intent_tokens = tokenize_intent(canonical_intent)

            python_ast = ast.parse(canonical_code)

            canonical_code_source = astor.to_source(python_ast).strip()
            target_ast = self.generate_ast(python_ast)

            target_actions = self.get_actions(target_ast)

            target_action_infos = get_action_infos(intent_tokens,
                                                   target_actions)

            example = Example(src_sent=intent_tokens,
                              tgt_actions=target_action_infos,
                              tgt_code=canonical_code_source,
                              tgt_ast=target_ast,
                              meta=dict(example_dict=line, slot_map=slot_map))
        except Exception as e:
            example = Example(src_sent=[''],
                              tgt_actions=[None],
                              tgt_code=[''],
                              tgt_ast=None,
                              meta={})

        return example


def example_batch(batch):
    return batch

train_set = T_DSET(args.train_file, mode='train')

train_loader = DataLoader(train_set,
                          batch_size=args.batch_size,
                          shuffle=True,
                          collate_fn=example_batch)
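For reference, this loader slots into the training loop above roughly like this (a sketch; the identity collate_fn means each batch is the same list of Example objects that model.score already expects):

for e in range(args.max_epoch):
    for batch in train_loader:  # batch is a list of Example objects
        optimizer.zero_grad()
        loss = torch.mean(-model.score(batch))
        loss.backward()
        optimizer.step()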

Traceback with the DataLoader (iteration 39):

Traceback (most recent call last):
  File "train.py", line 166, in <module>
    train(args)
  File "train.py", line 107, in train
    loss.backward(retain_graph=False)
  File "/usr/local/lib/python3.6/dist-packages/comet_ml/monkey_patching.py", line 312, in wrapper
    return_value = original(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py", line 149, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 8.09 GiB (GPU 0; 14.76 GiB total capacity; 10.98 GiB already allocated; 2.50 GiB free; 11.00 GiB reserved in total by PyTorch)


from PyTorch memory leak on loss.backward on both gpu as well as cpu
