百科问答小站 logo
百科问答小站 font logo



使用pytorch时,训练集数据太多达到上千万张,Dataloader加载很慢怎么办? 第1页

  

user avatar   fang-niu-wa-28-17 网友的相关建议: 
      

下面是我见到过的写得最优雅的,预加载的dataloader迭代方式可以参考下:

使用方法就和普通dataloder一样 for xxx in trainloader .

主要思想就两点 , 第一重载 _iter 和 next_ ,第二点多线程异步Queue加载

       import numbers import os import queue as Queue import threading  import mxnet as mx import numpy as np import torch from torch.utils.data import DataLoader, Dataset from torchvision import transforms   class BackgroundGenerator(threading.Thread):     def __init__(self, generator, local_rank, max_prefetch=6):         super(BackgroundGenerator, self).__init__()         self.queue = Queue.Queue(max_prefetch)         self.generator = generator         self.local_rank = local_rank         self.daemon = True         self.start()      def run(self):         torch.cuda.set_device(self.local_rank)         for item in self.generator:             self.queue.put(item)         self.queue.put(None)      def next(self):         next_item = self.queue.get()         if next_item is None:             raise StopIteration         return next_item      def __next__(self):         return self.next()      def __iter__(self):         return self   class DataLoaderX(DataLoader):     def __init__(self, local_rank, **kwargs):         super(DataLoaderX, self).__init__(**kwargs)         self.stream = torch.cuda.Stream(local_rank)         self.local_rank = local_rank      def __iter__(self):         self.iter = super(DataLoaderX, self).__iter__()         self.iter = BackgroundGenerator(self.iter, self.local_rank)         self.preload()         return self      def preload(self):         self.batch = next(self.iter, None)         if self.batch is None:             return None         with torch.cuda.stream(self.stream):             for k in range(len(self.batch)):                 self.batch[k] = self.batch[k].to(device=self.local_rank,                                                  non_blocking=True)      def __next__(self):         torch.cuda.current_stream().wait_stream(self.stream)         batch = self.batch         if batch is None:             raise StopIteration         self.preload()         return batch   class MXFaceDataset(Dataset):     def __init__(self, root_dir, local_rank):         super(MXFaceDataset, self).__init__()         self.transform = transforms.Compose(             [transforms.ToPILImage(),              transforms.RandomHorizontalFlip(),              transforms.ToTensor(),              transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),              ])         self.root_dir = root_dir         self.local_rank = local_rank         path_imgrec = os.path.join(root_dir, 'train.rec')         path_imgidx = os.path.join(root_dir, 'train.idx')         self.imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')         s = self.imgrec.read_idx(0)         header, _ = mx.recordio.unpack(s)         if header.flag > 0:             self.header0 = (int(header.label[0]), int(header.label[1]))             self.imgidx = np.array(range(1, int(header.label[0])))         else:             self.imgidx = np.array(list(self.imgrec.keys))      def __getitem__(self, index):         idx = self.imgidx[index]         s = self.imgrec.read_idx(idx)         header, img = mx.recordio.unpack(s)         label = header.label         if not isinstance(label, numbers.Number):             label = label[0]         label = torch.tensor(label, dtype=torch.long)         sample = mx.image.imdecode(img).asnumpy()         if self.transform is not None:             sample = self.transform(sample)         return sample, label      def __len__(self):         return len(self.imgidx)     




  

相关话题

  如何评价华为天才少年钟钊团队成功开启了 AutoML 大规模商用的先河?什么是 AutoML? 
  为什么很少人用FFT加速CNN卷积层的运算? 
  小样本学习中关于虚拟样本有效性的问题? 
  有什么算法能对一个长短不一的时间序列进行分类预测? 
  人脑有海量的神经元(参数),那么人脑有没有「过拟合」行为? 
  了解/从事机器学习/深度学习系统相关的研究需要什么样的知识结构? 
  如何看待旷视科技新产品监视学生上课? 
  为什么Bert中的CLS在未fine tune时作为sentence embedding性能非常糟糕? 
  如何评价论文「Stochastic Training is Not Necessary ...」? 
  机器学习能否用于综合评价?具体怎么操作? 

前一个讨论
如何看待上海市科委、中科院上海有机所和观视频联合制作的科普微电影《无处不在的手性之有机师姐》?
下一个讨论
表哥说机械比计算机经管都好,如何看待他的言论?





© 2025-04-04 - tinynew.org. All Rights Reserved.
© 2025-04-04 - tinynew.org. 保留所有权利