圆月山庄资源网 Design By www.vgjia.com
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/02/11 21:44
# @Author  : dangxusheng
# @Email   : dangxusheng163@163.com
# @File    : download_by_href.py
"""Automatically download papers from arxiv.org.

The script searches arXiv by title keywords, walks the result pages and
saves every listed paper as ``<SAVE_PATH>/<title>.pdf``.  Each paper is
tried on the mirror first and on arxiv.org second; entries that fail on
both are appended to ``FAIL_URLS_TXT``.
"""
import os
import os.path as osp
import requests
from lxml import etree
from pprint import pprint
import re
import time
import glob

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
    "Host": 'arxiv.org'
}

# Mirror tried first, and the origin server used as the fallback.
HREF_CN = 'http://cn.arxiv.org/pdf/'
# The original pointed this at the mirror as well, which made the fallback
# download a no-op retry of the same host.
HREF_SRC = 'https://arxiv.org/pdf/'

SAVE_PATH = '/media/dangxs/E/Paper/download_at_20200730'
os.makedirs(SAVE_PATH, exist_ok=True)

FAIL_URLS = []
FAIL_URLS_TXT = f'{SAVE_PATH}/fail_urls.txt'

# Number of results requested per search page.
PAGE_SIZE = 50
# Anything smaller than this is treated as an error page, not a PDF.
MIN_PDF_BYTES = 10 * 1024
# (connect, read) timeout in seconds so a stalled server cannot hang the run.
REQUEST_TIMEOUT = (10, 60)
# NOTE(review): the search endpoint was lost when the source was extracted;
# this is arXiv's public search form -- confirm it matches the original.
SEARCH_URL = 'https://arxiv.org/search/'


def download(url, title):
    """Download ``url`` and store it as ``<SAVE_PATH>/<sanitized title>.pdf``.

    Args:
        url: Address of the PDF.
        title: Paper title; characters that are unsafe in file names are
            replaced by a space.

    Returns:
        True when a usable PDF already exists or was downloaded, False when
        the request failed or the payload was smaller than MIN_PDF_BYTES.
    """
    pattern = r'[\\/:*"\'<>|\r\n]+'
    new_title = re.sub(pattern, " ", title)
    print(f'new title: {new_title}')
    save_filepath = '%s/%s.pdf' % (SAVE_PATH, new_title)

    # Same threshold as the success check below, so a file accepted on one
    # run is not downloaded again on the next.
    if osp.exists(save_filepath) and osp.getsize(save_filepath) >= MIN_PDF_BYTES:
        print(f'this pdf is be existed.')
        return True

    # Stream into a temporary file so an aborted or rejected download never
    # leaves a truncated "<title>.pdf" behind.
    tmp_filepath = save_filepath + '.part'
    try:
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r:
            r.raise_for_status()
            with open(tmp_filepath, 'wb') as file:
                # Download in chunks.
                for chunk in r.iter_content(2048):
                    file.write(chunk)
        if osp.getsize(tmp_filepath) >= MIN_PDF_BYTES:
            os.replace(tmp_filepath, save_filepath)
            print('%s 下载成功.' % title)
            return True
    except (requests.RequestException, OSError) as e:
        print(e)

    if osp.exists(tmp_filepath):
        os.remove(tmp_filepath)
    return False


def _parse_total(total_text):
    """Extract the result count from text like 'Showing 1–50 of 1,355 results'.

    Returns 0 when the text contains no number (e.g. "Sorry, your query
    returned no results").
    """
    match = re.search(r'of\s+([\d,]+)\s+results?', total_text)
    if match:
        return int(match.group(1).replace(',', ''))
    # Fall back to the last number in the text; thousands separators are
    # kept inside one group so "1,355" is not read as 355.
    numbers = re.findall(r'\d[\d,]*', total_text)
    if not numbers:
        return 0
    return int(numbers[-1].replace(',', ''))


# Download from arxiv.org
def search(start_size=0, title_keywords='Facial Expression'):
    """Fetch one page of arXiv search results for ``title_keywords``.

    Args:
        start_size: Offset of the first result to return.
        title_keywords: Words to look for in paper titles.

    Returns:
        ``(info_list, total)`` where ``info_list`` is a list of
        ``{'title': ..., 'href': ...}`` dicts for this page and ``total`` is
        the number of results reported by the page; ``([], 0)`` when the
        query has no results.
    """
    # Address noted by the author (truncated in the source):
    # https://arxiv.org/find/grp_eess,grp_stat,grp_cs,grp_econ,grp_math/1/ti:+Face/0/1/0/past,2018,2019/0/1
    # NOTE(review): the request below is a reconstruction -- the original
    # request code was lost when the source was extracted.  Confirm the
    # endpoint and parameters.
    res = requests.get(
        SEARCH_URL,
        params={
            'query': title_keywords,
            'searchtype': 'title',
            'size': PAGE_SIZE,
            'start': start_size,
        },
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    html = etree.HTML(res.content.decode())
    # The element name in front of the class test was lost, so match any
    # element carrying that class.
    total_text = html.xpath('//*[@class="title is-clearfix"]/text()')
    total_text = ''.join(total_text).replace('\n', '').strip(' ')
    # i.e. : Showing 1–50 of 355 results
    total = _parse_total(total_text)  # total number of results
    # Sorry, your query returned no results
    if total == 0:
        return [], 0

    paper_list = html.xpath('//ol[@class="breathe-horizontal"]/li')
    info_list = []
    for p in paper_list:
        title = p.xpath('./p[@class="title is-5 mathjax"]//text()')
        title = ''.join(title).replace('\n', '').strip(' ')
        hrefs = p.xpath('./div/p/a/@href')
        if not hrefs:
            # An entry without a link cannot be downloaded; skip it instead
            # of raising IndexError.
            continue
        info_list.append({'title': title, 'href': hrefs[0]})
    return info_list, total


# Download from a specific page
def search_special():
    """Collect the paper links listed on the gitee "the-gan-zoo" page.

    Returns:
        A list of ``{'title': ..., 'href': ...}`` dicts, one per list item
        that contains a link.
    """
    # NOTE(review): everything between the URL and the xpath class test was
    # lost when the source was extracted; the request/parse lines and the
    # matched element are reconstructed -- confirm.  The module-level
    # ``headers`` are not sent because they pin ``Host`` to arxiv.org.
    res = requests.get('https://gitee.com/weberyoung/the-gan-zoo',
                       timeout=REQUEST_TIMEOUT)
    html = etree.HTML(res.content.decode())
    paper_list = html.xpath('//*[@class="file_content markdown-body"]//li')
    info_list = []
    for p in paper_list:
        title = p.xpath('.//text()')
        title = ''.join(title).replace('\n', '').strip(' ')
        hrefs = p.xpath('./a/@href')
        if not hrefs:
            continue
        info_list.append({'title': title, 'href': hrefs[0]})
    pprint(info_list)
    return info_list


def _pdf_url(base, paper):
    """Build the PDF address for ``paper`` on the server rooted at ``base``."""
    return base + paper['href'].split('/')[-1] + '.pdf'


def main():
    """Search arXiv page by page and download every listed paper."""
    page_idx = 0
    total = 1000  # placeholder until the first search reports the real count
    keywords = 'Facial Action Unit'

    # ``page_idx * PAGE_SIZE < total`` already covers the final partial page,
    # so no separate "last part" pass is needed (the original one computed a
    # non-positive start offset).
    while page_idx * PAGE_SIZE < total:
        paper_list, total = search(page_idx * PAGE_SIZE, keywords)
        print(f'total: {total}')
        if total == 0:
            print('no found .')
            break
        for p in paper_list:
            title = p['title']
            href = _pdf_url(HREF_CN, p)
            print(href)
            if not download(href, title):
                print('从国内镜像下载失败,从源地址开始下载 ')
                # Try once more with the international URL.
                href = _pdf_url(HREF_SRC, p)
                if not download(href, title):
                    FAIL_URLS.append(p)
            # Be polite to the servers between papers.
            time.sleep(1)
        page_idx += 1

    pprint(FAIL_URLS)
    with open(FAIL_URLS_TXT, 'a', encoding='utf-8') as f:
        for item in FAIL_URLS:
            f.write(item['href'] + '\n')
    print('done.')


if __name__ == '__main__':
    main()
以上就是使用 Python 自动从 arXiv 下载论文的示例代码的详细内容,更多关于 Python 从 arXiv 下载论文的资料请关注其它相关文章!
圆月山庄资源网 Design By www.vgjia.com
广告合作:本站广告合作请联系QQ:858582 申请时备注:广告合作(否则不回)
免责声明:本站文章均来自网站采集或用户投稿,网站不提供任何软件下载或自行开发的软件! 如有用户或公司发现本站内容信息存在侵权行为,请邮件告知! 858582#qq.com
免责声明:本站文章均来自网站采集或用户投稿,网站不提供任何软件下载或自行开发的软件! 如有用户或公司发现本站内容信息存在侵权行为,请邮件告知! 858582#qq.com
圆月山庄资源网 Design By www.vgjia.com
暂无评论...
更新日志
2024年11月01日
2024年11月01日
- 群星《戏梦》[FLAC/分轨][308.4MB]
- 魔兽世界永久60级什么时间开 永久60级开启时间介绍
- 魔兽世界奥卡兹岛血藻刷新点一览 wlk奥卡兹岛血藻刷新位置介绍
- 英雄联盟s14中国队种子怎么排名 s14中国队种子队伍排名一览
- 柏菲·梁玉嵘《知音梁品》限量开盘母带ORMCD[低速原抓WAV+CUE]
- [东升唱片]孙露《寂寞诱惑AQCD》高码率[FLAC]
- 群星-第8届2012十大发烧唱片精选HQ2CD[WAV+CUE]
- 九号公司两轮电动安全季行动圆满收官:为行业树立安全管理新标杆
- 创新驱动未来 中国移动5G云游戏深化智能运维应用推动行业发展
- 大型手游“帧数拉满” 骁龙8至尊版游戏表现强悍
- 谢金燕.2002-YOYO姊妹【华特】【WAV+CUE】
- 群星.1994-摇滚列车【音乐家】【WAV+CUE】
- 陈艾湄.1996-牵绊【巨翼】【WAV+CUE】
- 群星《无法抗拒的谎言 电视剧原声带》[320K/MP3][69.98MB]
- 群星《无法抗拒的谎言 电视剧原声带》[FLAC/分轨][372.04MB]