起因是最近写论文,引用参考文献时图方便直接用arxiv的citation了,但是貌似这样不太好,已经发表的论文依然显示publisher是arxiv,所以就想写个工具转换一下.
首先我在网上找了找类似的工具,出现比较多的是 [yuchenlin/rebiber](https://github.com/yuchenlin/rebiber)(A simple tool to update bib entries with their official information, e.g., DBLP or the ACL anthology),还有个在线体验地址 [Rebiber - a Hugging Face Space by yuchenlin](https://huggingface.co/spaces/yuchenlin/Rebiber).
这个工具主要利用NLP会议的一些信息来转换,但是有些还是无法成功.
比如有一篇2023CVPR的论文我转换就失败了,依然显示arxiv的链接.
所以我就想了个办法(我之前也找到了其他人也用的类似这个方法,但我找不到链接了),使用谷歌学术上的引用.处理逻辑很简单,分析bib文件中的每个条目,看哪些是arxiv的然后就在谷歌学术上搜索并替换,当然不排除有的文章很牛但还是只有arxiv版本,特别是在深度学习中,一些大牛这样干.
最初我是直接用requests抓取谷歌学术的页面,但这种方法会出一些问题,比如爬着爬着就出现谷歌的验证:因为requests不会执行js,页面会提示不允许访问,需要进行人机验证.所以只能改成selenium的方式,使用webdriver打开真实浏览器,由人工通过验证后再继续.
全部代码如下.
from pathlib import Path
import requests
import bibtexparser as bp
from lxml import etree
import time
from requests.compat import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
# Google Scholar endpoint that title searches are sent to.
scholar_site = "https://scholar.google.com/scholar"
# Endpoint queried with output=cite to obtain the citation popup.
# NOTE(review): this uses the scholar.google.cz host while searches use
# scholar.google.com -- confirm the different host is intentional.
bib_url = "https://scholar.google.cz/scholar"
class Paser:
    """Look up the BibTeX record of a paper on Google Scholar by its title.

    Two strategies are offered: plain HTTP requests (``search_thesis``) and a
    real Edge browser driven through selenium (``search_thesis_from_browser``),
    which lets a human solve a captcha when one is shown.
    """

    def __init__(self, parse_type='b'):
        """Create the helper.

        Args:
            parse_type: ``'b'`` starts an Edge WebDriver session for the
                browser-based lookup; any other value starts no browser.
        """
        # Always define the attribute so non-browser instances do not raise
        # AttributeError when it is inspected.
        self.driver = None
        if parse_type == 'b':
            # Headless mode is not enabled: search_thesis_from_browser may ask
            # the user to solve a captcha in the browser window.
            edge_ops = Options()
            self.driver = webdriver.Edge(options=edge_ops)
        # True until the first captcha check has been performed.
        self.flag = True

    def close(self):
        """Quit the browser session, if one was started."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def search_thesis(self, title: str):
        """Return the BibTeX text of the first search hit using plain HTTP.

        Args:
            title: Paper title used as the search query.

        Returns:
            The text downloaded from the first link of the citation popup.

        Raises:
            LookupError: If the result page has no usable search hit.
        """
        response = requests.get(scholar_site, params={'q': title, 'hl': 'zh-CN'}, timeout=30)
        root = etree.HTML(response.text)
        containers = root.xpath('//div[@id="gs_res_ccl_mid"]') if root is not None else []
        if len(containers) != 1:
            raise LookupError("No thesis found.")
        # The leading "." keeps the query relative to the result container; a
        # bare "//" would restart the search from the document root.
        hits = containers[0].xpath(
            './/div[contains(@class, "gs_r") and contains(@class, "gs_or") and contains(@class, '
            '"gs_scl") and @data-cid]')
        if not hits:
            raise LookupError("No thesis found.")
        # A search normally returns several hits; only the first one is used.
        data_cid = hits[0].get("data-cid")
        res = requests.get(bib_url,
                           params={'q': f"info:{data_cid}:scholar.google.com/", 'output': 'cite',
                                   'hl': 'zh-CN', 'scirp': '0'},
                           timeout=30)
        time.sleep(2)  # pause between consecutive requests
        return self.get_bib_from_scholar(res.text)

    def get_bib_from_scholar(self, res: str):
        """Download the target of the first link inside the citation popup.

        Args:
            res: HTML of the citation popup (the ``output=cite`` response).

        Returns:
            The body of the linked resource as text.

        Raises:
            LookupError: If the popup contains no link inside ``#gs_citi``.
        """
        root = etree.HTML(res)
        # Restrict the search to anchors inside #gs_citi rather than taking
        # the first <a> of the whole document.
        links = root.xpath('//div[@id="gs_citi"]//a') if root is not None else []
        if not links:
            raise LookupError("No citation link found.")
        bib_res = requests.get(links[0].get('href'), timeout=30)
        time.sleep(2)  # pause between consecutive requests
        return bib_res.text

    def search_thesis_from_browser(self, title: str):
        """Return the BibTeX text of the first search hit using the browser.

        On the first call the page is checked for a captcha; if one is shown
        the method waits until it is no longer displayed. Later calls skip
        the check.

        Args:
            title: Paper title used as the search query.

        Returns:
            The text of the ``<pre>`` element shown after following the
            first link of the citation popup.
        """
        from urllib.parse import urlencode

        from selenium.common.exceptions import WebDriverException

        # urlencode escapes characters such as '&', '#' and '+' in the title
        # and sends hl=zh-CN without stray quote characters.
        url_query = f"{scholar_site}?{urlencode({'q': title, 'hl': 'zh-CN'})}"
        self.driver.get(url_query)
        print("Searching thesis information.")
        if self.flag:
            try:
                captcha = self.driver.find_element(By.XPATH, '//*[@id="gs_captcha_c"]')
                print("Captcha detected.")
                self.driver.implicitly_wait(100)
                while captcha.is_displayed():
                    print("Please solve the captcha.")
                    # Poll once per second instead of spinning and flooding
                    # the console.
                    time.sleep(1)
            except WebDriverException:
                # Raised when no captcha element exists, or when the element
                # disappears after the page reloads.
                pass
            self.flag = False
        print("Retrieving thesis information.")
        time.sleep(3)
        # Second link in the footer row of the first search hit.
        ref = self.driver.find_element(
            By.XPATH, '//div[@id="gs_res_ccl_mid"]/*[1]//div[@class="gs_fl gs_flb"]/a[2]')
        ref.click()
        self.driver.implicitly_wait(10)
        bib_link = self.driver.find_element(By.XPATH, '//*[@id="gs_citi"]/a[1]')
        time.sleep(2)
        bib_link.click()
        self.driver.implicitly_wait(10)
        time.sleep(2)
        return self.driver.find_element(By.TAG_NAME, 'pre').text
def _field_text(entry, name: str) -> str:
    """Return field *name* of *entry* with braces removed, or "" if absent.

    The field key is matched case-insensitively so that spellings such as
    ``archivePrefix`` are found as well.
    """
    wanted = name.casefold()
    for field in entry.fields:
        if str(field.key).casefold() == wanted:
            return str(field.value).replace("{", "").replace("}", "")
    return ""


def _is_arxiv_entry(entry) -> bool:
    """Tell whether *entry* names arXiv as its publisher or archive prefix."""
    markers = (_field_text(entry, "publisher"), _field_text(entry, "archiveprefix"))
    return any(marker.strip().casefold() == "arxiv" for marker in markers)


def _lookup_published_entry(parser, entry):
    """Return the looked-up entry for *entry*'s title, re-keyed, or None on failure."""
    title = _field_text(entry, "title")
    if not title:
        return None
    try:
        found = bp.parse_string(parser.search_thesis_from_browser(title=title))
    except Exception as e:
        # Best effort per entry: report the problem and let the caller keep
        # the original record.
        print(f"Error: {e}")
        return None
    if not found.entries:
        print(f"Error: no BibTeX entry returned for {entry.key}")
        return None
    published = found.entries[0]
    # Keep the citation key used in the document so \cite commands still work.
    published.key = entry.key
    return published


def parse_bib(file_path: str, output_file_path: str = None):
    """Rewrite a .bib file, replacing arXiv entries with looked-up records.

    Every entry whose ``publisher`` or ``archiveprefix`` field equals
    "arxiv" (ignoring case) is searched by title through
    ``Paser.search_thesis_from_browser`` and replaced by the first record
    returned, keeping the original citation key. All other entries, and
    entries whose lookup fails, are copied unchanged.

    Args:
        file_path: Path of the input .bib file.
        output_file_path: Path of the file to write. Defaults to
            ``<input stem>_parsed.bib`` in the current working directory.

    Raises:
        Exception: If some blocks of the input file fail to parse.
        AssertionError: If the written file does not contain as many entries
            as the input.
    """
    if output_file_path is None:
        output_file_path = Path(file_path).stem + "_parsed.bib"

    # Parse before touching the output so a broken input does not wipe an
    # existing result file.
    bib_content = bp.parse_file(file_path)
    if len(bib_content.failed_blocks) > 0:
        raise Exception(
            '\033[92m' + "Some blocks failed to parse. Check the entries of `library.failed_blocks`." + '\033[0m')
    print("All blocks parsed successfully")

    parser = Paser()
    try:
        with open(output_file_path, "w", encoding="utf-8") as out:
            for entry in bib_content.entries:
                replacement = _lookup_published_entry(parser, entry) if _is_arxiv_entry(entry) else None
                lib = bp.Library()
                lib.add(entry if replacement is None else replacement)
                out.write(bp.write_string(lib))
                # Lookups are slow; flush so finished entries survive an
                # interruption.
                out.flush()
    finally:
        # Close the browser window even when the run is aborted.
        driver = getattr(parser, "driver", None)
        if driver is not None:
            driver.quit()

    written = len(bp.parse_file(output_file_path).entries)
    if written != len(bib_content.entries):
        # Raised explicitly so the check also runs under "python -O".
        raise AssertionError("Some entries are missing.")
    print(f"File saved to {output_file_path}")
if __name__ == '__main__':
    import sys

    # Usage: python <script> [input.bib [output.bib]]
    # With no arguments the input defaults to 'references.bib' and the output
    # path is chosen by parse_bib.
    cli_args = sys.argv[1:]
    parse_bib(cli_args[0] if cli_args else 'references.bib',
              cli_args[1] if len(cli_args) > 1 else None)
解析bib文件使用bibtexparser这个库.
这里我使用Edge浏览器,换成谷歌浏览器也可以,另外如果在国内,需要开全局代理.
处理完后建议检查一下.