爬虫代码备份

Posted on
#!/usr/bin/python3
import requests
import re
# Page whose image links are listed.
PAGE_URL = 'http://jandan.net/ooxx/'
# Seconds to wait for the server before giving up on the request.
REQUEST_TIMEOUT = 30
# Captures the href of each anchor that carries class="view_img_link".
IMG_LINK_RE = re.compile(
    r'<p><a href="(.+?)" target="_blank" class="view_img_link">')


def find_image_links(html):
    """Return the href of every class="view_img_link" anchor in html.

    The result is a list of strings in document order; it is empty when
    the page contains no such anchor.
    """
    return IMG_LINK_RE.findall(html)


def main():
    """Fetch PAGE_URL and print one image link per line.

    Raises requests.HTTPError when the server answers with an error
    status, so an error page is never parsed as if it were the listing.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    for url in find_image_links(response.text):
        print(url)


if __name__ == "__main__":
    main()

 

#!/usr/bin/python3
import requests
import re
import os
# Placeholder: replace with the URL of the page to scrape before running.
PAGE_URL = '此处改为minkchan想抓的页面'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# An href value that is immediately followed by target="_blank">.
# [^"]+ keeps the capture inside one attribute value.
LINK_RE = re.compile(r'href="([^"]+)" target="_blank">')
# Page title with the site-name suffix stripped.
TITLE_RE = re.compile(r'<title>(.+) \| みんくちゃんねる</title>')
# A jpg/gif file name that ends the URL or is followed by a query/fragment.
MEDIA_NAME_RE = re.compile(r'[^/?#]+\.(?:jpg|gif)(?=[?#]|$)')


def find_links(html):
    """Return every href value directly followed by target="_blank"> in html."""
    return LINK_RE.findall(html)


def media_filename(url):
    """Return the jpg/gif file name that url points at, or None.

    None is returned for links that do not end in a supported extension
    (for example ordinary .html pages), so callers can skip them instead
    of crashing on a failed match.
    """
    match = MEDIA_NAME_RE.search(url)
    return match.group(0) if match else None


def main():
    """Download every jpg/gif linked from PAGE_URL.

    Files are written into a directory named after the page title
    (created under the current working directory); when no title is
    found they are written into the current working directory itself.
    Raises requests.HTTPError if the page itself cannot be fetched.
    """
    page = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    html = page.text

    titles = TITLE_RE.findall(html)
    start_dir = os.getcwd()
    print(titles)
    for title in titles:
        # A "/" in the title would otherwise be read as a path separator.
        target_dir = os.path.join(start_dir, title.replace('/', '_'))
        # exist_ok lets the script be re-run for the same page.
        os.makedirs(target_dir, exist_ok=True)
        os.chdir(target_dir)

    for url in find_links(html):
        filename = media_filename(url)
        if filename is None:
            # Not an image link (e.g. a link to another page).
            continue
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if not response.ok:
            # Do not save an error page under the image's name.
            print('skipped', url, response.status_code)
            continue
        with open(filename, 'wb') as out:
            out.write(response.content)


if __name__ == "__main__":
    main()
#!/usr/bin/python3
import requests
import re
import os
import sys
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# An href value that is immediately followed by target="_blank">.
# [^"]+ keeps the capture inside one attribute value.
LINK_RE = re.compile(r'href="([^"]+)" target="_blank">')
# Page title with the site-name suffix stripped.
TITLE_RE = re.compile(r'<title>(.+) \| みんくちゃんねる</title>')
# A jpg/gif/mp4 file name that ends the URL or is followed by a
# query/fragment.
MEDIA_NAME_RE = re.compile(r'[^/?#]+\.(?:jpg|gif|mp4)(?=[?#]|$)')


def find_links(html):
    """Return every href value directly followed by target="_blank"> in html."""
    return LINK_RE.findall(html)


def media_filename(url):
    """Return the jpg/gif/mp4 file name that url points at, or None.

    None is returned for links that do not end in a supported extension
    (for example ordinary .html pages), so callers can skip them instead
    of crashing on a failed match.
    """
    match = MEDIA_NAME_RE.search(url)
    return match.group(0) if match else None


def main():
    """Download every jpg/gif/mp4 linked from the page given on the command line.

    The page URL is read from the first command-line argument; the script
    exits with a usage message when it is missing. Files are written into
    a directory named after the page title (created under the current
    working directory); when no title is found they are written into the
    current working directory itself.
    Raises requests.HTTPError if the page itself cannot be fetched.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' PAGE_URL')
    targeturl = sys.argv[1]

    page = requests.get(targeturl, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    html = page.text

    titles = TITLE_RE.findall(html)
    start_dir = os.getcwd()
    print(titles)
    for title in titles:
        # A "/" in the title would otherwise be read as a path separator.
        target_dir = os.path.join(start_dir, title.replace('/', '_'))
        # exist_ok lets the script be re-run for the same page.
        os.makedirs(target_dir, exist_ok=True)
        os.chdir(target_dir)

    for url in find_links(html):
        filename = media_filename(url)
        if filename is None:
            # Not a media link (e.g. a link to another page).
            continue
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if not response.ok:
            # Do not save an error page under the media file's name.
            print('skipped', url, response.status_code)
            continue
        with open(filename, 'wb') as out:
            out.write(response.content)


if __name__ == "__main__":
    main()

 

第二版只需要在执行的时候带上某站网址就行

#例
python3 minkchan.py http://

 

存为.py 使用前chmod +x

这网站对新手真是友好 爬虫各种简单

郁闷闷的博客

发表评论

电子邮件地址不会被公开。 必填项已用*标注