Contents
目的
AI (機械学習)用の画像を取得しようと思ったら、意外に引っかかる。
PythonでPinterestの検索を行い、その結果の画像のURL等の情報を取得した。
Pinterest APIについては、検索ができるのは自分のアカウントのBoardとPinだけらしい。
そこで、Pinterest APIを使わずに、すべてのPinを対象に検索したい。
AI (機械学習)用の画像は、参考2のような、公開するデータセットを利用する方法もある。
コード
参考1のコードそのままだが、以下に示す。
"""Search Pinterest without the official API and download the found images."""
import os
import sys
import time
import requests
import json
import bs4  # BeautifulSoup4
import re  # used to pull the file name out of an image URL

# Last path segment of a URL, ignoring any query string or fragment.
_FILE_NAME_RE = re.compile(r'/([^/?#]+)(?:[?#].*)?$')

# Cookies from the previous response that are echoed back on every
# follow-up request.
_SESSION_COOKIE_NAMES = ('_auth', 'csrftoken', '_pinterest_sess')


def save_image(file_name, image):
    """Write the raw bytes *image* to the file *file_name*."""
    with open(file_name, 'wb') as f:
        f.write(image)


def _extract_image_info(results):
    """Return one dict per pin in *results* with the fields this script uses.

    Each returned dict has the keys 'description', 'link', 'image_url'
    and 'id', copied from the raw pin entry.
    """
    return [
        {
            'description': r['description'],
            'link': r['link'],
            'image_url': r['images']['orig']['url'],
            'id': r['id'],
        }
        for r in results
    ]


def _image_file_name(img_url, pin_id):
    """Return a file name for the image at *img_url*.

    The name is the last path segment of the URL, whatever its extension.
    If the URL has no usable segment, '<pin_id>.jpg' is used instead.
    """
    match = _FILE_NAME_RE.search(img_url)
    if match:
        return match.group(1)
    return '{}.jpg'.format(pin_id)


def search(query, num_pins, timeout=10):
    """Search Pinterest for *query* and collect information about the pins.

    Args:
        query: Search keyword.
        num_pins: Number of pins to collect. Pages are fetched until at
            least this many pins were collected or a page comes back empty,
            so the result may hold more or fewer entries than requested.
        timeout: Timeout in seconds for each HTTP request.

    Returns:
        A list of dicts with the keys 'description', 'link', 'image_url'
        and 'id'.

    Raises:
        requests.RequestException: If a request fails or returns an HTTP
            error status.
        KeyError: If the response does not have the layout this script
            expects, or a required cookie was never sent by the server.
    """
    # First access: the search page embeds the first results as JSON inside
    # a <script type="application/json"> element.
    url = 'https://www.pinterest.jp/search/pins/'
    headers = {'connection': 'keep-alive'}
    search_response = requests.get(url, params={'q': query}, headers=headers,
                                   stream=False, timeout=timeout)
    search_response.raise_for_status()
    soup = bs4.BeautifulSoup(search_response.text.replace('\n', ''), 'html5lib')
    data_json_string = soup.find('script', type='application/json')  # extract json string
    data_json = json.loads(data_json_string.string)  # convert into dictionary type variable
    image_info_list = _extract_image_info(
        data_json['tree']['children'][0]['data']['results'])

    # Second or later access to load additional pins that are responded as a
    # JSON string.
    url = 'https://www.pinterest.jp/resource/BaseSearchResource/get/'
    bookmarks = data_json['resourceDataCache'][0]['resource']['options']['bookmarks']
    experiment_hash = data_json['context']['triggerable_experiments_hash']
    # Cookies are accumulated so that a response which does not re-send a
    # cookie does not break the next request.
    session_cookies = dict(search_response.cookies)

    while len(image_info_list) < num_pins:
        # Preparing parameters, headers and cookies for the "get" request
        auth_cookies = {name: session_cookies[name]
                        for name in _SESSION_COOKIE_NAMES}
        params = {
            'source_url': '/search/pins/?q={}'.format(query),
            'data': json.dumps({
                'options': {
                    'bookmarks': bookmarks,
                    'query': query,
                    'scope': 'pins',
                    'page_size': 25,
                    'field_set_key': 'unauth_react',
                },
                'context': {},
            }),
            # Current time in milliseconds (second resolution).
            '_': str(int(time.time()) * 1000),
        }
        headers = {
            'Host': 'www.pinterest.jp',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3',
            'X-Pinterest-AppState': 'background',
            'X-Pinterest-ExperimentHash': experiment_hash,
            'X-NEW-APP': '1',
            'X-APP-VERSION': '9b11f84',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'https://www.pinterest.jp',
            'cookie': json.dumps(auth_cookies),
            'connection': 'keep-alive',
        }
        cookies = dict(auth_cookies)
        cookies.update({
            'bei': 'False',
            'logged_out': 'True',
            'fba': 'True',
            'sessionFunelEventLogged': '1',
        })

        search_response = requests.get(url, cookies=cookies, params=params,
                                       headers=headers, stream=False,
                                       timeout=timeout)
        search_response.raise_for_status()
        data_json = json.loads(search_response.text)
        results = data_json['resource_response']['data']['results']
        if not results:
            # No further pins came back; without this check the loop would
            # keep requesting forever.
            break
        image_info_list.extend(_extract_image_info(results))

        bookmarks = data_json['resource']['options']['bookmarks']
        experiment_hash = data_json['client_context']['triggerable_experiments_hash']
        session_cookies.update(dict(search_response.cookies))

    return image_info_list


def main(argv):
    """Search for a fixed keyword and save every found image under 'images'.

    Args:
        argv: Command line arguments (currently unused).
    """
    keyword = 'xxx'  # keyword you want to search
    num_pins = 100  # Number of pins searched
    img_dir = 'images'
    timeout = 10  # in second

    # The output directory must exist before the first image is written.
    os.makedirs(img_dir, exist_ok=True)

    image_info_list = search(keyword, num_pins)
    for img_info in image_info_list:
        img_url = img_info['image_url']

        # Retrieve the file name of the image
        img_name = _image_file_name(img_url, img_info['id'])

        # Get the content of the image
        try:
            img_response = requests.get(img_url, timeout=timeout, stream=False)
            # raise_for_status() returns None on success and raises on an
            # HTTP error status, so the failure must be caught, not compared.
            img_response.raise_for_status()
        except requests.HTTPError:
            # if not succeeded, this script will be terminated
            sys.exit('HTTP Error When Accessing The Image File!')

        # Save the image
        save_image(os.path.join(img_dir, img_name), img_response.content)


if __name__ == '__main__':
    main(sys.argv)
main関数内のkeywordに検索ワード、num_pinsに取得したい画像数を設定して実行すると、search関数で検索した結果の画像がimagesディレクトリに保存される!
参考
- http://hassiweb-programming.blogspot.com/2017/07/retrieve-pinterest-pins-by-python.html — PythonでPinterestのPin (画像)の検索結果を取得する
- https://ai.google/tools/datasets/