【python】Amazon商品をwebスクレイピングしてみた。GoogleスプレッドシートにASINなど取得して書き込みできるよ

更新日：2022年5月4日

とりあえず、出来上がったコードを貼っておきます。（動作確認レベルで、リファクタリングすらしてない）

必要なライブラリなどは後で解説します

2022/5/4更新

上から順にコード詰め込んだだけなので、汚いですが。

from bs4 import BeautifulSoup
import urllib.request as req
import requests
import lxml.html
import re
import pandas as pd
import gspread
from gspread_dataframe import set_with_dataframe
from oauth2client.service_account import ServiceAccountCredentials

# Authenticate with the service-account JSON key issued by Google so that the
# APIs listed in SCOPES can be used.
SCOPES = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
SERVICE_ACCOUNT_FILE = '秘密鍵情報はgoogleAPIから取得しよう！.json'

credentials = ServiceAccountCredentials.from_json_keyfile_name(SERVICE_ACCOUNT_FILE, SCOPES)

# Exchange the credentials for an authorised gspread client.
gs = gspread.authorize(credentials)

# Open the shared spreadsheet once and reuse the handle for every worksheet
# lookup below.
# NOTE(review): this value is passed to open_by_key, so it presumably has to be
# the key found in the middle of the spreadsheet URL rather than the whole URL.
SPREADSHEET_KEY = '書き込みたいスプレッドシート のURL'
workbook = gs.open_by_key(SPREADSHEET_KEY)

# Quick sanity check: print one cell to confirm that values can be read.
worksheet = workbook.worksheet('書き込みたいシートの名前を入れる')
print(worksheet.acell('R9').value)

# Switch to the sheet named '12'; the rest of the script reads from it.
worksheet = workbook.worksheet('12')
print(workbook.title)
print(workbook.id)
print(worksheet)
import numpy as np

# Load every cell of the current worksheet into a DataFrame.
df = pd.DataFrame(worksheet.get_all_values())

# Row and column counts.
sh = df.shape
print(sh)
print(sh[1])

# Relabel the columns 0..n-1 so that column 2 can be addressed by position.
df.columns = range(sh[1])

# Remove the literal tokens below from column 2.  regex=False makes it explicit
# that they are plain substrings, not patterns.
for token in ('仕入額', ',', '¥', '/'):
    df[2] = df[2].str.replace(token, '', regex=False)
print(df[8:9][2])

# Turn the now-empty strings into NaN so dropna can remove them.  The result is
# assigned back to the column: an inplace replace on the selected column is not
# guaranteed to write through to df.
df[2] = df[2].replace('', np.nan)
print(df[8:9][2])

# Drop every row whose column 2 is NaN.
df.dropna(subset=[2], inplace=True)
print(df[8:9][2])
pd.set_option("display.max_rows", 50)
print(df[8:9])
print(df[8:9][2])
print(df[2])

# Column 2 is converted to integers so that it can be summed.
df[2] = df[2].astype(int)
pd.set_option("display.max_rows", 500)
print(df[2])

df_sum = df[2].sum()

# Create the output sheet that the scraped rows are written to later.
workbook.add_worksheet(title='うんち2', rows=100, cols=100)
print(df_sum)

# Read the ASIN to look up from standard input.
asin = input()
print(asin)

# Detail-page URL for that ASIN.
deltaUrl = 'https://delta-tracer.com/item/detail/jp/' + asin
print(deltaUrl)

# First fetch (urllib): parsed with BeautifulSoup.  parse_html stays a
# module-level name because the <strong>-tag extraction further down reads it.
# The with-block closes the HTTP response once the document has been parsed.
with req.urlopen(deltaUrl) as page:
    parse_html = BeautifulSoup(page, 'html.parser')
print(parse_html.title.string)
print(parse_html.find_all('a'))
print(parse_html.select('.item_img-large'))

# Second fetch (requests): parsed with lxml so that XPath can be used.
url = 'https://delta-tracer.com/item/detail/jp/' + asin
print(url)
response = requests.get(url)
print(response.text)
html = lxml.html.fromstring(response.content)
print(html)

# Shared prefixes of the absolute XPaths used below.
# NOTE(review): these paths contain /tbody/ steps; confirm that the served
# markup really has <tbody> elements, otherwise the queries return [] and the
# [0] lookups below raise IndexError.
_DETAIL = "/html/body/section/div[2]/div[2]/div[2]/div[1]/div"
_HEADER = _DETAIL + "/table/tbody/tr/td[2]/div"

htmlpn = html.xpath(_HEADER + "/a[1]/strong")
print(htmlpn[0].text)

# The first two elements with class 'selectable' are printed as ASIN and JAN.
htmlasin = html.xpath("//*[@class='selectable']")
print("ASIN: " + htmlasin[0].value)
print("JAN: " + htmlasin[1].value)

htmlogata = html.xpath(_HEADER + "/span[1]/span[1]")
print("大型商品: " + htmlogata[0].text)

html1 = html.xpath(_DETAIL + "/div[2]/table/tbody[2]/tr[1]/td[4]/span")
print(html1)
for item1 in html1:
    # .text is an attribute on lxml elements, not a method.
    print(item1.text)
    print(item1.text_content())

htmlrank = html.xpath(_HEADER + "/span[2]/span/strong")
print('ランキング: ' + htmlrank[0].text + "位")

# The element at index 5 among those with class 'text-right' is printed as the
# seller count.
htmlsho = html.xpath("//*[@class='text-right']")
Nseller = htmlsho[5].text_content()
print('出品数: ' + Nseller + "\n")

# The raw cell text mixes characters, digits and whitespace; keep only the
# digits.
Nseller = re.sub(r"\D", "", Nseller)
print('出品数: ' + Nseller + "件\n")
# Collect every <strong> tag from the BeautifulSoup document, together with
# each tag's .string value.
parse_lists = parse_html.find_all('strong')
strong_list = [tag.string for tag in parse_lists]

# One row per tag: the tag itself and its string form.
df_strong_list = pd.DataFrame({'タグのみ抜き出した時': parse_lists, '文字列のみに変換した時': strong_list})

# Drop rows that contain any missing value (how='any'; 'all' would require the
# whole row to be missing).
df_notnull = df_strong_list.dropna(how='any')

# Keep only the rows whose string contains at least one digit.  The mask is
# computed once and the pattern is a raw string so \d is not treated as a
# string escape.
has_digit = df_notnull['文字列のみに変換した時'].str.contains(r'\d')
df_contain_python = df_notnull[has_digit]

# Write the filtered rows, including the index, to the output worksheet.
set_with_dataframe(workbook.worksheet('うんち2'), df_contain_python, include_index=True)

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

from bs4 import BeautifulSoup

import urllib.request as req

import requests

import lxml.html

import re

import pandas as pd

import gspread

from gspread_dataframe import set_with_dataframe

from oauth2client.service_account import ServiceAccountCredentials

# Authenticate with the service-account JSON key issued by Google so that the
# APIs listed in SCOPES can be used.
SCOPES = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']

SERVICE_ACCOUNT_FILE = '秘密鍵情報はgoogleAPIから取得しよう！.json'

credentials = ServiceAccountCredentials.from_json_keyfile_name(SERVICE_ACCOUNT_FILE, SCOPES)

# Exchange the credentials for an authorised gspread client.
gs = gspread.authorize(credentials)

# Open the shared spreadsheet once and reuse the handle for every worksheet
# lookup below.
# NOTE(review): this value is passed to open_by_key, so it presumably has to be
# the key found in the middle of the spreadsheet URL rather than the whole URL.
SPREADSHEET_KEY = '書き込みたいスプレッドシートのURL'

workbook = gs.open_by_key(SPREADSHEET_KEY)

# Quick sanity check: print one cell to confirm that values can be read.
worksheet = workbook.worksheet('書き込みたいシートの名前を入れる')

print(worksheet.acell('R9').value)

# Switch to the sheet named '12'; the rest of the script reads from it.
worksheet = workbook.worksheet('12')

print(workbook.title)

print(workbook.id)

print(worksheet)

import numpy as np

# Load every cell of the current worksheet into a DataFrame.
df = pd.DataFrame(worksheet.get_all_values())

# Row and column counts.
sh = df.shape

print(sh)

print(sh[1])

# Relabel the columns 0..n-1 so that column 2 can be addressed by position.
df.columns = range(sh[1])

# Remove the literal tokens below from column 2.  regex=False makes it explicit
# that they are plain substrings, not patterns.
for token in ('仕入額', ',', '¥', '/'):
    df[2] = df[2].str.replace(token, '', regex=False)

print(df[8:9][2])

# Turn the now-empty strings into NaN so dropna can remove them.  The result is
# assigned back to the column: an inplace replace on the selected column is not
# guaranteed to write through to df.
df[2] = df[2].replace('', np.nan)

print(df[8:9][2])

# Drop every row whose column 2 is NaN.
df.dropna(subset=[2], inplace=True)

print(df[8:9][2])

pd.set_option("display.max_rows", 50)

print(df[8:9])

print(df[8:9][2])

print(df[2])

# Column 2 is converted to integers so that it can be summed.
df[2] = df[2].astype(int)

pd.set_option("display.max_rows", 500)

print(df[2])

df_sum = df[2].sum()

# Create the output sheet that the scraped rows are written to later.
workbook.add_worksheet(title='うんち2', rows=100, cols=100)

print(df_sum)

# Read the ASIN to look up from standard input.
asin = input()

print(asin)

# Detail-page URL for that ASIN.
deltaUrl = 'https://delta-tracer.com/item/detail/jp/' + asin

print(deltaUrl)

# First fetch (urllib): parsed with BeautifulSoup.  parse_html stays a
# module-level name because the <strong>-tag extraction further down reads it.
# The with-block closes the HTTP response once the document has been parsed.
with req.urlopen(deltaUrl) as page:
    parse_html = BeautifulSoup(page, 'html.parser')

print(parse_html.title.string)

print(parse_html.find_all('a'))

print(parse_html.select('.item_img-large'))

# Second fetch (requests): parsed with lxml so that XPath can be used.
url = 'https://delta-tracer.com/item/detail/jp/' + asin

print(url)

response = requests.get(url)

print(response.text)

html = lxml.html.fromstring(response.content)

print(html)

# Shared prefixes of the absolute XPaths used below.
# NOTE(review): these paths contain /tbody/ steps; confirm that the served
# markup really has <tbody> elements, otherwise the queries return [] and the
# [0] lookups below raise IndexError.
_DETAIL = "/html/body/section/div[2]/div[2]/div[2]/div[1]/div"
_HEADER = _DETAIL + "/table/tbody/tr/td[2]/div"

htmlpn = html.xpath(_HEADER + "/a[1]/strong")

print(htmlpn[0].text)

# The first two elements with class 'selectable' are printed as ASIN and JAN.
htmlasin = html.xpath("//*[@class='selectable']")

print("ASIN: " + htmlasin[0].value)

print("JAN: " + htmlasin[1].value)

htmlogata = html.xpath(_HEADER + "/span[1]/span[1]")

print("大型商品: " + htmlogata[0].text)

html1 = html.xpath(_DETAIL + "/div[2]/table/tbody[2]/tr[1]/td[4]/span")

print(html1)

for item1 in html1:
    # .text is an attribute on lxml elements, not a method.
    print(item1.text)
    print(item1.text_content())

htmlrank = html.xpath(_HEADER + "/span[2]/span/strong")

print('ランキング: ' + htmlrank[0].text + "位")

# The element at index 5 among those with class 'text-right' is printed as the
# seller count.
htmlsho = html.xpath("//*[@class='text-right']")

Nseller = htmlsho[5].text_content()

print('出品数: ' + Nseller + "\n")

# The raw cell text mixes characters, digits and whitespace; keep only the
# digits.
Nseller = re.sub(r"\D", "", Nseller)

print('出品数: ' + Nseller + "件\n")

# Collect every <strong> tag from the BeautifulSoup document, together with
# each tag's .string value.
parse_lists = parse_html.find_all('strong')

strong_list = [tag.string for tag in parse_lists]

# One row per tag: the tag itself and its string form.
df_strong_list = pd.DataFrame({'タグのみ抜き出した時': parse_lists, '文字列のみに変換した時': strong_list})

# Drop rows that contain any missing value (how='any'; 'all' would require the
# whole row to be missing).
df_notnull = df_strong_list.dropna(how='any')

# Keep only the rows whose string contains at least one digit.  The mask is
# computed once and the pattern is a raw string so \d is not treated as a
# string escape.
has_digit = df_notnull['文字列のみに変換した時'].str.contains(r'\d')

df_contain_python = df_notnull[has_digit]

# Write the filtered rows, including the index, to the output worksheet.
set_with_dataframe(workbook.worksheet('うんち2'), df_contain_python, include_index=True)

Google APIでMyProjectを作って、スプレッドシート操作用のキーを取得する

ここにアクセス

ライブラリのインストール

pipコマンドで以下をインストール

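BeautifulSoup（pipでのパッケージ名は beautifulsoup4）
urllib.request（Pythonの標準ライブラリなのでインストール不要）
requests
lxml
re（Pythonの標準ライブラリなのでインストール不要）
pandas
gspread
gspread_dataframe（pipでのパッケージ名は gspread-dataframe）
oauth2client.service_account（pipでのパッケージ名は oauth2client）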

入れたかわからなくなったら、
pip show *****　（***** にはライブラリ名を入れる）

今日はここまでにして、続きは後日書きます。以下の画像は動いてる姿。

comment コメントをキャンセル

このサイトはスパムを低減するために Akismet を使っています。コメントデータの処理方法の詳細はこちらをご覧ください。

: Mac＆スマホプログラミング未分類

【android studio】ファイルタブのファイル名文字色が赤く変わってしまったときの対処法【GIT】

android studio使っていて、ファイルタブのファイル名の文字色が突然赤くなってしまったことありませんか？もしくはgithubインストールしたタイミングで、android studioのファイ ...

: プログラミング

【JavaScript】Web APIを使えるようになる

目次 {JSON} PlaceholderのAPIを使ってみるAPIを呼び出すjavascriptコンソールで確認応用編　Jsonデータをクリックするごとに増やしていく {JSON} Placehol ...

: Mac＆スマホプログラミング

【Macで図解】GoogleSpreadsheetAPIを使って、Androidアプリから、スプレッドシートの値を書き換え【AndroidStudio】

AndroidアプリからGoogleのスプレッドシート APIを用いて、スプレッドシートに書き込んだデータを読み込んだり、書き込んだりってことを実現できるようにAndroid Studioを使って、 ...

: プログラミング

【android studio】webAPIでget/postするときのエラー Cleartext communication to 10.0.2.2 not permitted by network security policy が出たときの対処法

目次 androidはhttp通信が許可されていない環境原因対策 androidはhttp通信が許可されていない webAPIをpost/getするアンドロイドアプリを実装したのですが、 [crayo ...

: プログラミング

【JavaScript】AmazonページからASINだけ抽出するスクリプト

とりあえず、Amazonのホームページから、ASINだけを抜き出すのは以下のコード getElementsByClassName()でクラス名からHTML要素を複数取得する方法がわかればできますね目 ...

PREV: 【JavaScript】モジュール化　constで変数被るとダメだから・・
NEXT: 【実測】MacBook Pro 2020年モデルのバッテリーの持ち　稼働時間　MacBookPro16,2

【python】Amazon商品をwebスクレイピングしてみた。GoogleスプレッドシートにASINなど取得して書き込みできるよ

とりあえず、出来上がったコードを張っておきます。（動作確認レベルで、リファクタリングすらしてない）

Google APIでMyProjectを作って、スプレッドシート操作用のキーを取得する

ライブラリのインストール

Amazon Echo Dot第３世代が500円で買える！【激安キャンペーン情報】

【4KテレビをPCモニター化】ハイセンス43F68EとTCL 43K601Uとアイリスオーヤマ43UB10P 比較【３万円台】

【2020年】キャンペーンを駆使して最安運用【格安SIM】【MVNO】

【python】Amazon商品をwebスクレイピングしてみた。Googleスプレッドシート にASINなど取得して書き込みできるよ

とりあえず、出来上がったコードを張っておきます。（動作確認レベルで、リファクタリングすらしてない）

Google APIでMyProjectを作って、スプレッドシート操作用のキーを取得する

ライブラリのインストール

【python】Amazon商品をwebスクレイピングしてみた。GoogleスプレッドシートにASINなど取得して書き込みできるよ