from urllib import request
from bs4 import BeautifulSoup
import pymysql
import time
from selenium import webdriver
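# Scrapes the published endpoint/IP lists for Office 365, Zoom and
# Salesforce and stores each (application, ip, protocol, port) record
# in the application_spider MySQL table.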
# Create the database connection; note the charset and cursorclass
# parameters added here.
conn = pymysql.connect(
    host=".1",
    user="root",
    password="",
    database="spider",
    charset='utf8',
    cursorclass=pymysql.cursors.DictCursor)
# Get a cursor.
cursor = conn.cursor()
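# The INSERT statements below assume a table shaped roughly like this
# (a sketch only; the column types and lengths are assumptions, not
# taken from the original source):
#   CREATE TABLE application_spider (
#       application VARCHAR(32),
#       ip          VARCHAR(64),
#       protocol    VARCHAR(16),
#       port        VARCHAR(64)
#   );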
def office365_spider():
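    """Scrape the Office 365 endpoint tables from five Microsoft docs pages.

    Every <td> cell is collected, regrouped into 5-column rows, and the
    address and protocol/port columns are split apart and stored one IP
    per row under application 'OFFICE365'.
    """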
url1 = "https://..com/en-us/office365/enterprise/office-365-u-s-government-gcc-high-endpoints"
url2 = "https://..com/en-us/office365/enterprise/office-365-u-s-government-dod-endpoints"
url3 = "https://..com/en-us/office365/enterprise/urls-and-ip-address-ranges"
url4 = "https://..com/en-us/office365/enterprise/urls-and-ip-address-ranges-21vianet"
url5 = "https://..com/en-us/office365/enterprise/office-365-germany-endpoints"
url = [url1, url2, url3, url4, url5]
# print(url)
for case in url:
# print(case)
html = request.urlopen(case).read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
# テーブルの各行を取得する
select = soup.find_all('td')
# for s in select:
# print(s)
a = []
for i in range(len(select)):
if i == 0 or i == 1:
continue
elif case == url3 and i == 2:
continue
else:
a.append(select[i].text)
b = [a[i:i + 5] for i in range(0, len(a), 5)]
# company IP Port Protocol
try:
for item in b:
# print(item)
if len(item) != 5:
continue
item[3] = item[3].replace(",", ",")
ip = item[3].split(",")
item[4] = item[4].replace(":", ":")
protocol_and_port = item[4].split(":")
if len(protocol_and_port) < 2:
continue
protocol_tcp = protocol_and_port[0]
port_tcp = protocol_and_port[1]
protocol_udp = ""
port_udp = ""
if port_tcp.find("UDP") != -1:
protocol_udp = "UDP"
port_tcp = port_tcp.replace("UDP", "")
port_udp = protocol_and_port[2]
for i in ip:
if i.find("us") != -1 and i.find("/") != -1:
continue
if i.find(".com") != -1 and i.find("/") != -1:
continue
if len(protocol_udp) != 0:
print("OFFICE365-" + str(i) + "-" + protocol_udp + "-" + port_udp)
cursor.execute(
"INSERT INTO application_spider(application,ip,protocol,port) VALUES(%s,%s,%s,%s);",
('OFFICE365', i, protocol_udp, port_udp))
print("OFFICE365-" + str(i) + "-" + protocol_tcp + "-" + port_tcp)
cursor.execute("INSERT INTO application_spider(application,ip,protocol,port) VALUES(%s,%s,%s,%s);",
('OFFICE365', i, protocol_tcp, port_tcp))
conn.commit()
except Exception as e:
print(e)
def zoom_spider():
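    """Scrape Zoom's firewall/proxy settings table.

    Hard-coded cell indexes pick out the 4-column rows (protocol, ports,
    source, destination IP list); each destination IP is stored under
    application 'ZOOM'.
    """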
url = "https://..us/hc/en-us/articles/201362683-Network-firewall-or-proxy-server-settings-for-Zoom"
print(url)
html = request.urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
# テーブルの各行を取得する
select = soup.find_all('td')
index = [4, 8, 12, 20, 24, 28, 32, 40]
a = []
for i in range(len(select)):
if not (i in index):
continue
else:
for k in range(0, 4):
a.append(select[i].text)
i += 1
b = [a[i:i + 4] for i in range(0, len(a), 4)]
# company IP Port Protocol
k = 0
for item in b:
if len(item) != 4:
continue
k += 1
# print(item)
item[1] = item[1].replace("
", "").replace("\xa0-", ",").replace(" (see note)", "")
item[3] = item[3].replace("
", "").replace("IPv4:", " ").replace("IPv6:", " ")
if k == 5:
item[3] = item[3].replace("32", "32 ")
protocol = item[0]
port = item[1]
ip_list = item[3].split(" ")
# print(ip_list)
# print(item)
# print("--------------------------------------")
try:
for ip in ip_list:
if len(ip) == 0:
continue
if ip.count("/") > 1:
continue
print("ZOOM-" + ip + "-" + protocol + "-" + port)
cursor.execute("INSERT INTO application_spider(application,ip,protocol,port) VALUES(%s,%s,%s,%s);",
('ZOOM', ip, protocol, port))
conn.commit()
except Exception as e:
print(e)
def salesforce_spider():
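    """Scrape Salesforce's published IP-range article.

    The page is rendered by JavaScript, so headless Chrome loads it first
    and BeautifulSoup then pulls the IP cells from the rendered table.
    """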
url = "https://..com/articleView?id=000321501&type=1&mode=1"
# インターフェイスレス動作のパラメーター
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--window-size=1420,1080')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)
driver.get(url)
driver.implicitly_wait(10)
time.sleep(2)
row = driver.find_elements_by_tag_name('tr')
source = driver.page_source
soup = BeautifulSoup(source, 'html.parser')
select = soup.find_all('td')
index = 0
try:
for s in select:
index += 1
if index < 5 or index > 374:
continue
if s.text.find(".") < 3:
continue
ip = s.text
cursor.execute("INSERT INTO application_spider(application,ip,protocol,port) VALUES(%s,%s,%s,%s);",
('SALESFORCE', ip, "TCP/UDP", "ANY"))
conn.commit()
except Exception as e:
print(e)
driver.quit()
def main():
    # Scrape the three applications.
    office365_spider()
    zoom_spider()
    salesforce_spider()
    # Close the cursor and the connection.
    cursor.close()
    conn.close()
if __name__ == '__main__':
    main()