This post is compiled from my Colab notebook: the core text and copyable code blocks are kept, redundant outputs and error traces are dropped, and the formatting is adapted for a Hexo blog.
import os
here = os.path.dirname(os.getcwd())  # parent of the current working directory
here
url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a"
print(url)
from sklearn.datasets import load_svmlight_file
Xtr, ytr = load_svmlight_file(url)
import requests

url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a"
filename = "a9a.txt"

response = requests.get(url)
response.raise_for_status()  # fail fast on HTTP errors
with open(filename, "wb") as f:
    f.write(response.content)
print(f"Downloaded {filename}")
from sklearn.datasets import load_svmlight_file
filename = "a9a.txt"
Xtr, ytr = load_svmlight_file(filename)
print("Data loaded successfully!")
print("Shape of Xtr:", Xtr.shape)
print("Shape of ytr:", ytr.shape)
import os, json
from urllib.parse import urlparse
from urllib.request import Request, urlopen

def hdr():
    # Basic headers; attach a GitHub token from the environment if present.
    h = {"User-Agent": "url-grabber/2.0", "Accept": "*/*"}
    tok = os.getenv("GITHUB_TOKEN")
    if tok:
        h["Authorization"] = f"Bearer {tok}"
        h["Accept"] = "application/vnd.github+json"
    return h

def http_get(url: str) -> bytes:
    with urlopen(Request(url, headers=hdr()), timeout=60) as r:
        return r.read()
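A quick way to confirm whether `GITHUB_TOKEN` is actually being sent is the rate-limit endpoint (a sketch; `/rate_limit` is a standard GitHub REST route):

```python
# Authenticated clients get a much higher core limit (5000/h vs 60/h),
# so the number below reveals whether the token was picked up.
rate = json.loads(http_get("https://api.github.com/rate_limit").decode("utf-8"))
print(rate["rate"]["limit"])
```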
url = http_get("https://github.com/location-competition/indoor-location-competition-20/blob/master/data/site1/F1/floor_info.json")
url
def http_get(url: str) -> bytes:
    # Retry without the custom headers -- the result is the same HTML page.
    with urlopen(Request(url), timeout=60) as r:
        return r.read()

url = http_get("https://github.com/location-competition/indoor-location-competition-20/blob/master/data/site1/F1/floor_info.json")
url

# Fails with JSONDecodeError: the payload is an HTML page, not JSON.
arr = json.loads(url.decode("utf-8", "ignore"))
arr
def http_get(url: str) -> bytes:
    # Rewrite github.com/<owner>/<repo>/blob/<branch>/<path> to its
    # raw.githubusercontent.com equivalent, which serves the file contents.
    parsed_url = urlparse(url)
    raw_url = f"https://raw.githubusercontent.com{parsed_url.path.replace('/blob/', '/')}"
    with urlopen(Request(raw_url, headers=hdr()), timeout=60) as r:
        return r.read()

url = http_get("https://github.com/location-competition/indoor-location-competition-20/blob/master/data/site1/F1/geojson_map.json")
url

arr = json.loads(url.decode("utf-8", "ignore"))
arr
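Assuming the file is ordinary GeoJSON (a `FeatureCollection` at the top level), a quick sanity check of the parsed object:

```python
# Keys depend on the actual file; this only prints what is there.
print(type(arr).__name__, list(arr) if isinstance(arr, dict) else len(arr))
if isinstance(arr, dict) and arr.get("type") == "FeatureCollection":
    print("features:", len(arr.get("features", [])))
```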
url = http_get("https://github.com/location-competition/indoor-location-competition-20/blob/master/data/site1/F1/path_data_files/5dd9e7aac5b77e0006b1732b.txt")
url
arr = json.loads(url.decode("utf-8", "ignore"))
arr
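The file is perfectly usable as text; a sketch that previews the first few lines instead of forcing JSON on it:

```python
# Decode the downloaded bytes and peek at the head of the log.
text = url.decode("utf-8", "ignore")
for line in text.splitlines()[:5]:
    print(line)
```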
# At this point `url` holds the downloaded bytes, so parse the link string
# itself to look at its components.
p = urlparse("https://github.com/location-competition/indoor-location-competition-20/blob/master/data/site1/F1/path_data_files/5dd9e7aac5b77e0006b1732b.txt")
p
p[2]  # index 2 of the ParseResult is the path (same as p.path)
def parse_tree(url: str):
    """
    https://github.com/<owner>/<repo>/tree/<branch>/<subpath...>
    -> owner, repo, branch, subpath
    """
    p = urlparse(url)
    parts = [x for x in p.path.strip("/").split("/") if x]
    if len(parts) < 4 or parts[2] != "tree":
        raise ValueError("not a /tree/ directory URL")
    owner, repo, branch = parts[0], parts[1], parts[3]
    subpath = "/".join(parts[4:]) if len(parts) > 4 else ""
    return owner, repo, branch, subpath
url = "https://github.com/location-competition/indoor-location-competition-20/tree/master/data/site1/B1/path_data_files"
owner, repo, branch, subpath = parse_tree(url)
owner, repo, branch, subpath
def list_dir_via_api(owner: str, repo: str, branch: str, subpath: str):
    # List a directory through the GitHub contents API.
    api = f"https://api.github.com/repos/{owner}/{repo}/contents/{subpath}?ref={branch}"
    data = http_get(api)
    arr = json.loads(data.decode("utf-8", "ignore"))
    if isinstance(arr, dict) and arr.get("type") == "file":
        return [arr]
    return [it for it in arr if it.get("type") == "file"]
# This fails with the http_get defined above: it blindly prefixes every URL
# with raw.githubusercontent.com, which breaks api.github.com requests.
items = list_dir_via_api(owner, repo, branch, subpath)
items
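For reference, each file item returned by the contents API carries a `download_url` field pointing at the raw file, so a directory listing can be mirrored to disk. A sketch using plain `urlopen` (the `http_get` in scope rewrites API URLs incorrectly, as noted above):

```python
import os
from urllib.request import Request, urlopen

def download_items(items, dest="downloads"):
    os.makedirs(dest, exist_ok=True)
    for it in items:
        # "name" and "download_url" are standard fields of contents-API items.
        with urlopen(Request(it["download_url"], headers=hdr()), timeout=60) as r:
            payload = r.read()
        with open(os.path.join(dest, it["name"]), "wb") as f:
            f.write(payload)
        print("saved", it["name"], len(payload), "bytes")
```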
from urllib.parse import urlparse, quote
from urllib.request import Request, urlopen

def to_raw_github(url: str) -> str:
    # Pass raw URLs through untouched.
    if url.startswith("https://raw.githubusercontent.com/"):
        return url
    # Rewrite only github.com blob links; anything else (e.g. api.github.com)
    # is returned unchanged.
    if "github.com" in url and "/blob/" in url:
        p = urlparse(url)
        parts = [x for x in p.path.split("/") if x]
        if len(parts) >= 5 and parts[2] == "blob":
            owner, repo, _, branch, *rest = parts
            rest_enc = "/".join(quote(seg) for seg in rest)
            return f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{rest_enc}"
    return url
def http_get(url: str, timeout: int = 60) -> bytes:
    raw = to_raw_github(url)
    req = Request(raw, headers={"User-Agent": "mini-downloader/1.0"})
    with urlopen(req, timeout=timeout) as r:
        return r.read()
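A quick check of the consolidated helper against the `floor_info.json` blob link that worked earlier:

```python
import json

blob = "https://github.com/location-competition/indoor-location-competition-20/blob/master/data/site1/F1/floor_info.json"
print(to_raw_github(blob))  # the rewritten raw.githubusercontent.com URL
floor_info = json.loads(http_get(blob).decode("utf-8"))
floor_info
```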
url = "https://github.com/location-competition/indoor-location-competition-20/tree/master/data/site1/F2"
r = http_get(url)
r
arr = json.loads(r.decode("utf-8", "ignore"))
arr
def list_tree_recursive(owner, repo, branch, prefix=""):
    # One call to the git trees API returns every blob in the branch;
    # filter the paths down to the requested prefix.
    api = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
    with urlopen(Request(api)) as r:
        data = json.loads(r.read().decode("utf-8", "ignore"))
    out = []
    for node in data.get("tree", []):
        if node.get("type") == "blob":
            path = node["path"]
            if not prefix or path == prefix or path.startswith(prefix.rstrip("/") + "/"):
                out.append(path)
    return out
def parse_tree(url: str):
    p = urlparse(url)
    parts = [x for x in p.path.strip("/").split("/") if x]
    assert len(parts) >= 4 and parts[2] == "tree", "not a /tree/ directory URL"
    owner, repo, branch = parts[0], parts[1], parts[3]
    subpath = "/".join(parts[4:]) if len(parts) > 4 else ""
    return owner, repo, branch, subpath
def to_raw_from_parts(owner, repo, branch, path):
    return f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}"
owner, repo, branch, subpath = parse_tree(url)
owner, repo, branch, subpath
items = list_tree_recursive(owner, repo, branch, subpath)
items
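Putting the pieces together: every listed path can be converted with `to_raw_from_parts` and saved locally, keeping the repository layout. A sketch (not from the original notebook):

```python
import os
from urllib.request import Request, urlopen

def download_tree(owner, repo, branch, paths, dest="data_dl"):
    for path in paths:
        # Paths with special characters may need urllib.parse.quote first.
        raw = to_raw_from_parts(owner, repo, branch, path)
        local = os.path.join(dest, path)  # mirror the repo layout on disk
        os.makedirs(os.path.dirname(local), exist_ok=True)
        with urlopen(Request(raw, headers={"User-Agent": "mini-downloader/1.0"}), timeout=60) as r:
            payload = r.read()
        with open(local, "wb") as f:
            f.write(payload)
        print("saved", local)

download_tree(owner, repo, branch, items)
```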
api = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
with urlopen(Request(api)) as r:
data = json.loads(r.read().decode("utf-8","ignore"))
data
out = []
prefix = subpath
for node in data.get("tree", []):
print(node)
if node.get("type")=="blob":
path = node["path"]
if not prefix or path==prefix or path.startswith(prefix.rstrip("/")+"/"):
out.append(path)
out
Automatically converted from the Colab .ipynb to Hexo-friendly Markdown by a script. Assets (if any) live under md_assets/.