Colab 学习笔记

本文整理自我的 Colab 笔记:保留核心文字与可复制的代码块,去除冗余输出与报错,适配 Hexo 博客。

# Resolve the parent directory of the current working directory.
import os

# The original cell first assigned `here = os.getcwd()` and immediately
# overwrote it on the next line — that dead store is removed here.
here = os.path.dirname(os.getcwd())
here
# First attempt: load the LIBSVM a9a dataset straight from its URL.
url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a"
print(url)
from sklearn.datasets import load_svmlight_file
# NOTE(review): load_svmlight_file expects a local path or file-like object,
# not an HTTP URL — this call fails; the download cell that follows is the fix.
Xtr, ytr = load_svmlight_file(url)
import requests

# Download the a9a file (LIBSVM sparse text format) and save it locally so
# load_svmlight_file can read it from disk.
url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a"
filename = "a9a.txt"

response = requests.get(url)
with open(filename, "wb") as f:
    f.write(response.content)

# The original line printed the literal "Downloaded (unknown)" — an f-string
# with no placeholder, an artifact of the notebook export. Report the file.
print(f"Downloaded {filename}")
# Load the previously downloaded a9a file into a feature matrix and labels.
from sklearn.datasets import load_svmlight_file

filename = "a9a.txt"
Xtr, ytr = load_svmlight_file(filename)

print("Data loaded successfully!")
print("Shape of Xtr:", Xtr.shape)  # (n_samples, n_features)
print("Shape of ytr:", ytr.shape)  # (n_samples,)
import os, sys, json
from urllib.parse import urlparse
from urllib.request import Request, urlopen
def hdr():
    """Build HTTP request headers, adding GitHub auth when a token is set.

    Reads the GITHUB_TOKEN environment variable; when present, a Bearer
    Authorization header is added and the GitHub REST API Accept type is
    used instead of the wildcard.
    """
    token = os.getenv("GITHUB_TOKEN")
    if not token:
        return {"User-Agent": "url-grabber/2.0", "Accept": "*/*"}
    return {
        "User-Agent": "url-grabber/2.0",
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {token}",
    }
def http_get(url: str) -> bytes:
    """Fetch *url* with the shared headers and return the raw response body."""
    request = Request(url, headers=hdr())
    with urlopen(request, timeout=60) as response:
        return response.read()
# Fetch the GitHub *blob* page URL directly.
# NOTE(review): a github.com /blob/ URL serves the HTML viewer page, not the
# raw file — presumably why the raw-URL rewrite is introduced further down.
# Re-binding the response bytes to the name `url` also shadows the URL string.
url = http_get("https://github.com/location-competition/indoor-location-competition-20/blob/master/data/site1/F1/floor_info.json")
url
def http_get(url: str) -> bytes:
    """GET *url* with no custom headers and return the raw response bytes."""
    req = Request(url)
    with urlopen(req, timeout=60) as resp:
        return resp.read()
# Retry the blob URL with the header-less http_get, then try to parse it.
url = http_get("https://github.com/location-competition/indoor-location-competition-20/blob/master/data/site1/F1/floor_info.json")
url
# NOTE(review): the bytes fetched from a /blob/ page are HTML, so parsing
# them as JSON presumably failed here — kept as notebook history.
arr = json.loads(url.decode("utf-8", "ignore"))
arr
def http_get(url: str) -> bytes:
    """Fetch a GitHub file by rewriting its /blob/ page URL to the raw host."""
    path = urlparse(url).path.replace('/blob/', '/')
    raw_url = f"https://raw.githubusercontent.com{path}"
    request = Request(raw_url, headers=hdr())
    with urlopen(request, timeout=60) as response:
        return response.read()
# Fetch the floor GeoJSON map via the raw-content rewrite and parse it.
url = http_get("https://github.com/location-competition/indoor-location-competition-20/blob/master/data/site1/F1/geojson_map.json")
url
arr = json.loads(url.decode("utf-8", "ignore"))
arr
def http_get(url: str) -> bytes:
    """Download the raw contents behind a GitHub /blob/ page URL."""
    parsed = urlparse(url)
    raw = "https://raw.githubusercontent.com" + parsed.path.replace('/blob/', '/')
    with urlopen(Request(raw, headers=hdr()), timeout=60) as reply:
        return reply.read()
# Fetch one trajectory ("path data") text file from the dataset.
url = http_get("https://github.com/location-competition/indoor-location-competition-20/blob/master/data/site1/F1/path_data_files/5dd9e7aac5b77e0006b1732b.txt")
url
# NOTE(review): a .txt trajectory file is presumably not JSON, so json.loads
# likely failed in the original notebook — kept as history.
arr = json.loads(url.decode("utf-8", "ignore"))
arr
# urlparse on the *bytes* response (`url` was re-bound above); this yields a
# ParseResultBytes whose components are bytes.
p = urlparse(url)
p
p[2]  # index 2 of the parse result tuple is the path component
def parse_tree(url: str):
    """Split a GitHub directory URL into (owner, repo, branch, subpath).

    Expects https://github.com/<owner>/<repo>/tree/<branch>/<subpath...>
    and raises ValueError for anything else.
    """
    segments = [seg for seg in urlparse(url).path.strip("/").split("/") if seg]
    if len(segments) < 4 or segments[2] != "tree":
        raise ValueError("不是 /tree/ 目录链接")
    owner, repo, _, branch, *rest = segments
    return owner, repo, branch, "/".join(rest)
# Target directory: the B1-floor trajectory files of site1.
url = "https://github.com/location-competition/indoor-location-competition-20/tree/master/data/site1/B1/path_data_files"
def http_get(url: str) -> bytes:
    # Re-definition identical to the earlier version (notebook history):
    # rewrite a GitHub /blob/ page URL onto the raw-content host and fetch it.
    # Construct the raw URL for GitHub
    parsed_url = urlparse(url)
    # NOTE(review): only the URL *path* survives this rewrite — the query
    # string is dropped and the host is forced to raw.githubusercontent.com,
    # so the api.github.com URL built by list_dir_via_api below cannot reach
    # the API through this helper; confirm against the original notebook run.
    raw_url = f"https://raw.githubusercontent.com{parsed_url.path.replace('/blob/', '/')}"
    with urlopen(Request(raw_url, headers=hdr()), timeout=60) as r:
        return r.read()
# Decompose the /tree/ URL into its repository coordinates.
owner, repo, branch, subpath = parse_tree(url)
owner, repo, branch, subpath
def list_dir_via_api(owner: str, repo: str, branch: str, subpath: str):
    """List the files directly under *subpath* via the GitHub contents API."""
    api = f"https://api.github.com/repos/{owner}/{repo}/contents/{subpath}?ref={branch}"
    payload = json.loads(http_get(api).decode("utf-8", "ignore"))
    # A single-file path yields one dict rather than a list of entries.
    if isinstance(payload, dict) and payload.get("type") == "file":
        return [payload]
    # Only the current directory is listed (no recursion).
    return [entry for entry in payload if entry.get("type") == "file"]
# List the files in the target directory via the contents API.
items = list_dir_via_api(owner, repo, branch, subpath)
items
from urllib.parse import urlparse, quote
from urllib.request import Request, urlopen

def to_raw_github(url: str) -> str:
    """Map a GitHub /blob/ page URL to its raw.githubusercontent.com form.

    Already-raw URLs are returned untouched; anything unrecognized is
    also returned unchanged.
    """
    if url.startswith("https://raw.githubusercontent.com/"):
        return url
    if "github.com" not in url or "/blob/" not in url:
        return url
    segments = [seg for seg in urlparse(url).path.split("/") if seg]
    # Expected shape: /owner/repo/blob/branch/path/to/file
    if len(segments) < 5 or segments[2] != "blob":
        return url
    owner, repo, _, branch, *rest = segments
    tail = "/".join(quote(seg) for seg in rest)
    return f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{tail}"

def http_get(url: str, timeout: int = 60) -> bytes:
    """Fetch *url* as bytes; GitHub /blob/ URLs are first mapped to raw."""
    target = to_raw_github(url)
    request = Request(target, headers={"User-Agent": "mini-downloader/1.0"})
    with urlopen(request, timeout=timeout) as response:
        return response.read()

# 用法:
# data = http_get("https://github.com/.../blob/.../file.txt")
# with open("file.txt","wb") as f: f.write(data)
# A /tree/ (directory) URL: to_raw_github leaves it unchanged, so this
# fetches the GitHub HTML directory page rather than file contents.
url = "https://github.com/location-competition/indoor-location-competition-20/tree/master/data/site1/F2"
r = http_get(url)
r
# NOTE(review): r holds HTML here, so parsing it as JSON presumably failed in
# the original notebook (errors were stripped during export).
arr = json.loads(r.decode("utf-8", "ignore"))
arr
def list_tree_recursive(owner, repo, branch, prefix=""):
    """Return every blob path in *branch*, optionally filtered by *prefix*.

    Uses the git trees API with recursive=1; *prefix* keeps only paths that
    equal it or sit underneath it as a directory.
    """
    api = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
    with urlopen(Request(api)) as resp:
        payload = json.loads(resp.read().decode("utf-8","ignore"))
    under = prefix.rstrip("/") + "/"
    paths = []
    for entry in payload.get("tree", []):
        if entry.get("type") != "blob":
            continue
        path = entry["path"]
        if not prefix or path == prefix or path.startswith(under):
            paths.append(path)
    return paths
def parse_tree(url: str):
    """Split https://github.com/<owner>/<repo>/tree/<branch>/<subpath...>
    into (owner, repo, branch, subpath).

    Raises ValueError when the URL is not a /tree/ directory link.
    """
    p = urlparse(url)
    parts = [x for x in p.path.strip("/").split("/") if x]
    # `assert` is stripped under `python -O`, so validate with an explicit
    # raise; this also matches the earlier parse_tree definition above.
    if len(parts) < 4 or parts[2] != "tree":
        raise ValueError("不是 /tree/ 目录链接")
    owner, repo, branch = parts[0], parts[1], parts[3]
    subpath = "/".join(parts[4:]) if len(parts) > 4 else ""
    return owner, repo, branch, subpath
def to_raw_from_parts(owner, repo, branch, path):
    """Assemble the raw.githubusercontent.com URL for one repository file."""
    return "/".join(
        ["https://raw.githubusercontent.com", owner, repo, branch, path]
    )

# 用法:
# paths = list_tree_recursive(owner,repo,branch,"data/site1/B1/path_data_files")
# for p in paths:
#     raw = to_raw_from_parts(owner,repo,branch,p)
#     ...
# Re-parse the current /tree/ URL and list every file beneath its subpath.
owner, repo, branch, subpath = parse_tree(url)
owner, repo, branch, subpath
items = list_tree_recursive(owner, repo, branch, subpath)
items
# Inline replay of list_tree_recursive for inspection: fetch the full
# recursive tree listing, dump each node, and collect matching blob paths.
api = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
with urlopen(Request(api)) as r:
    data = json.loads(r.read().decode("utf-8","ignore"))
data
out = []
prefix = subpath
for node in data.get("tree", []):
    print(node)  # debug: show every tree node as it is examined
    if node.get("type")=="blob":
        path = node["path"]
        # Keep files that equal the prefix or sit underneath it.
        if not prefix or path==prefix or path.startswith(prefix.rstrip("/")+"/"):
            out.append(path)
out

由脚本自动从 Colab .ipynb 整理为 Hexo 友好 Markdown。资源(若有)位于 md_assets/。