# Chrome浏览历史记录分析与可视化工具：洞察你的数字足迹

背景介绍

在数字时代，我们的浏览历史记录就像一本无声的日记，记录着我们的兴趣、工作习惯和时间分配。了解自己的浏览模式不仅能帮助优化时间管理，还能发现自己的兴趣所在和可能的效率瓶颈。今天，我将分享一个实用的Python工具，它能帮助你分析Chrome浏览器的历史记录，并通过可视化图表直观展示你的浏览习惯。

这个工具具有以下特点：
– 完全本地运行：无需上传任何数据到云端，保护隐私
– 跨平台支持：兼容Windows、macOS和Linux
– 多维度分析：提供网站访问频率、时段分布和时间趋势等多个维度的分析
– 直观可视化：生成清晰的图表帮助理解数据

思路分析

要实现这个工具，我们需要完成以下几个关键步骤：

跨平台路径检测：根据操作系统自动定位Chrome历史记录文件
数据提取与清洗：从SQLite数据库读取数据并进行预处理
多维度统计分析：计算访问频率、时段分布和时间趋势
可视化展示：使用Matplotlib生成多种图表
结果输出：在控制台显示关键统计信息并保存图表文件

接下来，让我们一步步实现这个工具。

代码实现

首先，确保你已经安装了所需的依赖库：

pip install pandas matplotlib

完整代码实现

import os
import sqlite3
import datetime
import platform
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# 设置中文字体支持
plt.rcParams['font.sans-serif'] = ['WenQuanYi Zen Hei']
plt.rcParams['axes.unicode_minus'] = False

def get_chrome_history_path():
    """根据操作系统获取Chrome历史记录默认路径"""
    os_type = platform.system()

    if os_type == 'Windows':
        # Windows路径
        return os.path.join(os.environ['USERPROFILE'], 
                           'AppData', 'Local', 'Google', 'Chrome', 'User Data', 'Default', 'History')
    elif os_type == 'Darwin':  # macOS
        return os.path.join(os.path.expanduser('~'), 
                           'Library', 'Application Support', 'Google', 'Chrome', 'Default', 'History')
    elif os_type == 'Linux':
        return os.path.join(os.path.expanduser('~'), 
                           '.config', 'google-chrome', 'Default', 'History')
    else:
        raise OSError(f"不支持的操作系统: {os_type}")

def read_chrome_history(db_path):
    """读取Chrome历史记录数据库并返回处理后的DataFrame"""
    # 连接到SQLite数据库
    conn = sqlite3.connect(db_path)

    # 查询urls和visits表，关联获取访问记录
    query = """
    SELECT 
        urls.url, 
        urls.title, 
        visits.visit_time, 
        COUNT(visits.id) as visit_count
    FROM urls 
    JOIN visits ON urls.id = visits.url 
    GROUP BY urls.id 
    ORDER BY visit_count DESC
    """

    df = pd.read_sql_query(query, conn)
    conn.close()

    # 转换时间格式（Chrome使用的是微秒级的时间戳，从1601-01-01开始）
    df['visit_time'] = pd.to_datetime(df['visit_time'], unit='us', origin='1601-01-01')

    # 提取域名作为网站名称
    df['domain'] = df['url'].str.extract(r'https?://([^/]+)')

    # 处理缺失的标题
    df['title'] = df['title'].fillna('无标题')

    return df

def analyze_data(df):
    """分析数据并返回统计结果"""
    # 1. 最常访问的TOP10网站
    top_sites = df.groupby('domain')['visit_count'].sum().sort_values(ascending=False).head(10)

    # 2. 每日各时段的访问频率分布
    df['hour'] = df['visit_time'].dt.hour
    hourly_distribution = df.groupby('hour')['visit_count'].sum()

    # 3. 最近7天的每日访问总量趋势
    df['date'] = df['visit_time'].dt.date
    recent_7_days = df[df['date'] >= (datetime.date.today() - datetime.timedelta(days=7))]
    daily_trend = recent_7_days.groupby('date')['visit_count'].sum()

    # 4. 总体统计信息
    total_records = df['visit_count'].sum()
    time_range = (df['visit_time'].min().strftime('%Y-%m-%d'), df['visit_time'].max().strftime('%Y-%m-%d'))
    avg_daily_visits = total_records / ((pd.to_datetime(time_range[1]) - pd.to_datetime(time_range[0])).days + 1)

    # 5. 高峰时段
    peak_hour = hourly_distribution.idxmax()
    peak_percentage = (hourly_distribution.max() / total_records) * 100

    return {
        'top_sites': top_sites,
        'hourly_distribution': hourly_distribution,
        'daily_trend': daily_trend,
        'total_records': total_records,
        'time_range': time_range,
        'avg_daily_visits': avg_daily_visits,
        'peak_hour': peak_hour,
        'peak_percentage': peak_percentage
    }

def visualize_data(analysis_result):
    """生成并保存可视化图表"""
    # 1. TOP10网站柱状图
    plt.figure(figsize=(10, 6))
    analysis_result['top_sites'].plot(kind='bar')
    plt.title('最常访问的TOP10网站')
    plt.xlabel('网站域名')
    plt.ylabel('访问次数')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('top_sites.png', dpi=300)
    plt.close()

    # 2. 时段分布热力图
    plt.figure(figsize=(12, 6))
    # 补全0-23小时的数据
    hourly_data = pd.Series(0, index=range(24))
    hourly_data.update(analysis_result['hourly_distribution'])
    plt.bar(range(24), hourly_data)
    plt.title('24小时访问频率分布')
    plt.xlabel('小时')
    plt.ylabel('访问次数')
    plt.xticks(range(24))
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('hourly_distribution.png', dpi=300)
    plt.close()

    # 3. 最近7天趋势图
    plt.figure(figsize=(10, 6))
    analysis_result['daily_trend'].plot(kind='line', marker='o')
    plt.title('最近7天访问趋势')
    plt.xlabel('日期')
    plt.ylabel('访问次数')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('daily_trend.png', dpi=300)
    plt.close()

def print_summary(analysis_result):
    """在控制台输出统计摘要"""
    print("=== Chrome历史记录分析结果 ===")
    print(f"总访问记录数：{int(analysis_result['total_records'])}条")
    print(f"覆盖时间范围：{analysis_result['time_range'][0]} 至 {analysis_result['time_range'][1]}")
    print(f"日均访问次数：{int(analysis_result['avg_daily_visits'])}次\n")

    print("最常访问TOP10网站：")
    for i, (site, count) in enumerate(analysis_result['top_sites'].items(), 1):
        print(f"{i}. {site} ({int(count)}次)")

    print(f"\n每日访问高峰时段：{analysis_result['peak_hour']}点（占总访问量的{analysis_result['peak_percentage']:.1f}%）")

    if not analysis_result['daily_trend'].empty:
        max_day = analysis_result['daily_trend'].idxmax()
        max_count = analysis_result['daily_trend'].max()
        print(f"最近7天访问趋势：{max_day}访问量最高（{int(max_count)}次）")

def main():
    try:
        # 尝试自动获取Chrome历史记录路径  
        db_path = get_chrome_history_path()

        # 如果路径不存在或无法访问，提示用户手动输入
        if not os.path.exists(db_path):
            raise FileNotFoundError(f"无法找到Chrome历史记录文件：{db_path}")

        print(f"找到Chrome历史记录文件：{db_path}")

    except Exception as e:
        print(f"自动检测失败：{e}")
        db_path = input("请手动输入Chrome历史记录文件路径：")

    try:
        # 读取和处理数据
        df = read_chrome_history(db_path)

        # 分析数据
        analysis_result = analyze_data(df)

        # 生成可视化图表
        visualize_data(analysis_result)

        # 输出统计摘要
        print_summary(analysis_result)

        print("\n可视化图表已保存到当前目录：")
        print("- top_sites.png (TOP10网站柱状图)")
        print("- hourly_distribution.png (时段分布热力图)")
        print("- daily_trend.png (最近7天趋势图)")

    except sqlite3.OperationalError as e:
        if "database is locked" in str(e):
            print("错误：Chrome浏览器正在运行中，请先关闭Chrome再尝试。")
        else:
            print(f"数据库操作错误：{e}")
    except Exception as e:
        print(f"发生错误：{e}")

if __name__ == "__main__":
    main()

代码解释

让我逐一解释关键部分的代码：

跨平台路径检测：
- get_chrome_history_path()函数根据操作系统类型返回不同的路径
- 使用platform.system()判断操作系统类型
- 使用os.environ['USERPROFILE']获取Windows用户目录，os.path.expanduser('~')获取macOS/Linux用户目录
数据读取与清洗：
- read_chrome_history()函数连接SQLite数据库并执行查询
- 使用pd.read_sql_query()读取数据到DataFrame
- 转换Chrome特有的时间格式（从1601-01-01开始的微秒级时间戳）
- 提取域名、处理缺失标题等数据清洗工作
数据分析：
- analyze_data()函数进行多维度分析
- 使用groupby()和sum()进行数据聚合
- 计算TOP10网站、时段分布和每日趋势
可视化：
- visualize_data()函数生成三种不同类型的图表
- 使用Matplotlib的plot()、bar()等方法绘制图表
- 保存图表为PNG文件
结果输出：
- print_summary()函数在控制台输出关键统计信息
- 包括总访问量、时间范围、TOP网站等

总结

这个Chrome浏览历史记录分析工具展示了数据分析的完整流程：从数据读取、清洗、分析到可视化。它不仅实用，能帮助你了解自己的浏览习惯，还具有很好的学习价值，覆盖了SQLite数据库操作、Pandas数据处理和Matplotlib可视化等重要技能。

通过这个工具，你可以：
– 发现自己最常访问的网站
– 了解自己的上网高峰时段
– 观察最近的浏览趋势变化
– 优化自己的时间管理

这个工具完全本地运行，保护你的隐私，而且代码简洁易懂，适合中级以下开发者学习和使用。希望这个工具能帮助你更好地了解自己的数字足迹！
“`

AI管家

# Chrome浏览历史记录分析与可视化工具：洞察你的数字足迹

背景介绍

思路分析

代码实现

完整代码实现

代码解释

总结

发表回复取消回复

# Chrome浏览历史记录分析与可视化工具：洞察你的数字足迹

背景介绍

思路分析

代码实现

完整代码实现

代码解释

总结

发表回复 取消回复

发表回复取消回复