meet-hub/analyze_final.py

#!/usr/bin/env python3
"""
Analyze video transcript to identify topics and create chapter divisions.
"""

import json
import re
from datetime import timedelta

def seconds_to_timestamp(seconds):
    """Convert seconds to readable timestamp."""
    total_seconds = int(float(seconds))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

def load_transcript(file_path):
    """Load JSON transcript file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

def extract_segments(data):
    """Extract transcript segments with timestamps."""
    segments = []

    for track in data[0]['tracks']:
        if 'transcript' in track:
            for item in track['transcript']:
                start = float(item.get('start', 0))
                dur = float(item.get('dur', 0))
                text = item.get('text', '').strip()

                if text and text != '\n':
                    segments.append({
                        'start': start,
                        'end': start + dur,
                        'text': text
                    })

    # Sort by start time
    segments.sort(key=lambda x: x['start'])
    return segments

def extract_keywords(text):
    """Extract key topics from text."""
    keywords = {
        'Market & Community': ['market', 'pasar', 'grup', 'komunitas', 'telegram', 'facebook', 'forum'],
        'Problem Finding': ['masalah', 'problem', 'kesulitan', 'permasalahan', 'error', 'bermasalah'],
        'Exploration': ['explor', 'coba', 'trial', 'nyoba', 'eksplor', 'explore'],
        'Personal Branding': ['branding', 'personal branding', 'show off', 'image', 'eksistensi'],
        'AIDA/Funnel': ['aida', 'awareness', 'interest', 'desire', 'action', 'funel', 'funnel'],
        'Trust': ['trust', 'percaya', 'kepercayaan'],
        'Clients': ['klien', 'client', 'pelanggan', 'customer'],
        'Pricing': ['harga', 'price', 'bayar', 'budget', 'rp', 'juta', 'ribu', 'dibayar'],
        'Negotiation': ['tawar', 'negosiasi', 'deal'],
        'Services': ['jasa', 'service', 'website', 'plugin', 'elementor', 'instal'],
        'Cold/Warm/Hot Market': ['cold market', 'warm market', 'hot market', 'dingin', 'hangat'],
        'Network': ['network', 'jaringan', 'koneksi', 'hubungan'],
        'Sharing': ['sharing', 'share', 'bagi'],
        'Products': ['produk', 'product', 'template'],
        'Japri': ['japri', 'private', 'chat pribadi'],
    }

    found = []
    text_lower = text.lower()

    for topic, kw_list in keywords.items():
        count = sum(1 for kw in kw_list if kw.lower() in text_lower)
        if count > 0:
            found.append((topic, count))

    return sorted(found, key=lambda x: x[1], reverse=True)

def analyze_video():
    """Analyze the video transcript."""
    file_path = "/Users/dwindown/CascadeProjects/MeetDwindiCom/access-hub/Live Zoom - Diskusi Cara Jual Jasa via Online.json"

    print("="*80)
    print("VIDEO TRANSCRIPT ANALYSIS")
    print("Cara Jual Jasa via Online (How to Sell Services Online)")
    print("="*80)
    print()

    data = load_transcript(file_path)
    segments = extract_segments(data)

    print(f"Total segments: {len(segments)}")

    if not segments:
        print("No segments found!")
        return

    total_duration = segments[-1]['end']
    print(f"Total duration: {seconds_to_timestamp(total_duration)} ({total_duration/60:.1f} minutes)\n")

    # Create time-based groups every 5 minutes
    print("="*80)
    print("CONTENT BREAKDOWN BY 5-MINUTE INTERVALS")
    print("="*80)
    print()

    window = 300  # 5 minutes
    current_time = 0
    section_num = 1

    while current_time < total_duration:
        window_end = min(current_time + window, total_duration)
        window_segments = [s for s in segments
                          if current_time <= s['start'] < window_end]

        if window_segments:
            # Combine text
            combined_text = ' '.join([s['text'] for s in window_segments])

            # Extract keywords
            keywords = extract_keywords(combined_text)

            print(f"Section {section_num}: {seconds_to_timestamp(current_time)} - {seconds_to_timestamp(window_end)}")
            print("-" * 80)

            # Show first 400 characters as preview
            preview = combined_text[:400]
            print(f"Content: {preview}...")
            print()

            if keywords:
                print("Key topics detected:")
                for topic, count in keywords[:7]:
                    print(f"  • {topic}: {count} mentions")
            else:
                print("Key topics: (transition/break section)")

            print()
            print()

            section_num += 1

        current_time = window_end

    # Now create suggested chapters based on content analysis
    print("\n")
    print("="*80)
    print("SUGGESTED CHAPTER STRUCTURE")
    print("="*80)
    print()

    # Create larger 15-minute groups for chapter suggestions
    chapter_window = 900  # 15 minutes
    current_time = 0
    chapter_num = 1

    while current_time < total_duration:
        chapter_end = min(current_time + chapter_window, total_duration)
        chapter_segments = [s for s in segments
                           if current_time <= s['start'] < chapter_end]

        if chapter_segments:
            combined_text = ' '.join([s['text'] for s in chapter_segments])
            keywords = extract_keywords(combined_text)

            # Get top 3 keywords for chapter title
            main_topics = [kw[0] for kw in keywords[:3]]

            print(f"Chapter {chapter_num}: {seconds_to_timestamp(current_time)} - {seconds_to_timestamp(chapter_end)}")
            print(f"Main topics: {', '.join(main_topics)}")

            # Show first 300 chars
            preview = combined_text[:300].replace('\n', ' ')
            print(f"Preview: {preview}...")
            print()
            print()

            chapter_num += 1

        current_time = chapter_end

if __name__ == "__main__":
    analyze_video()