Improve ObjectEditor and Add TableEditor
This commit is contained in:
147
src/utils/browserCompat.js
Normal file
147
src/utils/browserCompat.js
Normal file
@@ -0,0 +1,147 @@
|
||||
// Browser compatibility utilities for handling different browser environments
|
||||
|
||||
/**
 * Detect if the app is running in Telegram's built-in browser.
 *
 * Signals checked, in order:
 *  1. A Telegram token in the user agent (`telegram`, `tgios`, `tgandroid`).
 *  2. The `window.TelegramWebviewProxy` bridge object.
 *  3. Heuristic fallback for embedded web views that advertise nothing: a
 *     mobile WebKit user agent carrying neither a `chrome` nor a `safari` token.
 *
 * The fallback previously only excluded `chrome`, so it also matched regular
 * Safari on iOS and the other iOS browsers (their user agents contain
 * `Mobile`, `AppleWebKit` and `Safari`, but not `Chrome`). Requiring the
 * absence of `safari` restricts the heuristic to bare web views.
 * NOTE(review): the fallback still cannot tell Telegram apart from other apps
 * that embed a bare WebKit view; treat a match from it as a best guess.
 *
 * @returns {boolean} true when any signal matches; false otherwise, including
 *   when `navigator` / `window` are not available.
 */
export const isTelegramBrowser = () => {
  // Guard the globals so the helper returns false instead of throwing when it
  // is evaluated outside a browser.
  const userAgent =
    typeof navigator !== 'undefined' && typeof navigator.userAgent === 'string'
      ? navigator.userAgent.toLowerCase()
      : '';

  const hasTelegramToken = ['telegram', 'tgios', 'tgandroid'].some((token) =>
    userAgent.includes(token)
  );
  if (hasTelegramToken) {
    return true;
  }

  // Check for Telegram-specific window properties
  if (typeof window !== 'undefined' && window.TelegramWebviewProxy !== undefined) {
    return true;
  }

  // Bare mobile WebKit view: no full-browser token present.
  return (
    userAgent.includes('mobile') &&
    userAgent.includes('webkit') &&
    !userAgent.includes('chrome') &&
    !userAgent.includes('safari')
  );
};
|
||||
|
||||
/**
 * Detect if the app is running in any mobile in-app browser.
 *
 * The lower-cased user agent is tested against a list of markers. The two
 * short markers are matched as tokens rather than raw substrings, because a
 * plain `includes` also fires on unrelated text:
 *  - `wv` (the Android WebView flag, as in `; wv)`) would match device models
 *    such as `HWV31`;
 *  - `line` (the LINE app, as in `Line/11.0.0`) would match any word that
 *    merely contains "line".
 *
 * @returns {boolean} true when a marker is found; false otherwise, including
 *   when `navigator` is not available.
 */
export const isInAppBrowser = () => {
  const userAgent =
    typeof navigator !== 'undefined' && typeof navigator.userAgent === 'string'
      ? navigator.userAgent.toLowerCase()
      : '';

  const inAppMarkers = [
    /\bwv\b/, // WebView
    /telegram/,
    /fbav/, // Facebook
    /fban/, // Facebook
    /instagram/,
    /twitter/,
    /\bline\//, // LINE
    /whatsapp/
  ];

  return inAppMarkers.some((marker) => marker.test(userAgent));
};
|
||||
|
||||
/**
 * Collect a snapshot of what can be told about the current browser.
 *
 * @returns {{userAgent: string, isTelegram: boolean, isInApp: boolean,
 *   isMobile: boolean, isIOS: boolean, isAndroid: boolean}} the raw
 *   `navigator.userAgent` plus flags derived from it and from the sibling
 *   detectors `isTelegramBrowser` / `isInAppBrowser`.
 */
export const getBrowserInfo = () => {
  const { userAgent } = navigator;
  const uaMatches = (pattern) => pattern.test(userAgent);

  const info = { userAgent };
  info.isTelegram = isTelegramBrowser();
  info.isInApp = isInAppBrowser();
  // Platform flags come straight from well-known user-agent tokens.
  info.isMobile = uaMatches(/Android|webOS|iPhone|iPad|iPod|BlackBerry|IEMobile|Opera Mini/i);
  info.isIOS = uaMatches(/iPad|iPhone|iPod/);
  info.isAndroid = uaMatches(/Android/);
  return info;
};
|
||||
|
||||
/**
 * Add polyfills and compatibility fixes for problematic browsers.
 *
 * Installs, only where needed:
 *  - no-op `console` methods (`log`, `error`, `warn`, `info`, `debug`);
 *  - `requestAnimationFrame` / `cancelAnimationFrame` backed by timers;
 *  - an in-memory replacement for `localStorage` / `sessionStorage` when the
 *    real one is missing or throws on use.
 *
 * Everything is read from and written to `window`, so the function must be
 * called in a context where `window` exists.
 *
 * @returns {void}
 */
export const addCompatibilityFixes = () => {
  const noop = () => {};

  // Fix for missing or problematic console methods in some browsers
  if (!window.console) {
    window.console = {};
  }
  ['log', 'error', 'warn', 'info', 'debug'].forEach((method) => {
    if (typeof window.console[method] !== 'function') {
      window.console[method] = noop;
    }
  });

  // Frame callbacks expect a timestamp argument; a bare setTimeout(callback)
  // would invoke them with none.
  const currentTime = () =>
    typeof performance !== 'undefined' && typeof performance.now === 'function'
      ? performance.now()
      : Date.now();

  // Add requestAnimationFrame polyfill if missing
  if (!window.requestAnimationFrame) {
    window.requestAnimationFrame = (callback) =>
      setTimeout(() => callback(currentTime()), 1000 / 60);
  }

  // Add cancelAnimationFrame polyfill if missing
  if (!window.cancelAnimationFrame) {
    window.cancelAnimationFrame = (id) => {
      clearTimeout(id);
    };
  }

  // Minimal in-memory stand-in for the Storage interface. Values live only as
  // long as the page does, but reads see earlier writes.
  const createMemoryStorage = () => {
    const entries = new Map();
    return {
      getItem: (key) => (entries.has(String(key)) ? entries.get(String(key)) : null),
      setItem: (key, value) => {
        entries.set(String(key), String(value));
      },
      removeItem: (key) => {
        entries.delete(String(key));
      },
      clear: () => {
        entries.clear();
      },
      key: (index) => [...entries.keys()][index] ?? null,
      get length() {
        return entries.size;
      }
    };
  };

  // Write-then-remove probe. The key is namespaced so that probing never
  // deletes an application entry (a generic key such as 'test' could).
  const isStorageUsable = (name) => {
    const probeKey = '__browser_compat_probe__';
    try {
      const storage = window[name];
      storage.setItem(probeKey, probeKey);
      storage.removeItem(probeKey);
      return true;
    } catch {
      return false;
    }
  };

  // Fix for missing or problematic localStorage / sessionStorage.
  // The storage properties are getter-only, so a plain assignment
  // (`window.localStorage = ...`) throws in strict-mode code such as this
  // module; redefine the property instead.
  ['localStorage', 'sessionStorage'].forEach((name) => {
    if (isStorageUsable(name)) {
      return;
    }
    try {
      Object.defineProperty(window, name, {
        value: createMemoryStorage(),
        configurable: true,
        writable: true
      });
    } catch (error) {
      // Best effort: leave the environment as it is if the property cannot be
      // redefined.
      window.console.warn(`Unable to install in-memory ${name} fallback:`, error);
    }
  });
};
|
||||
|
||||
/**
 * Initialize compatibility fixes.
 *
 * Order of operations:
 *  1. `addCompatibilityFixes()` runs first, so the `console` shim is in place
 *     before anything below logs (logging first would throw in a browser that
 *     has no `console`, which is one of the cases the shim exists for).
 *  2. Browser information is collected via `getBrowserInfo()` and logged.
 *  3. When the Telegram browser is detected, global `error` and
 *     `unhandledrejection` listeners are registered on `window`; each logs the
 *     problem and calls `event.preventDefault()`.
 *
 * Listeners are added on every call; the function is meant to be called once.
 *
 * @returns {object} the value returned by `getBrowserInfo()`.
 */
export const initBrowserCompat = () => {
  // Add compatibility fixes
  addCompatibilityFixes();

  const browserInfo = getBrowserInfo();

  // Log browser info for debugging
  console.log('Browser Info:', browserInfo);

  // Add specific fixes for Telegram browser
  if (browserInfo.isTelegram) {
    console.log('Telegram browser detected - applying compatibility fixes');

    // Add Telegram-specific error handling. The return value of a listener
    // registered through addEventListener is ignored, so preventDefault() is
    // the only thing that suppresses the default error reporting here.
    window.addEventListener('error', (event) => {
      console.log('Global error caught in Telegram browser:', event.error);
      // Prevent the error from bubbling up and showing the error overlay
      event.preventDefault();
    });

    window.addEventListener('unhandledrejection', (event) => {
      console.log('Unhandled promise rejection in Telegram browser:', event.reason);
      // Prevent the error from bubbling up
      event.preventDefault();
    });
  }

  return browserInfo;
};
|
||||
371
src/utils/contentExtractor.js
Normal file
371
src/utils/contentExtractor.js
Normal file
@@ -0,0 +1,371 @@
|
||||
// Content extraction and article detection utilities
|
||||
|
||||
/**
 * Content classification types.
 *
 * The values are the identifiers produced by `classifyContent` and used as
 * keys of `CONTENT_TYPE_INFO`. The object is frozen so that a consumer cannot
 * alter the shared identifiers by accident.
 */
export const CONTENT_TYPES = Object.freeze({
  RICH_ARTICLE: 'rich_article',
  GENERAL_CONTENT: 'general_content',
  LIMITED_CONTENT: 'limited_content',
  NO_CONTENT: 'no_content'
});
|
||||
|
||||
/**
 * Content type display information.
 *
 * Maps every value of `CONTENT_TYPES` to the data needed to present it:
 *  - `label`: short human-readable name;
 *  - `emoji`: coloured status marker;
 *  - `description`: one-line explanation of the classification;
 *  - `color`: CSS utility classes (light and `dark:` variants) for the text.
 *
 * Both the map and its entries are frozen: they are shared display constants
 * and are not meant to be modified by consumers.
 */
export const CONTENT_TYPE_INFO = Object.freeze({
  [CONTENT_TYPES.RICH_ARTICLE]: Object.freeze({
    label: 'Rich Article Content',
    emoji: '🟢',
    description: 'Clear article structure with headings and paragraphs',
    color: 'text-green-600 dark:text-green-400'
  }),
  [CONTENT_TYPES.GENERAL_CONTENT]: Object.freeze({
    label: 'General Web Content',
    emoji: '🟡',
    description: 'Readable text mixed with navigation and UI elements',
    color: 'text-yellow-600 dark:text-yellow-400'
  }),
  [CONTENT_TYPES.LIMITED_CONTENT]: Object.freeze({
    label: 'Limited Text Content',
    emoji: '🟠',
    description: 'Mostly UI/navigation with minimal readable text',
    color: 'text-orange-600 dark:text-orange-400'
  }),
  [CONTENT_TYPES.NO_CONTENT]: Object.freeze({
    label: 'No Readable Content',
    emoji: '🔴',
    description: 'Images, videos, or heavily JavaScript-dependent content',
    color: 'text-red-600 dark:text-red-400'
  })
});
|
||||
|
||||
/**
 * CORS proxy services for fetching external content, tried in this order.
 * Each entry is a URL prefix to which the target URL is appended.
 */
const CORS_PROXIES = [
  'https://api.allorigins.win/get?url=',
  'https://corsproxy.io/?',
  'https://cors-anywhere.herokuapp.com/',
  'https://thingproxy.freeboard.io/fetch/'
];

/**
 * Fetch and parse HTML content from URL with CORS proxy fallback.
 *
 * Strategy:
 *  1. Validate that `url` is an absolute http(s) URL.
 *  2. Try a direct `fetch`; it is used when the response is OK and declares a
 *     `text/html` content type.
 *  3. Otherwise try each entry of `CORS_PROXIES` in order and return the first
 *     usable response. The AllOrigins proxy answers with JSON (`contents`,
 *     optionally `status.url`); the other proxies answer with the page body.
 *
 * The target URL is sent to third-party proxy services whenever the direct
 * request does not succeed.
 *
 * @param {string} url - Absolute http(s) URL of the page to fetch.
 * @returns {Promise<{html: string, url: string, contentType: string}>} the
 *   markup, the URL it was obtained for, and the reported content type.
 * @throws {Error} with a message starting with "Failed to fetch content: "
 *   when the URL is invalid, uses another protocol, or every method fails.
 *   The underlying error is attached as `cause`.
 */
export const fetchUrlContent = async (url) => {
  const ACCEPT_HTML = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';

  try {
    // Validate URL
    const urlObj = new URL(url);
    if (!['http:', 'https:'].includes(urlObj.protocol)) {
      throw new Error('Only HTTP and HTTPS URLs are supported');
    }

    // First try direct fetch (works for same-origin or CORS-enabled sites).
    // Only the Accept header is sent: a custom User-Agent is either dropped by
    // the browser or, where it is honoured, turns the request into one that
    // needs a CORS preflight, which makes the direct attempt more likely to fail.
    try {
      const response = await fetch(url, {
        method: 'GET',
        headers: { 'Accept': ACCEPT_HTML }
      });

      if (response.ok) {
        const contentType = response.headers.get('content-type') || '';
        if (contentType.includes('text/html')) {
          const html = await response.text();
          return { html, url: response.url, contentType };
        }
      }
    } catch (directError) {
      console.log('Direct fetch failed, trying CORS proxy:', directError.message);
    }

    // Try CORS proxies
    let lastError = null;

    for (const proxy of CORS_PROXIES) {
      try {
        if (proxy.includes('allorigins.win')) {
          // AllOrigins returns JSON with contents
          const response = await fetch(`${proxy}${encodeURIComponent(url)}`);
          if (!response.ok) {
            lastError = new Error(`Proxy ${proxy} responded with HTTP ${response.status}`);
            continue;
          }

          const data = await response.json();
          if (data?.contents) {
            return {
              html: data.contents,
              // `status` is optional in the payload; fall back to the request URL.
              url: data.status?.url || url,
              contentType: 'text/html'
            };
          }
          lastError = new Error(`Proxy ${proxy} returned no contents`);
        } else {
          // Other proxies return HTML directly. When the prefix carries the
          // target inside a query string, percent-encode it so that the
          // target's own query parameters stay part of one value.
          const target = proxy.includes('?') ? encodeURIComponent(url) : url;
          const response = await fetch(`${proxy}${target}`, {
            headers: { 'Accept': ACCEPT_HTML }
          });
          if (!response.ok) {
            lastError = new Error(`Proxy ${proxy} responded with HTTP ${response.status}`);
            continue;
          }

          const contentType = response.headers.get('content-type') || 'text/html';
          if (contentType.includes('text/html') || contentType.includes('text/plain')) {
            const html = await response.text();
            return { html, url, contentType };
          }
          lastError = new Error(`Proxy ${proxy} returned unsupported content type: ${contentType}`);
        }
      } catch (proxyError) {
        lastError = proxyError;
        console.log(`Proxy ${proxy} failed:`, proxyError.message);
      }
    }

    throw new Error(`All fetch methods failed. Last error: ${lastError?.message || 'Unknown error'}`);
  } catch (error) {
    throw new Error(`Failed to fetch content: ${error.message}`, { cause: error });
  }
};
|
||||
|
||||
/**
 * Turn an HTML string into a DOM document.
 *
 * @param {string} html - Markup to parse.
 * @returns {Document} the result of `DOMParser#parseFromString` in
 *   `text/html` mode.
 */
export const parseHtml = (html) => new DOMParser().parseFromString(html, 'text/html');
|
||||
|
||||
/**
 * Detect article elements and structure.
 *
 * Fills in:
 *  - `hasArticleTag` / `hasMainTag`: an `article` / `main` element exists;
 *  - `headingCount` / `paragraphCount`: number of `h1`-`h6` / `p` elements;
 *  - `hasMetaArticle`: an `og:type` meta tag with content `article`, or any
 *    meta tag whose `property` or `name` starts with `article:`;
 *  - `hasJsonLd`: a JSON-LD script declares an `Article`, `NewsArticle` or
 *    `BlogPosting`. Types are looked up on the top-level object, on every item
 *    of a top-level array, and inside `@graph`; `@type` may be a string or an
 *    array of strings. Scripts that are not valid JSON are ignored.
 *
 * `wordCount` and `linkDensity` are part of the returned shape but are left at
 * 0 by this function.
 *
 * @param {Document} doc - Parsed document to inspect.
 * @returns {{hasArticleTag: boolean, hasMainTag: boolean, headingCount: number,
 *   paragraphCount: number, hasMetaArticle: boolean, hasJsonLd: boolean,
 *   wordCount: number, linkDensity: number}}
 */
export const detectArticleStructure = (doc) => {
  const ARTICLE_TYPES = ['Article', 'NewsArticle', 'BlogPosting'];

  // Gather every @type reachable from a parsed JSON-LD value.
  const collectTypes = (node) => {
    if (Array.isArray(node)) {
      return node.flatMap(collectTypes);
    }
    if (node === null || typeof node !== 'object') {
      return [];
    }
    const declared = node['@type'];
    const ownTypes = Array.isArray(declared) ? declared : [declared];
    return [...ownTypes, ...collectTypes(node['@graph'])];
  };

  const structure = {
    hasArticleTag: false,
    hasMainTag: false,
    headingCount: 0,
    paragraphCount: 0,
    hasMetaArticle: false,
    hasJsonLd: false,
    wordCount: 0,
    linkDensity: 0
  };

  // Check for semantic HTML5 tags
  structure.hasArticleTag = doc.querySelector('article') !== null;
  structure.hasMainTag = doc.querySelector('main') !== null;

  // Count headings
  structure.headingCount = doc.querySelectorAll('h1, h2, h3, h4, h5, h6').length;

  // Count paragraphs
  structure.paragraphCount = doc.querySelectorAll('p').length;

  // Check meta tags for articles. `article:*` metadata may be carried by
  // either the `property` or the `name` attribute, so both are inspected.
  const metaTags = doc.querySelectorAll(
    'meta[property^="og:"], meta[property^="article:"], meta[name^="article:"]'
  );
  structure.hasMetaArticle = Array.from(metaTags).some((meta) => {
    const property = meta.getAttribute('property');
    const name = meta.getAttribute('name');
    return (
      (property === 'og:type' && meta.getAttribute('content') === 'article') ||
      Boolean(property?.startsWith('article:')) ||
      Boolean(name?.startsWith('article:'))
    );
  });

  // Check for JSON-LD structured data
  const jsonLdScripts = doc.querySelectorAll('script[type="application/ld+json"]');
  structure.hasJsonLd = Array.from(jsonLdScripts).some((script) => {
    try {
      const types = collectTypes(JSON.parse(script.textContent));
      return types.some((type) => ARTICLE_TYPES.includes(type));
    } catch {
      return false;
    }
  });

  return structure;
};
|
||||
|
||||
/**
 * Extract clean text from article elements.
 *
 * Steps:
 *  1. Pick a container: the first match from a prioritised selector list
 *     (article selectors, most specific first, then generic main-content
 *     selectors), falling back to `doc.body`.
 *  2. Work on a deep clone of the container, so `doc` is not modified, and
 *     remove scripts, styles, navigation, ads and similar non-content nodes.
 *  3. Collect headings, paragraphs, list items, block quotes and `pre` blocks.
 *     A block nested inside another collected block (for example a `p` inside
 *     a `blockquote` or `li`) is skipped: its text is already part of the
 *     outer block's text, and emitting both would duplicate it.
 *
 * @param {Document} doc - Parsed document.
 * @returns {{text: string, elements: Array<{tag: string, text: string, length: number}>}}
 *   `elements` lists the non-empty blocks in document order (`tag` lower-cased,
 *   `text` trimmed, `length` the trimmed length); `text` joins their texts with
 *   blank lines. Both are empty when no container is available.
 */
export const extractArticleText = (doc) => {
  const TEXT_BLOCK_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, li, blockquote, pre';

  // Ordered by priority. The scoped article selectors come before the bare
  // 'article' selector; listed after it they could never be selected, because
  // anything they match is also matched by 'article'.
  const containerSelectors = [
    'main article',
    '[role="main"] article',
    'article',
    '.article-content',
    '.post-content',
    '.entry-content',
    '.content-body',
    // Generic main content areas, used when no article container exists
    'main',
    '[role="main"]',
    '#main',
    '#content',
    '.main-content'
  ];

  let articleContainer = null;
  for (const selector of containerSelectors) {
    articleContainer = doc.querySelector(selector);
    if (articleContainer) break;
  }

  // Extract text from container or full document
  const container = articleContainer || doc.body;

  if (!container) return { text: '', elements: [] };

  // Remove unwanted elements
  const unwantedSelectors = [
    'script', 'style', 'nav', 'header', 'footer', 'aside',
    '.navigation', '.nav', '.menu', '.sidebar', '.ads', '.advertisement',
    '.social-share', '.comments', '.related-posts', '.author-bio'
  ];

  const clone = container.cloneNode(true);
  unwantedSelectors.forEach((selector) => {
    clone.querySelectorAll(selector).forEach((el) => el.remove());
  });

  // True when `el` sits inside another collected block. The clone root itself
  // is never collected, so it does not count as an enclosing block.
  const isNestedBlock = (el) => {
    const enclosing = el.parentElement?.closest(TEXT_BLOCK_SELECTOR);
    return Boolean(enclosing) && enclosing !== clone;
  };

  // Extract text from meaningful elements
  const elements = Array.from(clone.querySelectorAll(TEXT_BLOCK_SELECTOR))
    .filter((el) => !isNestedBlock(el))
    .map((el) => {
      const text = el.textContent.trim();
      return { tag: el.tagName.toLowerCase(), text, length: text.length };
    })
    .filter((el) => el.length > 0);

  const text = elements.map((el) => el.text).join('\n\n');

  return { text, elements };
};
|
||||
|
||||
/**
 * Extract all visible text from page.
 *
 * Clones `doc.body` (so `doc` is not modified), drops `script`, `style` and
 * `noscript` elements from the clone, and returns the remaining text with
 * every run of whitespace collapsed to a single space and the ends trimmed.
 *
 * @param {Document} doc - Parsed document.
 * @returns {string} the normalised text, or an empty string when the document
 *   has no body (the same situation `extractArticleText` answers with an
 *   empty result).
 */
export const extractAllText = (doc) => {
  if (!doc.body) return '';

  const clone = doc.body.cloneNode(true);

  // Remove unwanted elements
  for (const selector of ['script', 'style', 'noscript']) {
    clone.querySelectorAll(selector).forEach((el) => el.remove());
  }

  const text = clone.textContent || clone.innerText || '';
  return text.replace(/\s+/g, ' ').trim();
};
|
||||
|
||||
/**
 * Derive size and density figures for a parsed page.
 *
 * @param {Document} doc - Document whose links, images, headings and
 *   paragraphs are counted.
 * @param {string} articleText - Text extracted from the article container.
 * @param {string} allText - Text of the whole page.
 * @returns {{articleWordCount: number, totalWordCount: number,
 *   contentRatio: number, linkCount: number, imageCount: number,
 *   headingCount: number, paragraphCount: number, linkDensity: number}}
 *   `contentRatio` is article words divided by total words and `linkDensity`
 *   is links divided by total words; both stay 0 when the page has no words.
 */
export const calculateContentMetrics = (doc, articleText, allText) => {
  // Words are the non-empty pieces left after splitting on whitespace.
  const countWords = (text) => text.split(/\s+/).filter((word) => word.length > 0).length;
  const countMatches = (selector) => doc.querySelectorAll(selector).length;

  const articleWordCount = countWords(articleText);
  const totalWordCount = countWords(allText);
  const linkCount = countMatches('a[href]');
  const imageCount = countMatches('img');
  const headingCount = countMatches('h1, h2, h3, h4, h5, h6');
  const paragraphCount = countMatches('p');

  const hasWords = totalWordCount > 0;

  return {
    articleWordCount,
    totalWordCount,
    contentRatio: hasWords ? articleWordCount / totalWordCount : 0,
    linkCount,
    imageCount,
    headingCount,
    paragraphCount,
    linkDensity: hasWords ? linkCount / totalWordCount : 0
  };
};
|
||||
|
||||
/**
 * Map structure flags and metrics onto one of the `CONTENT_TYPES` values.
 *
 * Rules are evaluated top to bottom and the first one that applies wins:
 *  - rich article: semantic or heading/paragraph structure, at least 300
 *    article words and a content ratio above 0.6;
 *  - general content: at least 100 article words and a content ratio above 0.3;
 *  - limited content: at least 20 article words;
 *  - otherwise: no readable content.
 *
 * @param {object} structure - Result of `detectArticleStructure`.
 * @param {object} metrics - Result of `calculateContentMetrics`.
 * @param {string} articleText - Accepted for call compatibility; not read.
 * @returns {string} one of the `CONTENT_TYPES` values.
 */
export const classifyContent = (structure, metrics, articleText) => {
  const { articleWordCount: words, contentRatio: ratio } = metrics;

  const semanticMarkup = structure.hasArticleTag || structure.hasMainTag || structure.hasMetaArticle;
  const denseMarkup = structure.headingCount >= 2 && structure.paragraphCount >= 3;

  // [classification, applies] pairs in priority order.
  const rules = [
    [CONTENT_TYPES.RICH_ARTICLE, (semanticMarkup || denseMarkup) && words >= 300 && ratio > 0.6],
    [CONTENT_TYPES.GENERAL_CONTENT, words >= 100 && ratio > 0.3],
    [CONTENT_TYPES.LIMITED_CONTENT, words >= 20]
  ];

  const firstMatch = rules.find(([, applies]) => applies);
  return firstMatch ? firstMatch[0] : CONTENT_TYPES.NO_CONTENT;
};
|
||||
|
||||
/**
 * Fetch a URL and run the whole extraction pipeline on it.
 *
 * Pipeline: fetch the markup, parse it, detect the article structure, extract
 * the article text and the full page text, compute the metrics, classify the
 * content, and read the page title and meta description.
 *
 * The returned promise does not reject for pipeline failures; they are
 * reported through the result object instead.
 *
 * @param {string} url - Address of the page to analyse.
 * @returns {Promise<object>} on success `{ success: true, url, title,
 *   description, contentType, structure, metrics, articleText, allText,
 *   elements, extractedAt }`, where `url` is the one reported by the fetch
 *   step, `contentType` is the classification and `extractedAt` an ISO
 *   timestamp; on failure `{ success: false, error, url }` with the error
 *   message and the requested URL.
 */
export const extractContentFromUrl = async (url) => {
  const runPipeline = async () => {
    // Fetch and parse
    const { html, url: resolvedUrl } = await fetchUrlContent(url);
    const doc = parseHtml(html);

    // Structure and text
    const structure = detectArticleStructure(doc);
    const { text: articleText, elements } = extractArticleText(doc);
    const allText = extractAllText(doc);

    // Metrics and classification
    const metrics = calculateContentMetrics(doc, articleText, allText);
    const classification = classifyContent(structure, metrics, articleText);

    // Page metadata
    const titleNode = doc.querySelector('title');
    const descriptionNode = doc.querySelector('meta[name="description"]');

    return {
      success: true,
      url: resolvedUrl,
      title: titleNode?.textContent?.trim() || '',
      description: descriptionNode?.getAttribute('content') || '',
      contentType: classification,
      structure,
      metrics,
      articleText,
      allText,
      elements,
      extractedAt: new Date().toISOString()
    };
  };

  return runPipeline().catch((error) => ({
    success: false,
    error: error.message,
    url
  }));
};
|
||||
Reference in New Issue
Block a user