file.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. # coding=utf-8
  2. import base64
  3. import ipaddress
  4. import re
  5. import socket
  6. import urllib
  7. from urllib.parse import urlparse, urlunparse
  8. import requests
  9. import uuid_utils.compat as uuid
  10. from django.db.models import QuerySet
  11. from django.http import HttpResponse
  12. from django.utils.translation import gettext_lazy as _
  13. from rest_framework import serializers
  14. from application.models import Application
  15. from common.exception.app_exception import NotFound404, AppApiException
  16. from knowledge.models import File, FileSourceType
  17. from tools.serializers.tool import UploadedFileField
  18. mime_types = {
  19. "html": "text/html", "htm": "text/html", "shtml": "text/html", "css": "text/css", "xml": "text/xml",
  20. "gif": "image/gif", "jpeg": "image/jpeg", "jpg": "image/jpeg", "js": "application/javascript",
  21. "atom": "application/atom+xml", "rss": "application/rss+xml", "mml": "text/mathml", "txt": "text/plain",
  22. "jad": "text/vnd.sun.j2me.app-descriptor", "wml": "text/vnd.wap.wml", "htc": "text/x-component",
  23. "avif": "image/avif", "png": "image/png", "svg": "image/svg+xml", "svgz": "image/svg+xml",
  24. "tif": "image/tiff", "tiff": "image/tiff", "wbmp": "image/vnd.wap.wbmp", "webp": "image/webp",
  25. "ico": "image/x-icon", "jng": "image/x-jng", "bmp": "image/x-ms-bmp", "woff": "font/woff",
  26. "woff2": "font/woff2", "jar": "application/java-archive", "war": "application/java-archive",
  27. "ear": "application/java-archive", "json": "application/json", "hqx": "application/mac-binhex40",
  28. "doc": "application/msword", "pdf": "application/pdf", "ps": "application/postscript",
  29. "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  30. "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  31. "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  32. "eps": "application/postscript", "ai": "application/postscript", "rtf": "application/rtf",
  33. "m3u8": "application/vnd.apple.mpegurl", "kml": "application/vnd.google-earth.kml+xml",
  34. "kmz": "application/vnd.google-earth.kmz", "xls": "application/vnd.ms-excel",
  35. "eot": "application/vnd.ms-fontobject", "ppt": "application/vnd.ms-powerpoint",
  36. "odg": "application/vnd.oasis.opendocument.graphics",
  37. "odp": "application/vnd.oasis.opendocument.presentation",
  38. "ods": "application/vnd.oasis.opendocument.spreadsheet", "odt": "application/vnd.oasis.opendocument.text",
  39. "wmlc": "application/vnd.wap.wmlc", "wasm": "application/wasm", "7z": "application/x-7z-compressed",
  40. "cco": "application/x-cocoa", "jardiff": "application/x-java-archive-diff",
  41. "jnlp": "application/x-java-jnlp-file", "run": "application/x-makeself", "pl": "application/x-perl",
  42. "pm": "application/x-perl", "prc": "application/x-pilot", "pdb": "application/x-pilot",
  43. "rar": "application/x-rar-compressed", "rpm": "application/x-redhat-package-manager",
  44. "sea": "application/x-sea", "swf": "application/x-shockwave-flash", "sit": "application/x-stuffit",
  45. "tcl": "application/x-tcl", "tk": "application/x-tcl", "der": "application/x-x509-ca-cert",
  46. "pem": "application/x-x509-ca-cert", "crt": "application/x-x509-ca-cert",
  47. "xpi": "application/x-xpinstall", "xhtml": "application/xhtml+xml", "xspf": "application/xspf+xml",
  48. "zip": "application/zip", "bin": "application/octet-stream", "exe": "application/octet-stream",
  49. "dll": "application/octet-stream", "deb": "application/octet-stream", "dmg": "application/octet-stream",
  50. "iso": "application/octet-stream", "img": "application/octet-stream", "msi": "application/octet-stream",
  51. "msp": "application/octet-stream", "msm": "application/octet-stream", "mid": "audio/midi",
  52. "midi": "audio/midi", "kar": "audio/midi", "mp3": "audio/mp3", "ogg": "audio/ogg", "m4a": "audio/x-m4a",
  53. "ra": "audio/x-realaudio", "3gpp": "video/3gpp", "3gp": "video/3gpp", "ts": "video/mp2t",
  54. "mp4": "video/mp4", "mpeg": "video/mpeg", "mpg": "video/mpeg", "mov": "video/quicktime",
  55. "webm": "video/webm", "flv": "video/x-flv", "m4v": "video/x-m4v", "mng": "video/x-mng",
  56. "asx": "video/x-ms-asf", "asf": "video/x-ms-asf", "wmv": "video/x-ms-wmv", "avi": "video/x-msvideo",
  57. "wav": "audio/wav", "flac": "audio/flac", "aac": "audio/aac", "opus": "audio/opus",
  58. "csv": "text/csv", "tsv": "text/tab-separated-values", "ics": "text/calendar",
  59. }
  60. # 如果是音频文件并且有range请求,处理部分内容
  61. audio_types = ['mp3', 'wav', 'ogg', 'flac', 'aac', 'opus', 'm4a']
  62. class FileSerializer(serializers.Serializer):
  63. file = UploadedFileField(required=True, label=_('file'))
  64. meta = serializers.JSONField(required=False, allow_null=True)
  65. source_id = serializers.CharField(
  66. required=False, allow_null=True, label=_('source id'), default=FileSourceType.TEMPORARY_120_MINUTE.value
  67. )
  68. source_type = serializers.ChoiceField(
  69. choices=FileSourceType.choices, required=False, allow_null=True, label=_('source type'),
  70. default=FileSourceType.TEMPORARY_120_MINUTE
  71. )
  72. def upload(self, with_valid=True):
  73. if with_valid:
  74. self.is_valid(raise_exception=True)
  75. meta = self.data.get('meta', None)
  76. if not meta:
  77. meta = {'debug': True}
  78. file_id = meta.get('file_id', uuid.uuid7())
  79. file = File(
  80. id=file_id,
  81. file_name=self.data.get('file').name,
  82. meta=meta,
  83. source_id=self.data.get('source_id') or FileSourceType.TEMPORARY_120_MINUTE.value,
  84. source_type=self.data.get('source_type') or FileSourceType.TEMPORARY_120_MINUTE
  85. )
  86. file.save(self.data.get('file').read())
  87. return f'./oss/file/{file_id}'
  88. class Operate(serializers.Serializer):
  89. id = serializers.UUIDField(required=True)
  90. http_range = serializers.CharField(
  91. required=False, allow_blank=True, allow_null=True, label=_('HTTP Range'),
  92. help_text=_('HTTP Range header for partial content requests, e.g., "bytes=0-1023"')
  93. )
  94. def get(self, with_valid=True):
  95. if with_valid:
  96. self.is_valid(raise_exception=True)
  97. file_id = self.data.get('id')
  98. file = QuerySet(File).filter(id=file_id).first()
  99. if file is None:
  100. raise NotFound404(404, _('File not found'))
  101. file_type = file.file_name.split(".")[-1].lower()
  102. content_type = mime_types.get(file_type, 'application/octet-stream')
  103. encoded_filename = urllib.parse.quote(file.file_name)
  104. # 获取文件内容
  105. file_bytes = file.get_bytes()
  106. file_size = len(file_bytes)
  107. response = None
  108. if file_type in audio_types and self.data.get('http_range'):
  109. response = self.handle_audio(file_size, file_bytes, content_type, encoded_filename)
  110. if response:
  111. return response
  112. # 对于非范围请求或其他类型文件,返回完整内容
  113. headers = {
  114. 'Content-Type': content_type,
  115. 'Content-Disposition': f'{"inline" if file_type == "pdf" else "attachment"}; filename={encoded_filename}'
  116. }
  117. return HttpResponse(
  118. file_bytes,
  119. status=200,
  120. headers=headers
  121. )
  122. def handle_audio(self, file_size, file_bytes, content_type, encoded_filename):
  123. # 解析range请求 (格式如 "bytes=0-1023")
  124. range_match = re.match(r'bytes=(\d+)-(\d*)', self.data.get('http_range', ''))
  125. if range_match:
  126. start = int(range_match.group(1))
  127. end = int(range_match.group(2)) if range_match.group(2) else file_size - 1
  128. # 确保范围合法
  129. end = min(end, file_size - 1)
  130. length = end - start + 1
  131. # 创建部分响应
  132. response = HttpResponse(
  133. file_bytes[start:start + length],
  134. status=206,
  135. content_type=content_type
  136. )
  137. # 设置部分内容响应头
  138. response['Content-Range'] = f'bytes {start}-{end}/{file_size}'
  139. response['Accept-Ranges'] = 'bytes'
  140. response['Content-Length'] = str(length)
  141. response['Content-Disposition'] = f'inline; filename={encoded_filename}'
  142. return response
  143. def delete(self):
  144. self.is_valid(raise_exception=True)
  145. file_id = self.data.get('id')
  146. file = QuerySet(File).filter(id=file_id).first()
  147. if file is not None:
  148. file.delete()
  149. return True
  150. from requests.adapters import HTTPAdapter
  151. class SafeHTTPAdapter(HTTPAdapter):
  152. """
  153. 安全的 HTTP 适配器,防止 DNS 重绑定攻击
  154. 在建立连接前验证目标 IP 地址
  155. """
  156. def send(self, request, **kwargs):
  157. # 解析 URL 获取主机名
  158. parsed_url = urlparse(request.url)
  159. host = parsed_url.hostname
  160. if host:
  161. # 验证目标 IP 是否安全
  162. self._validate_host_ip(host)
  163. return super().send(request, **kwargs)
  164. def _validate_host_ip(self, host: str):
  165. """验证主机解析的 IP 地址是否安全"""
  166. try:
  167. # 获取所有 IP 地址(包括 IPv4 和 IPv6)
  168. addr_infos = socket.getaddrinfo(host, None, socket.AF_UNSPEC, socket.SOCK_STREAM)
  169. for addr_info in addr_infos:
  170. ip = addr_info[4][0]
  171. if self._is_unsafe_ip(ip):
  172. raise AppApiException(500, _('Access to internal IP addresses is blocked'))
  173. except AppApiException:
  174. raise
  175. except Exception as e:
  176. raise AppApiException(500, _('Failed to resolve host: {error}').format(error=str(e)))
  177. def _is_unsafe_ip(self, ip: str) -> bool:
  178. """检查 IP 地址是否属于不安全的范围"""
  179. try:
  180. ip_addr = ipaddress.ip_address(ip)
  181. return (
  182. ip_addr.is_private or
  183. ip_addr.is_loopback or
  184. ip_addr.is_reserved or
  185. ip_addr.is_link_local or
  186. ip_addr.is_multicast
  187. )
  188. except Exception:
  189. return True
  190. def get_url_content(url, application_id: str):
  191. application = Application.objects.filter(id=application_id).first()
  192. if application is None:
  193. return AppApiException(500, _('Application does not exist'))
  194. if not application.file_upload_enable:
  195. return AppApiException(500, _('File upload is not enabled'))
  196. file_limit = 50 * 1024 * 1024
  197. if application.file_upload_setting and application.file_upload_setting.get('fileLimit'):
  198. file_limit = application.file_upload_setting.get('fileLimit') * 1024 * 1024
  199. parsed = validate_url(url)
  200. # 创建带有安全检查的 session
  201. session = requests.Session()
  202. safe_adapter = SafeHTTPAdapter()
  203. session.mount('http://', safe_adapter)
  204. session.mount('https://', safe_adapter)
  205. try:
  206. response = session.get(
  207. url,
  208. timeout=3,
  209. allow_redirects=False
  210. )
  211. finally:
  212. session.close()
  213. final_host = urlparse(response.url).hostname
  214. if is_private_ip(final_host):
  215. raise ValueError("Blocked unsafe redirect to internal host")
  216. # 判断文件大小
  217. if int(response.headers.get('Content-Length', 0)) > file_limit:
  218. raise AppApiException(500, _('File size exceeds limit'))
  219. # 返回状态码 响应内容大小 响应的contenttype 还有字节流
  220. content_type = response.headers.get('Content-Type', '')
  221. # 根据内容类型决定如何处理
  222. if 'text' in content_type or 'json' in content_type:
  223. content = response.text
  224. else:
  225. # 二进制内容使用Base64编码
  226. content = base64.b64encode(response.content).decode('utf-8')
  227. return {
  228. 'status_code': response.status_code,
  229. 'Content-Length': response.headers.get('Content-Length', 0),
  230. 'Content-Type': content_type,
  231. 'content': content,
  232. }
  233. def is_private_ip(host: str) -> bool:
  234. """检测 IP 是否属于内网、环回、云 metadata 的危险地址"""
  235. try:
  236. ip = ipaddress.ip_address(socket.gethostbyname(host))
  237. return (
  238. ip.is_private or
  239. ip.is_loopback or
  240. ip.is_reserved or
  241. ip.is_link_local or
  242. ip.is_multicast
  243. )
  244. except Exception:
  245. return True
  246. def validate_and_normalize_url(url: str) -> str:
  247. """
  248. 严格验证并规范化 URL,防止 URL 解析绕过攻击
  249. 防御场景:
  250. - http://127.0.0.1:6666\@1.1.1.1/ (反斜杠绕过)
  251. - http://127.0.0.1:6666@1.1.1.1/ (认证信息混淆)
  252. - http://1.1.1.1#@127.0.0.1:6666/ (片段注入)
  253. """
  254. if not url:
  255. raise ValueError("URL is required")
  256. # 1. 拒绝包含危险字符的 URL
  257. dangerous_patterns = [
  258. r'\\', # 反斜杠
  259. r'\s', # 空白字符
  260. r'%00', # 空字节
  261. r'%0a', # 换行符
  262. r'%0d', # 回车符
  263. ]
  264. url_lower = url.lower()
  265. for pattern in dangerous_patterns:
  266. if re.search(pattern, url_lower):
  267. raise ValueError("URL contains dangerous characters")
  268. # 2. 解析 URL
  269. parsed = urlparse(url)
  270. # 3. 仅允许 http / https
  271. if parsed.scheme not in ("http", "https"):
  272. raise ValueError("Only http and https are allowed")
  273. # 4. 提取主机名(从 netloc 中)
  274. netloc = parsed.netloc
  275. # 5. 如果 netloc 中包含 @,说明有认证信息,需要特别处理
  276. if '@' in netloc:
  277. # 分离认证信息和主机
  278. auth_part, host_part = netloc.rsplit('@', 1)
  279. # 检查认证部分是否包含危险的 IP 或端口信息
  280. # 攻击者可能在认证部分放置内网地址
  281. if ':' in auth_part or '.' in auth_part:
  282. raise ValueError("Authentication part contains suspicious content")
  283. # 使用真实的主机部分
  284. actual_host = host_part.split(':')[0] if ':' in host_part else host_part
  285. else:
  286. # 没有认证信息,直接提取主机
  287. actual_host = parsed.hostname
  288. # 6. 验证主机名不为空
  289. if not actual_host:
  290. raise ValueError("Invalid URL: missing hostname")
  291. # 7. 验证主机不是 IP 地址形式的内网地址
  292. # 这样可以防止直接在 URL 中使用内网 IP
  293. try:
  294. # 尝试解析为 IP 地址
  295. ip_addr = ipaddress.ip_address(actual_host)
  296. if is_private_ip(actual_host):
  297. raise ValueError("Access to internal IP addresses is blocked")
  298. except ValueError as e:
  299. # 如果不是 IP 地址(是域名),则继续检查
  300. if "internal IP" in str(e):
  301. raise
  302. # 对于域名,检查其解析结果
  303. if is_private_ip(actual_host):
  304. raise ValueError("Access to internal IP addresses is blocked")
  305. # 8. 重新构建干净的 URL,移除可能的认证信息
  306. clean_netloc = actual_host
  307. if parsed.port:
  308. clean_netloc = f"{actual_host}:{parsed.port}"
  309. clean_url = urlunparse((
  310. parsed.scheme,
  311. clean_netloc,
  312. parsed.path,
  313. parsed.params,
  314. parsed.query,
  315. '' # 移除 fragment,防止片段注入
  316. ))
  317. return clean_url
  318. def validate_url(url: str):
  319. """验证 URL 是否安全(保留向后兼容)"""
  320. return validate_and_normalize_url(url)