使用webhdfs


最近在研究腾讯的 TBDS 产品, 但是没有找到相关的 HDFS SDK, 因此自己开发了一个。此 SDK 同时兼容其它平台的 HDFS 认证方式(InsecureClient, TokenClient)。

安装tbds-hdfs

pip install tbds-hdfs

使用方法

获取client
SecretId = 'xxxxxxx'
SecretKey = 'xxxxxxx'
url = 'http://xxxx:50070;http://xxxxx:50070'  #支持高可用写法

from hdfs import TbdsClient
client = TbdsClient(url, SecretId, SecretKey)
读取文件
# 一次性读取
with client.read('features') as reader:
  features = reader.read().decode()

# 直接读取JSON
with client.read('model.json', encoding='utf-8') as reader:
  from json import load
  model = load(reader)

# 流式读取
with client.read('features', chunk_size=8096) as reader:
  for chunk in reader:
    pass

#按行读取
with client.read('samples.csv', encoding='utf-8', delimiter='\n') as reader:
  for line in reader:
    pass

写入文件

# 按行写入数据
with open('samples') as reader, client.write('samples', overwrite=True) as writer:
  for line in reader:
      writer.write(line.encode('utf8'))

# 写入JSON文件
with client.write('model.json', encoding='utf-8') as writer:
  from json import dump
  dump(model, writer)
其它文件操作方法
# 获取文件或文件夹的内容摘要(content summary), 并非读取文件内容本身
content = client.content('dat')

# 列出文件夹下所有文件
fnames = client.list('dat')

# 获取文件或文件夹状态
status = client.status('dat/features')

# 重命名(移动)文件
client.rename('dat/features', 'features')

# 删除文件/文件夹
client.delete('dat', recursive=True)